Crawling Craigslist with Python (not Scrapy) - python-3.x

I am trying to crawl Craigslist jobs using Python (I am not using Scrapy). Can anyone please help me fix the code below? Please don't suggest Scrapy.
This is the URL: https://chicago.craigslist.org/
First I extract the job categories, then the job listings, then the job details; I have also written code to crawl the next page.
import re
import requests
import csv
from html import unescape

def get_page_content(url):
    response = requests.get(url)
    return response.text

def get_category_list(content):
    return category_pat.findall(content)[90:121]

def get_next_page(content):
    result = next_page_pat.findall(content)
    if len(result) == 0:
        return None
    else:
        result = 'https://chicago.craigslist.org/' + result[0]
        return result

def get_job_list(content):
    result = job_list_pat.findall(content)
    return result

def get_job_details(content):
    result = desc_pat.findall(content)
    if len(result) == 0:
        description = ''
    else:
        description = str(result[0])
    return description

def scrape_job_info(job_info, category_name):
    job_url, job_name = job_info
    job_name = unescape(job_name)
    job_dict = {'jobname': job_name, 'category': category_name}
    job_dict['JOBURL'] = job_url
    print('scraping', job_name)
    content = get_category_list(job_url)
    description = get_job_details(content)
    job_dict['Description'] = description
    print(job_dict)

def crawl_category(category_name, category_url):
    while True:
        print(category_url)
        content = get_page_content(category_url)
        job_list = get_job_list(content)
        print(job_list)
        for job_info in job_list:
            scrape_job_info(job_info, category_name)
        next_page = get_next_page(content)
        if next_page is None:
            break
        category_url = next_page

def crawl_website():
    url = 'https://chicago.craigslist.org'
    content = get_page_content(url)
    category_list = get_category_list(content)
    for category in category_list:
        category_url, category_name = category
        category_url = url + category_url
        crawl_category(category_name, category_url)

if __name__ == '__main__':
    url = 'https://chicago.craigslist.org'
    response = requests.get(url)
    content = response.text
    category_pat = re.compile(r'<li><a href=\"(\/d\/[\w\-]+\/\w+\/\w+)\".+txt\">([\w\-\+\s+\/\<]+)<sup class')
    next_page_pat = re.compile(
        r'<a href=\"\/(.*)\" class=\"button next\" title=\"next\s+page\">next > <\/a>\s+<span class=\"button next\" title=\"next page\">\s+next >\s+<\/span>\s+<\/span>\s+<\/div>\s+<\/div>\s+.+\s+.+')
    job_list_pat = re.compile(r'([\w\s*]+)')
    desc_pat = re.compile(r'<\/div>\s*<section id=\"postingbody\">.+html\"><\/div>\s*<\/div>(.+)<\/section><ul')
    img_pat = re.compile(r'<img src=\"(.*jpg)\" title')
    crawl_website()
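One likely culprit, assuming the regexes themselves match what you expect: scrape_job_info passes the job URL straight to get_category_list instead of downloading the job page first, so get_job_details never sees any HTML. A minimal sketch of that one change:

def scrape_job_info(job_info, category_name):
    job_url, job_name = job_info
    job_name = unescape(job_name)
    job_dict = {'jobname': job_name, 'category': category_name, 'JOBURL': job_url}
    print('scraping', job_name)
    # fetch the job page itself before trying to extract the description
    content = get_page_content(job_url)
    job_dict['Description'] = get_job_details(content)
    print(job_dict)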

Related

Not getting multithreading to work for GTK 3 Python

I am trying to get themes from gnome-look.org and create widgets by scraping the website.
I want to show the window first and then update the necessary GtkWidgets in the background via another thread.
Here is my code
[code]
#!/usr/bin/python3
import gi
gi.require_version('Gtk', '3.0')
from gi.repository import Gtk, GdkPixbuf
import requests
import sys
import gi
import shutil
from bs4 import BeautifulSoup
import dryscrape
import json
import urllib.parse
import concurrent.futures
import threading

class ReadGnomeLook:
    def format_bytes(self, size):
        # 2**10 = 1024
        power = 2**10
        n = 0
        power_labels = {0: '', 1: 'KB', 2: 'MB', 3: 'GB'}
        while size > power:
            size /= power
            n += 1
        return str("{:.2f}".format(size)) + ' ' + str(power_labels[n])

    def getDownloadLinks(self, childURL):
        #childURL = "https://www.gnome-look.org/s/Gnome/p/1519633"
        childURL = childURL + "#files-panel"
        session = dryscrape.Session()
        session.set_attribute('auto_load_images', False)
        session.visit(childURL)
        response = session.body()
        soup = BeautifulSoup(response, features='lxml')
        downloadlink = []
        allscripts = soup.find_all('script', {"src": False})
        for each_script in range(len(allscripts)):
            content = str(allscripts[each_script]).split("var")
            for indx in content:
                if 'filesJson' in str(indx):
                    content = indx.replace('filesJson = ', '').replace(';', '')
                    content = json.loads(content)
                    links = []
                    for each_item in content:
                        if each_item['active'] == '1':
                            links.append({'name': each_item['name'], 'type': each_item['type'], 'size': format_bytes(int(each_item['size'])), 'md5sum': each_item['md5sum'], 'title': each_item['title'], 'description': each_item['description'], 'url': urllib.parse.unquote(each_item['url'])})
                    for each in links:
                        downloadlink.append(each)
        return downloadlink

    def readWebpage(self, URL):
        myProducts = []
        baseURL = "https://www.gnome-look.org"
        #URL = "https://www.gnome-look.org/browse/cat/132/order/latest/"
        session = dryscrape.Session()
        session.set_header('Host', 'www.gnome-look.org')
        session.visit(URL)
        response = session.body()
        soup = BeautifulSoup(response, features='lxml')
        #print(soup)
        #soup.find(class="product-browse-item-info")
        mydivs = soup.find_all("div", {"class": "product-browse-item picture"})
        for mydiv in mydivs:
            myProducts.append([
                {'name': mydiv.div.a.findAll("div", {"class": "product-browse-item-info"})[0].h2.text},
                {'category': mydiv.div.a.findAll("div", {"class": "product-browse-item-info"})[0].findAll("span")[0].text},
                {'author': mydiv.div.a.findAll("div", {"class": "product-browse-item-info"})[0].findAll("span")[1].b.text},
                {'img': mydiv.div.a.div.img['src']},
                {'href': mydiv.div.a['href']}
            ])
        productCatalog = []
        for elements in myProducts:
            productCatalog.append([
                {
                    'Name': elements[0]['name'],
                    'Category': elements[1]['category'],
                    'Author': elements[2]['author'],
                    'Image': elements[3]['img'],
                    'Link': baseURL + elements[4]['href'],
                    #'DownloadLinks': getDownloadLinks(baseURL + elements[4]['href'])
                }
            ])
        return productCatalog

class AppicationWindow(Gtk.Window):
    def __init__(self):
        Gtk.Window.__init__(self, title="Themes Manager")
        # Main Application Window
        # self.set_title("Themes Manager v1.0")
        #self.set_default_size(400, 400)
        self.set_position(Gtk.WindowPosition.CENTER)
        self.connect("destroy", Gtk.main_quit)
        # Create Image and Title Grid
        #self.image = self.getImageFromWeb('https://media.wired.com/photos/592697678d4ebc5ab806acf7/master/w_2560%2Cc_limit/GooglePlay.jpg')
        #self.image.set_from_file("android-download.png")
        #image.set_size(200,200)
        print("Before 1st Show all")
        self.show_all()
        z = threading.Thread(target=self.doProcessing(), daemon=True)
        z.start()
        print("Started Z thread")

    def doProcessing(self):
        # Grid for Full Icon Themes
        self.gridfulliconthemes = Gtk.FlowBox(valign=Gtk.Align.START)
        self.gridfulliconthemesscroll = Gtk.ScrolledWindow(hexpand=True, vexpand=True)  # Create scroll window
        self.gridfulliconthemesscroll.add(self.gridfulliconthemes)  # Adds the TreeView to the scroll container
        self.getProductCatalog()
        ## Start
        self.URLs = []
        self.labels = []
        self.images = []
        self.threads = []
        for each_item in self.productCatalog:
            image = Gtk.Image()
            image.new_from_file('/tmp/82682596e6c89475b2f21221d5dc61927887.png')
            self.images.append(image)
            self.labels.append(Gtk.Label("loading"))
        for each_item in range(0, len(self.productCatalog)):
            #print(each_item[0]['Name'])
            self.URLs.append(self.productCatalog[each_item][0]['Image'])
            vertical_box = Gtk.Box()
            vertical_box.set_homogeneous(True)
            vertical_items = Gtk.FlowBox(valign=Gtk.Align.START)
            vertical_items.set_max_children_per_line(1)
            label = Gtk.Label()
            label.set_text(self.productCatalog[each_item][0]['Name'])
            label.set_line_wrap(True)
            label.set_max_width_chars(10)
            label.set_hexpand(True)
            self.labels.append(label)
            #image = Gtk.Image()
            #self.images.append(image)
            vertical_items.add(self.images[each_item])
            vertical_items.add(self.labels[each_item])
            vertical_box.add(vertical_items)
            vertical_box.connect("button-press-event", self.do_anything)
            self.gridfulliconthemes.add(vertical_box)
        ## End
        # Create Notebook to add to the Window
        self.notebook = Gtk.Notebook()
        self.add(self.notebook)
        self.fullicontheme = Gtk.Label()
        self.fullicontheme.set_text("Full Icon Themes")
        self.gtkthemes = Gtk.Label()
        self.gtkthemes.set_text("Gtk 3/4 Themes")
        self.gnomeshellthemes = Gtk.Label()
        self.gnomeshellthemes.set_text("Gnome Shell Themes")
        self.fulliconthemepage = Gtk.Label()
        self.fulliconthemepage.set_text("Full Icon Themes Page")
        self.gtkthemespage = Gtk.Label()
        self.gtkthemespage.set_text("GTK themes Page")
        self.gnomeshellthemespage = Gtk.Label()
        self.gnomeshellthemespage.set_text("Gnome Shell Themes Page")
        #notebook.append_page(fullicontheme, Gtk.Label("Icon Page"))
        self.notebook.append_page(self.gridfulliconthemesscroll, self.fulliconthemepage)
        self.notebook.append_page(self.gtkthemes, self.gtkthemespage)
        self.notebook.append_page(self.gnomeshellthemes, self.gnomeshellthemespage)
        self.notebook.set_tab_reorderable(self.gridfulliconthemesscroll, True)
        #self.add(hb)
        #self.show_all()
        #threadtemp = threading.Thread(target=self.getImageFromWeb(each_item[0]['Image']))
        #self.threads.append(threadtemp)
        #self.getAllImages()
        x = threading.Thread(target=self.getAllImages(), daemon=True)
        x.start()
        self.show_all()

    def getProductCatalog(self):
        # Download Links from GnomeLook.org
        URL = "https://www.gnome-look.org/s/Gnome/browse/cat/132/page/2/ord/latest/"
        readgnomelook = ReadGnomeLook()
        self.productCatalog = readgnomelook.readWebpage(URL)
        #print(json.dumps(productCatalog, sort_keys=False, indent=4))

    def getAllImages(self):
        for i in range(0, len(self.productCatalog)):
            #self.images.append(self.getImageFromWeb(self.productCatalog[i][0]['Image']))
            #self.images[i] = self.getImageFromWeb(self.productCatalog[i][0]['Image'],self.images[i])
            with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                future = executor.submit(self.getImageFromWeb, self.productCatalog[i][0]['Image'], self.images[i])
                # #self.images.append(future.result())
                self.images[i] = future.result()
                # #print(type(self.images[i]))

    def do_anything(self):
        print("clicked on box")

    def getImageFromWeb(self, URL, image):
        filename = '/tmp/' + URL.split("/")[-1]
        try:
            f = open(filename)
            pixbuf = GdkPixbuf.Pixbuf.new_from_file_at_scale(
                filename=filename,
                width=100,
                height=100,
                preserve_aspect_ratio=False)
            Gtk.Image.set_from_pixbuf(image, pixbuf)
            #image.set_from_file(filename)
            #print("Got the image : " + filename)
            #del r
            return image
        except IOError:
            #print("File not accessible")
            r = requests.get(URL, stream=True)
            if r.status_code == 200:
                with open(filename, 'wb') as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
                pixbuf = GdkPixbuf.Pixbuf.new_from_file_at_scale(
                    filename=filename,
                    width=200,
                    height=200,
                    preserve_aspect_ratio=False)
                Gtk.Image.set_from_pixbuf(image, pixbuf)
                #image.set_from_file(filename)
                #print("Got the image : " + filename)
                return image
            else:
                #print("Failed to get the image : " + filename)
                return None
            del r

window = AppicationWindow()
#window.connect("destroy",Gtk.main_quit)
#window.show_all()
Gtk.main()
[/code]
The code works fine. But in the snippet below, the doProcessing() thread runs to completion first, and only then do I see "Started Z thread":
print("Before 1st Show all")
self.show_all()
z = threading.Thread(target=self.doProcessing(),daemon=True)
z.start()
print("Started Z thread")
As I see it, doProcessing should start in the background and "Started Z thread" should be printed immediately, but that's not happening.
Am I missing anything here? Any help is appreciated.
Thanks, Debasish
z = threading.Thread(target=self.doProcessing(),daemon=True)
threading.Thread wants a function, not the result of calling a function, so try:
z = threading.Thread(target=self.doProcessing,daemon=True)
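In other words, target=self.doProcessing() calls doProcessing in the main thread first and passes its return value (None) to Thread, which is why "Started Z thread" only appears after all the processing has finished. A minimal sketch of the difference:

# wrong: runs doProcessing right away in the calling thread,
# then builds a Thread whose target is None
z = threading.Thread(target=self.doProcessing(), daemon=True)

# right: passes the bound method itself; it only runs after z.start(), in the new thread
z = threading.Thread(target=self.doProcessing, daemon=True)
z.start()

One caveat (not part of the original answer): once doProcessing really does run in a background thread, any GTK widget updates it makes should be scheduled back onto the main loop, for example with GLib.idle_add, because GTK is not thread-safe.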

Having issues with ThreadPoolExecutor _wait_for_tstate_lock in Python (thread deadlock?)

I am having issues with ThreadPoolExecutor. It starts out strong and then slows down to an eventual stop. I don't understand what I'm doing wrong; I've tried moving the ThreadPoolExecutor section outside domains_loop, and also not using domains_loop at all, but it does the same thing.
Changing max_workers down to 5 just freezes it earlier, so I know I must be doing something wrong that has nothing to do with the number of threads.
The file reading, URL concatenation, and file writing all work fine; it's the async HTTP requests that seem to be broken.
Interestingly, even with fewer subdomains it will still occasionally lock up.
sub_file = [
    'mail.myresmed.com',
    'www.resmed.com',
    'bcg29k.2163007t.resmed.com',
    'account.resmed.com',
    'account-uat.resmed.com',
    'www.account-uat.resmed.com',
    'adfs.resmed.com',
    'admin-mysleep.resm'
]
dir_file = ['/.git', '/test', '/manage', '/login']
subfile_iterator = [0]
dirfile_iterator = [0]
subfile_readstack = []
dirfile_readstack = [""]  #first element is blank so the base url will be fetched
domains_list = []
results_list = []
sleep_inc = 0.0001
stack_size = 100
#browser_list = []
results = []

'''
****************************************************************************************************
FILE FNs
****************************************************************************************************
'''

async def write_to_file(results_list):
    file = open('results.txt', 'a')
    print("Writing to log")
    for result in results_list:
        #print("writing...\n")
        #print(result.headers)
        #file.write("{}\n\n".format(result.headers))
        headers = result.headers
        cookiejar = result.cookies
        cookies = cookiejar.items()
        file.write("\n\n")
        file.write("****************************************************************************************************\n")
        file.write("----------------------------------------------------------------------------------------------------\n")
        file.write("----------------------------------------------------------------------------------------------------\n")
        file.write(" {} \n".format(result.url))
        file.write("----------------------------------------------------------------------------------------------------\n")
        file.write("----------------------------------------------------------------------------------------------------\n")
        file.write("- status: {}\n".format(result.status_code))
        file.write("- reason: {}\n".format(result.reason))
        #file.write("- is redirect? {}\n".format(result.is_redirect))
        #if result.is_redirect:
        #    file.write("is permanent redirect? {}\n".format(result.is_permanent.redirect))
        file.write("\n- headers: \n")
        for key, value in headers.items():
            file.write("\t{keys}: {values}\n".format(keys=key, values=value))
        file.write("\n- cookies: \n")
        for cookie in cookies:
            file.write("\t{}\n".format(cookie))
        result_bytes = result.content
        html_formatted = result_bytes.decode('utf-8')
        soup = bs(html_formatted, "html.parser")
        file.write("\n----------------------\n")
        file.write("- style tags: \n")
        file.write("----------------------\n\n")
        for tags in soup.find_all('style'):
            #prettify the css
            file.write("{}\n\n".format(tags))
        file.write("\n----------------------\n")
        file.write("- script tags: \n")
        file.write("----------------------\n\n")
        for tags in soup.find_all('script'):
            #prettify the javascript
            file.write("{}\n\n".format(tags))
        file.write("\n----------------------\n")
        file.write("- links: \n")
        file.write("----------------------\n\n")
        for tags in soup.find_all('a'):
            #prettify the javascript
            file.write("{}\n\n".format(tags))
        file.write("----------------------------------------------------------------------------------------------------\n")
        file.write("----------------------------------------------------------------------------------------------------\n")
        file.write("****************************************************************************************************\n")
        file.write("\n")
    file.close()

def files_exist(subfile, dirfile):
    if os.path.isfile(subfile):
        subfile_exist = True
    else:
        print('sub_file does not exit')
    if os.path.isfile(dirfile):
        dirfile_exist = True
    else:
        print('dir_file does not exit')
    if subfile_exist and dirfile_exist:
        return True
    else:
        sys.exit()

async def read_from_file(list_file, file_lines, read_stack, file_iterator):
    global sleep_inc
    if file_iterator[-1] >= file_lines - 1:
        return
    if len(read_stack) < stack_size - 1:
        with open(list_file) as f:
            for i in range(1, file_lines + 1):
                file_iterator.append(i)
                line = linecache.getline(list_file, i, module_globals=None).strip()
                if len(line) > 0:
                    print("reading: {}".format(line))
                    read_stack.append(line)
                await asyncio.sleep(sleep_inc)
                if i == stack_size:
                    await asyncio.sleep(sleep_inc)
    else:
        await asyncio.sleep(sleep_inc)

async def get_lines(list_file):
    with open(list_file) as f:
        f.seek(0)  #ensure you're at the start of the file..
        first_char = f.read(1)  #get the first character
        if not first_char:
            print("FAIL: the sub or dir files (or both) are empty")  #first character is the empty string..
            sys.exit()
        else:
            f.seek(0)  #f
            for i, l in enumerate(f):
                await asyncio.sleep(sleep_inc)
                pass
            return i + 1

async def file_lines():
    global sub_file
    global dir_file
    #global subfile_lines
    #global dirfile_lines
    if files_exist(sub_file, dir_file):
        print("Reading files... ")
        subfile_lines = files_read_loop.create_task(get_lines(sub_file))
        dirfile_lines = files_read_loop.create_task(get_lines(dir_file))
        await asyncio.wait([subfile_lines, dirfile_lines])
        return (subfile_lines, dirfile_lines)

async def load_files():
    global sub_file
    global dir_file
    global subfile_iterator
    global dirfile_iterator
    global subfile_readstack
    global dirfile_readstack
    (subfile_lines, dirfile_lines) = await file_lines()
    read_from_sub_file = files_read_loop.create_task(read_from_file(sub_file, subfile_lines.result(), subfile_readstack, subfile_iterator))
    read_from_dir_file = files_read_loop.create_task(read_from_file(dir_file, dirfile_lines.result(), dirfile_readstack, dirfile_iterator))
    concat_sub_to_dir = files_read_loop.create_task(concat_addr(subfile_readstack, dirfile_readstack))
    await asyncio.wait([read_from_sub_file, read_from_dir_file, concat_sub_to_dir])

async def write_log():
    global results
    print("write_log")
    ret = files_write_loop.create_task(write_to_file(results))

'''
****************************************************************************************************
URL FNs
****************************************************************************************************
'''

async def concat_addr(subread, dirread):
    global results_list
    global domains_list
    global sleep_inc
    global subfile_readstack
    global dirfile_readstack
    global subfile_lines
    global dirfile_lines
    domains_list_size = len(domains_list)
    if domains_list_size < stack_size - 1:
        for i, j in enumerate(subfile_readstack):
            for j, k in enumerate(dirfile_readstack):
                domains_list.insert(0, subfile_readstack[i] + dirfile_readstack[j])
                print("adding: {subf}{dirf} to domains_list".format(subf=subfile_readstack[i], dirf=dirfile_readstack[j]))
                await asyncio.sleep(sleep_inc)
    else:
        await asyncio.sleep(sleep_inc)

def fetch(session, url):
    FQDM = "https://{domain}?".format(domain=url)
    try:
        fresh_agent = user_agents.swap()
        custom_header = {'user-agent': fresh_agent}
        with session.get(FQDM, headers=custom_header) as response:
            status = response.status_code
            url = response.url
            print(f"=== {status} - {url}")
            results.append(response)
            return response
    except:
        print(f"Server at {url} not found")
    finally:
        pass

async def get(domains):
    global results
    with ThreadPoolExecutor(max_workers=50) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            print('''\n\n
                  ------------------------
                  RESULTS
                  ------------------------
                  \n
                  ''')
            for url in domains:
                loop.run_in_executor(executor, fetch, *(session, url))
            return True

async def iterate_domains():
    global results
    global domains_list
    ret = domains_loop.create_task(get(domains_list))

'''
****************************************************************************************************
MAIN
****************************************************************************************************
'''

if __name__ == "__main__":
    try:
        #file_sema = asyncio.BoundedSemaphore(value=10)
        files_read_loop = asyncio.get_event_loop()
        files_read_loop.run_until_complete(load_files())
        domains_loop = asyncio.get_event_loop()
        domains_loop.set_debug(1)
        domains_loop.run_until_complete(iterate_domains())
        files_write_loop = asyncio.get_event_loop()
        files_write_loop.run_until_complete(write_log())
    except Exception as e:
        print("****** EXCEPTION: {} ".format(e))
        pass
    finally:
        files_read_loop.close()
        domains_loop.close()
        files_write_loop.close()
The solution is to add a timeout to each request, like so:
session.get(FQDM, headers=custom_header, timeout=X)
Full example:
def fetch(session, url):
    FQDM = "https://{domain}?".format(domain=url)
    try:
        fresh_agent = user_agents.swap()
        custom_header = {'user-agent': fresh_agent}
        with session.get(FQDM, headers=custom_header, timeout=X) as response:  # <<<<<--------
            status = response.status_code
            url = response.url
            print(f"=== {status} - {url}")
            results.append(response)
            return response
    except:
        print(f"Server at {url} not found")
    finally:
        pass
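For reference, timeout in requests can be either a single number of seconds or a (connect, read) tuple, and it raises requests.exceptions.Timeout when exceeded, so a hung server no longer ties up an executor worker forever. A small illustration with placeholder values (the 5 and 30 are not from the original answer):

import requests

with requests.Session() as session:
    try:
        # allow 5 seconds to connect and 30 seconds to read the response
        response = session.get("https://example.com", timeout=(5, 30))
        print(response.status_code)
    except requests.exceptions.Timeout:
        print("request timed out")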

Scrape Gmail for selected content and get attachments

I have a problem scraping Gmail. The task is:
Candidates should extract or pull information relating to financial transactions from Gmail. The information could be invoices, subscription alerts, bills, etc. We want you to connect to a Gmail account and scrape or pull data about invoices, subscriptions, and upcoming bills. You can scrape the emails for words like "upcoming invoice", "subscription", "invoice", etc. and pull the amount, date, and attachment, if any.
I have to collect this information and also store all the attachments. Is there any simple way to do it?
My code:
import imaplib
import os
import email, getpass
import sys
import json

class GmailFinin():
    def helloWorld(self):
        print("\nHello I'm here to help you")

    def initializeVariables(self):
        self.usr = ""
        self.pwd = ""
        self.mail = object
        self.mailbox = ""
        self.mailCount = 0
        self.destFolder = ""
        self.data = []
        self.ids = []
        self.idsList = []

    def getLogin(self):
        print("\nPlease enter your Gmail login details below.")
        self.usr = input("Email: ")
        # self.pwd = input("Password: ")
        self.pwd = getpass.getpass("Enter your password --> ")

    def attemptLogin(self):
        self.mail = imaplib.IMAP4_SSL("imap.gmail.com", 993)
        if self.mail.login(self.usr, self.pwd):
            print("\nLogon SUCCESSFUL")
            self.destFolder = input("\nPlease choose a destination folder in the form of /Users/username/dest/ (do not forget trailing slash!): ")
            if not self.destFolder.endswith("/"): self.destFolder += "/"
            return True
        else:
            print("\nLogon FAILED")
            return False

    def checkIfUsersWantsToContinue(self):
        print("\nWe have found "+str(self.mailCount)+" emails in the mailbox "+self.mailbox+".")
        return True if input("Do you wish to continue extracting all the emails into "+self.destFolder+"? (y/N) ").lower().strip()[:1] == "y" else False

    def selectMailbox(self):
        # self.mailbox = input("\nPlease type the name of the mailbox you want to extract, e.g. Inbox: ")
        self.mailbox = "Inbox"
        bin_count = self.mail.select(self.mailbox)[1]
        self.mailCount = int(bin_count[0].decode("utf-8"))
        return True if self.mailCount > 0 else False

    def searchThroughMailbox(self):
        type, self.data = self.mail.search(None, "ALL")
        self.ids = self.data[0]
        self.idsList = self.ids.split()

    def parseEmails(self):
        jsonOutput = {}
        for anEmail in self.data[0].split():
            type, self.data = self.mail.fetch(anEmail, '(UID RFC822)')
            raw = self.data[0][1]
            try:
                raw_str = raw.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    raw_str = raw.decode("ISO-8859-1")  # ANSI support
                except UnicodeDecodeError:
                    try:
                        raw_str = raw.decode("ascii")  # ASCII ?
                    except UnicodeDecodeError:
                        pass
            msg = email.message_from_string(raw_str)
            jsonOutput['subject'] = msg['subject']
            jsonOutput['from'] = msg['from']
            jsonOutput['date'] = msg['date']
            raw = self.data[0][0]
            raw_str = raw.decode("utf-8")
            uid = raw_str.split()[2]
            # Body #
            if msg.is_multipart():
                for part in msg.walk():
                    partType = part.get_content_type()
                    ## Get Body ##
                    if partType == "text/plain" and "attachment" not in part:
                        jsonOutput['body'] = part.get_payload()
                    ## Get Attachments ##
                    if part.get('Content-Disposition') is not None:
                        attchName = part.get_filename()
                        print(attchName)
                        if bool(attchName):
                            attchFilePath = str(self.destFolder)+str(uid)+str("/")+str(attchName)
                            print(attchFilePath)
                            os.makedirs(os.path.dirname(attchFilePath), exist_ok=True)
                            with open(attchFilePath, "wb") as f:
                                f.write(part.get_payload(decode=True))
            else:
                # jsonOutput['body'] = msg.get_payload(decode=True).decode("utf-8") # Non-multipart email, perhaps no attachments or just text.
                jsonOutput['body'] = msg.get_payload()
            outputDump = json.dumps(jsonOutput)
            emailInfoFilePath = str(self.destFolder)+str(uid)+str("/")+str(uid)+str(".json")
            os.makedirs(os.path.dirname(emailInfoFilePath), exist_ok=True)
            print(emailInfoFilePath)
            with open(emailInfoFilePath, "w") as f:
                f.write(outputDump)

    def __init__(self):
        self.initializeVariables()
        self.helloWorld()
        self.getLogin()
        if self.attemptLogin():
            not self.selectMailbox() and sys.exit()
        else:
            sys.exit()
        not self.checkIfUsersWantsToContinue() and sys.exit()
        self.searchThroughMailbox()
        self.parseEmails()

if __name__ == "__main__":
    run = GmailFinin()
I have tried the search below, but I don't think it is optimal because it is searching only the subject, and I don't know how to add multiple OR conditions for a list of keywords.
type, self.data = self.mail.search(None, '(OR TEXT "bill" SUBJECT "bill")')
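IMAP's OR operator takes exactly two search keys, so covering a whole list of keywords means nesting ORs. A rough sketch (the helper name and keyword list are made up for illustration) that builds such a criteria string for imaplib and checks both the subject and the full text:

def build_or_criteria(keywords):
    # each keyword should match either the subject or the message text
    terms = ['(OR SUBJECT "{kw}" TEXT "{kw}")'.format(kw=kw) for kw in keywords]
    # fold the terms into nested ORs, since IMAP OR is strictly binary
    criteria = terms[-1]
    for term in reversed(terms[:-1]):
        criteria = '(OR {} {})'.format(term, criteria)
    return criteria

keywords = ["invoice", "bill", "subscription"]
typ, data = self.mail.search(None, build_or_criteria(keywords))

Gmail's IMAP server also understands the non-standard X-GM-RAW search attribute, which accepts Gmail's own search syntax (for example self.mail.search(None, 'X-GM-RAW', '"invoice OR bill OR subscription"')); that may be simpler if you only ever target Gmail.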

Trouble printing assignment

I was given some code, and after working out its indentation problems it runs without errors; however, I cannot print the result into a list.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import requests

symbol = 'AAPL'
url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + symbol + "&type=&dateb=&owner=exclude&start=0&count=100&output=atom"
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
html = soup(page_html, 'html.parser')
entries = html.findAll("entry")
shouldContinue = True
link = ""
for entry in entries:
    if shouldContinue and (
            entry.find("category")["term"].lower() == "10-k" or entry.find("category")["term"].lower() == "10-q" or
            entry.find("category")["term"].lower() == "20-f"):
        firstUrl = entry.find("link")["href"]
        uClientFirstUrl = uReq(firstUrl)
        page_html_firstUrl = uClientFirstUrl.read()
        uClientFirstUrl.close()
        htmlFirstUrl = soup(page_html_firstUrl, 'html.parser')
        tds = htmlFirstUrl.findAll("table")[1].findAll("td")
        foundtd = False
        for td in tds:
            if foundtd == True:
                link = "https://www.sec.gov" + td.find("a")["href"]
                foundtd = False
            if "xbrl instance" in td.text.lower():
                foundtd = True
        shouldContinue = False

def getCash(url, symbol):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    xml = soup(page_html, 'xml')
    cash = xml.findAll("us-gaap:CashAndCashEquivalentsAtCarryingValue")
    if len(cash) == 0:
        cash = xml.findAll("ifrs-full:Cash")
    if len(cash) == 0:
        cash = xml.findAll("us-gaap:CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents")
    if len(cash) == 0:
        cash = xml.findAll("us-gaap:Cash")
    return cash

print(getCash)
getCash(url, symbol)
I have tried printing the assignment, as well as calling the method without any success. A sense of direction would be appreciated. Thank you.
As mentioned in my comment above:
What effect do you expect from print(getCash)? If you want it to print the return from the getCash() function, delete it (it's not doing anything), and wrap your getCash(url, symbol) call in a print() function.
Basically, do this:
print(getCash(url, symbol))
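If the goal is a list of values rather than the raw tag objects, you can also keep the return value and pull the text out of each matched tag (a small follow-up sketch, not part of the original answer):

cash_tags = getCash(url, symbol)
cash_values = [tag.get_text() for tag in cash_tags]
print(cash_values)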

Python slowly scrapes websites

I've implemented a news website scraper that uses the Selenium web driver to access dynamic web pages and BeautifulSoup to retrieve the content. While parsing websites, I also write the scraped data to MongoDB storage and download pictures. I want to implement a full news search by a given category or by text that appears in the news content. What suggestions are there for parallelization or adding async code to speed up performance?
# -*- coding: utf-8 -*-
import os
import json
import requests
from bs4 import BeautifulSoup
from mongo_setup import Database
import gridfs
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time
import logging
import re
import pymongo

PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
DRIVER_BIN = os.path.join(PROJECT_ROOT, "bin/chromedriver")

class Scraper:
    tsn_resource = 'https://tsn.ua/'
    ukrnet_resource = 'https://www.ukr.net/'
    db_name = 'scraper_db'
    category_coll = 'categories'
    articles_coll = 'articles'

    def __init__(self, limit=10):
        self.limit = limit  # max number of articles per category
        self.db = Database(self.db_name).connect_db()
        self.category_coll = self.init_collection(self.category_coll)
        self.articles_coll = self.init_collection(self.articles_coll)
        self.logger = self.init_logger()
        self.driver = webdriver.Chrome(executable_path=DRIVER_BIN)
        self.image_storage = os.path.join(PROJECT_ROOT, "image_storage/")

    def init_logger(self):
        '''
        Initialize log file.
        '''
        logger = logging.getLogger('scraper_app')
        logger.setLevel(logging.INFO)
        # create a file handler
        handler = logging.FileHandler('scraper_logfile.log')
        handler.setLevel(logging.INFO)
        # create a logging format
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        # add the handlers to the logger
        logger.addHandler(handler)
        return logger

    def init_collection(self, name):
        if name in self.db.collection_names():
            self.db[name].drop()
        return self.db[name]

    def insert_one_to_collection(self, data, collection):
        try:
            collection.insert_one(data)
        except pymongo.errors.DuplicateKeyError:
            pass

    def insert_many_to_collection(self, data, collection):
        try:
            collection.insert_many(data)
        except pymongo.errors.DuplicateKeyError:
            pass

    def download_image(self, image_url):
        '''
        download images from news articles
        to local storage
        '''
        if not image_url.startswith(("data:image", "javascript")):
            local_filename = image_url.split('/')[-1].split("?")[0]
            r = requests.get(image_url, stream=True, verify=False)
            with open(self.image_storage + local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)

    def upload_image_to_mongo(self, image_url):
        response = requests.get(image_url, stream=True)
        fs = gridfs.GridFS(self.db)
        img = response.raw.read()
        fs.put(img, filename=local_filename)

    def get_page_content(self, url):
        try:
            self.driver.get(url)
        except WebDriverException:
            self.driver = webdriver.Chrome(executable_path=DRIVER_BIN)
        page = self.driver.page_source
        return page

    def parse_page_content(self, url, parser_lib):
        page_obj = self.get_page_content(url)
        soup = BeautifulSoup(page_obj, parser_lib)
        return soup

    def tsn_categories(self):
        categories = self.gather_categories(self.tsn_resource, 'ul.c-app-nav-more-list li a')
        return categories

    def ukrnet_categories(self):
        categories = self.gather_categories(self.ukrnet_resource, 'h2.feed__section--title a')
        return categories

    def gather_categories(self, url, selector):
        categories = []
        soup = self.parse_page_content(url, "html.parser")
        all_categories = soup.select(selector)
        for item in all_categories:
            category = {}
            link = str(item.attrs.get('href'))
            if link.startswith('javascript'):
                continue
            if not link.startswith('https:'):
                link = 'https:' + link
            category['link'] = link
            category['name'] = item.get_text().strip()
            categories.append(category)
        self.insert_many_to_collection(categories, self.category_coll)
        return categories

    def search_by_category(self, category_name):
        category_name = category_name.decode('utf-8')
        category_list = []
        category_list += self.tsn_categories()
        category_list += self.ukrnet_categories()
        category_obj = next(item for item in category_list if item['name'] == category_name)
        link = category_obj['link']
        if 'ukr.net' in link:
            articles = self.get_ukrnet_articles(category_name, link)
        else:
            articles = self.get_tsn_articles(category_name, link)
        return articles

    def get_ukrnet_articles(self, category_name, url):
        '''
        retrieve all articles from ukr.net by given category link
        '''
        count = 0
        result = []
        soup = self.parse_page_content(url, "html.parser")
        all_articles = soup.select('div.im-tl a')
        for item in all_articles:
            if count <= self.limit:
                article = {}
                link = item.attrs.get('href')
                article['link'] = link
                article['category'] = category_name
                article['content'] = item.contents[0].encode('utf-8')
                result.append(article)
                self.insert_one_to_collection(article, self.articles_coll)
            else:
                break
            count += 1
        return result

    def get_tsn_articles(self, category_name, url):
        '''
        retrieve all articles from tsn.ua by given category link
        '''
        count = 0
        result = []
        data = []  # temporary storage
        # first parse through the list of articles
        soup = self.parse_page_content(url, "html.parser")
        all_articles = soup.select('div.c-entry-embed a.c-post-img-wrap')
        for item in all_articles:
            # iterate limit amount of articles
            if count <= self.limit:
                article = {}
                link = item.attrs.get('href')
                img_src = item.find('img').get('src')
                if link.endswith(".html"):
                    article['link'] = link
                    if img_src is not None:
                        article['img_src'] = img_src
                        self.download_image(img_src)
                    article['category'] = category_name
                    data.append(article)
                count += 1
            else:
                break
        # then iterate over each article
        for article in data:
            new_soup = self.parse_page_content(article['link'], "html5lib")
            news_content = new_soup.select('div.e-content p')
            text_content = []  # article content
            for chunk in news_content:
                text_content.append(chunk.get_text().strip(''))
            article_text = ' '.join(text_content)
            news_header = new_soup.select('div.c-post-meta h1')  # article title
            if news_header:
                header_text = "".join(news_header[0].contents)
            article_image = new_soup.find('figure', class_='js-lightgallery')
            if article_image:
                img_src = article_image.find('img').get('src')  # articles image
                self.download_image(img_src)
            news_chunk = {}
            news_chunk['category'] = article['category']
            news_chunk['link'] = article['link']
            news_chunk['title'] = header_text
            # news_chunk['title'] = ''
            news_chunk['content'] = article_text
            news_chunk['images'] = []
            if 'img_src' in article:
                news_chunk['images'].append(article['img_src'])  # caption image
            if article_image:
                news_chunk['images'].append(img_src)  # article image
            result.append(news_chunk)
            self.insert_one_to_collection(news_chunk, self.articles_coll)
        return result

    def search_by_text(self, text):
        category_links = []
        category_links += self.ukrnet_categories()
        category_links += self.tsn_categories()
        result = self.website_search_by_text(text, category_links)
        return result

    def website_search_by_text(self, text_searched, category_links):
        result = []
        text_searched = text_searched.decode('utf-8')
        for link in category_links:
            article = {}
            soup = self.parse_page_content(link['link'], "html.parser")
            all_articles = soup.find_all('a', text=re.compile(text_searched))
            for item in all_articles:
                article['link'] = item.attrs.get('href')
                article['category'] = link['name']
                article['content'] = (item.contents[0].strip()).encode('utf-8')
                self.insert_one_to_collection(article, self.articles_coll)
                result.append(article)
        return result

    def collect_ukrnet_articles(self):
        '''
        outdated
        '''
        categories = self.ukrnet_categories()
        for category in categories:
            count = 0
            soup = self.parse_page_content(category['link'], "html.parser")
            all_articles = soup.select('div.im-tl a')
            for item in all_articles:
                # only 10 first articles
                if count < self.limit:
                    article = {}
                    link = item.attrs.get('href')
                    article['link'] = link
                    article['category'] = category['name']
                    article['content'] = item.contents[0].encode('utf-8')
                    self.insert_one_to_collection(article, self.articles_coll)
                else:
                    break
                count += 1

    def run(self):
        self.search_by_category('Economics', self.tsn_categories())
        self.search_by_text('Economics')
        self.driver.quit()

if __name__ == '__main__':
    scraper = Scraper()
    scraper.run()
Scrapy is a solid Python framework that handles async/parallel work automatically.
There's also multiprocessing, conveniently packaged in the standard library.
And then there's multithreading, also conveniently packaged.
With the threading library there's a way to call the function you're trying to thread with map() and pass it the list of arguments you want to use: map(your_func, your_list).
I don't remember the exact link or structure for it, but it's a quick Google search away. It really makes things easier.
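The map() idea mentioned above looks roughly like the sketch below, using the thread-backed Pool from multiprocessing.dummy. The fetch_article helper and URL list are placeholders, and note that a single Selenium driver is not safe to share across threads, so this pattern fits the plain requests/BeautifulSoup parts of the scraper best:

from multiprocessing.dummy import Pool  # thread pool with the multiprocessing API
import requests
from bs4 import BeautifulSoup

def fetch_article(url):
    # download and parse one article page
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ''
    return {'link': url, 'title': title}

article_links = ['https://tsn.ua/politika', 'https://tsn.ua/prosport']  # example URLs
with Pool(8) as pool:
    results = pool.map(fetch_article, article_links)
print(results)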
