Python scrapes websites slowly - python-3.x

I've implemented a news website scraper that uses the Selenium WebDriver to load dynamic web pages and BeautifulSoup to extract the content. While parsing the sites, I also write the scraped data to MongoDB and download images. I want to implement full news search by a given category or by text that appears in the news content. What would you suggest in terms of parallelization or async code to speed this up?
# -*- coding: utf-8 -*-
import os
import json
import requests
from bs4 import BeautifulSoup
from mongo_setup import Database
import gridfs
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time
import logging
import re
import pymongo
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
DRIVER_BIN = os.path.join(PROJECT_ROOT, "bin/chromedriver")
class Scraper:
tsn_resource = 'https://tsn.ua/'
ukrnet_resource = 'https://www.ukr.net/'
db_name = 'scraper_db'
category_coll = 'categories'
articles_coll = 'articles'
def __init__(self, limit=10):
self.limit = limit # max number of articles per category
self.db = Database(self.db_name).connect_db()
self.category_coll = self.init_collection(self.category_coll)
self.articles_coll = self.init_collection(self.articles_coll)
self.logger = self.init_logger()
self.driver = webdriver.Chrome(executable_path = DRIVER_BIN)
self.image_storage = os.path.join(PROJECT_ROOT, "image_storage/")
def init_logger(self):
'''
Initialize log file.
'''
logger = logging.getLogger('scraper_app')
logger.setLevel(logging.INFO)
# create a file handler
handler = logging.FileHandler('scraper_logfile.log')
handler.setLevel(logging.INFO)
# create a logging format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(handler)
return logger
def init_collection(self, name):
if name in self.db.collection_names():
self.db[name].drop()
return self.db[name]
def insert_one_to_collection(self, data, collection):
try:
collection.insert_one(data)
except pymongo.errors.DuplicateKeyError:
pass
def insert_many_to_collection(self, data, collection):
try:
collection.insert_many(data)
except pymongo.errors.DuplicateKeyError:
pass
def download_image(self, image_url):
'''
download images from news articles
to local storage
'''
if not image_url.startswith(("data:image", "javascript")):
local_filename = image_url.split('/')[-1].split("?")[0]
r = requests.get(image_url, stream=True, verify=False)
with open(self.image_storage + local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
f.write(chunk)
def upload_image_to_mongo(self, image_url):
response = requests.get(image_url, stream=True)
fs = gridfs.GridFS(self.db)
local_filename = image_url.split('/')[-1].split("?")[0]  # was previously undefined here
img = response.raw.read()
fs.put(img, filename=local_filename)
def get_page_content(self, url):
try:
self.driver.get(url)
except WebDriverException:
self.driver = webdriver.Chrome(executable_path=DRIVER_BIN)
self.driver.get(url)  # retry with the restarted driver, otherwise page_source is stale
page = self.driver.page_source
return page
def parse_page_content(self, url, parser_lib):
page_obj = self.get_page_content(url)
soup = BeautifulSoup(page_obj, parser_lib)
return soup
def tsn_categories(self):
categories = self.gather_categories(self.tsn_resource, 'ul.c-app-nav-more-list li a')
return categories
def ukrnet_categories(self):
categories = self.gather_categories(self.ukrnet_resource, 'h2.feed__section--title a')
return categories
def gather_categories(self, url, selector):
categories = []
soup = self.parse_page_content(url, "html.parser")
all_categories = soup.select(selector)
for item in all_categories:
category = {}
link = str(item.attrs.get('href'))
if link.startswith('javascript'):
continue
if not link.startswith('https:'):
link = 'https:' + link
category['link'] = link
category['name'] = item.get_text().strip()
categories.append(category)
self.insert_many_to_collection(categories, self.category_coll)
return categories
def search_by_category(self, category_name):
category_name = category_name.decode('utf-8') if isinstance(category_name, bytes) else category_name  # str has no decode() in Python 3
category_list = []
category_list += self.tsn_categories()
category_list += self.ukrnet_categories()
category_obj = next(item for item in category_list if item['name'] == category_name)
link = category_obj['link']
if 'ukr.net' in link:
articles = self.get_ukrnet_articles(category_name, link)
else:
articles = self.get_tsn_articles(category_name, link)
return articles
def get_ukrnet_articles(self, category_name, url):
'''
retrieve all articles from ukr.net by given category link
'''
count = 0
result = []
soup = self.parse_page_content(url, "html.parser")
all_articles = soup.select('div.im-tl a')
for item in all_articles:
if count <= self.limit:
article = {}
link = item.attrs.get('href')
article['link'] = link
article['category'] = category_name
article['content'] = item.contents[0].encode('utf-8')
result.append(article)
self.insert_one_to_collection(article, self.articles_coll)
else:
break
count += 1
return result
def get_tsn_articles(self, category_name, url):
'''
retrieve all articles from tsn.ua by given category link
'''
count = 0
result = []
data = [] # temporary storage
# first parse through the list of articles
soup = self.parse_page_content(url, "html.parser")
all_articles = soup.select('div.c-entry-embed a.c-post-img-wrap')
for item in all_articles:
# iterate limit amount of articles
if count <= self.limit:
article = {}
link = item.attrs.get('href')
img_src = item.find('img').get('src')
if link.endswith(".html"):
article['link'] = link
if img_src is not None:
article['img_src'] = img_src
self.download_image(img_src)
article['category'] = category_name
data.append(article)
count += 1
else:
break
# then iterate over each article
for article in data:
new_soup = self.parse_page_content(article['link'], "html5lib")
news_content = new_soup.select('div.e-content p')
text_content = [] # article content
for chunk in news_content:
text_content.append(chunk.get_text().strip())
article_text = ' '.join(text_content)
news_header = new_soup.select('div.c-post-meta h1') # article title
header_text = '' # default so the title key is always defined
if news_header:
header_text = "".join(news_header[0].contents)
article_image = new_soup.find('figure', class_='js-lightgallery')
if article_image:
img_src = article_image.find('img').get('src') # articles image
self.download_image(img_src)
news_chunk = {}
news_chunk['category'] = article['category']
news_chunk['link'] = article['link']
news_chunk['title'] = header_text
# news_chunk['title'] = ''
news_chunk['content'] = article_text
news_chunk['images'] = []
if 'img_src' in article:
news_chunk['images'].append(article['img_src']) # caption image
if article_image:
news_chunk['images'].append(img_src) # article image
result.append(news_chunk)
self.insert_one_to_collection(news_chunk, self.articles_coll)
return result
def search_by_text(self, text):
category_links = []
category_links += self.ukrnet_categories()
category_links += self.tsn_categories()
result = self.website_search_by_text(text, category_links)
return result
def website_search_by_text(self, text_searched, category_links):
result = []
text_searched = text_searched.decode('utf-8') if isinstance(text_searched, bytes) else text_searched
for link in category_links:
soup = self.parse_page_content(link['link'], "html.parser")
all_articles = soup.find_all('a', text=re.compile(text_searched))
for item in all_articles:
article = {}  # new dict per match; reusing one dict made every result point to the same object
article['link'] = item.attrs.get('href')
article['category'] = link['name']
article['content'] = (item.contents[0].strip()).encode('utf-8')
self.insert_one_to_collection(article, self.articles_coll)
result.append(article)
return result
def collect_ukrnet_articles(self):
'''
outdated
'''
categories = self.ukrnet_categories()
for category in categories:
count = 0
soup = self.parse_page_content(category['link'], "html.parser")
all_articles = soup.select('div.im-tl a')
for item in all_articles:
# only 10 first articles
if count < self.limit:
article = {}
link = item.attrs.get('href')
article['link'] = link
article['category'] = category['name']
article['content'] = item.contents[0].encode('utf-8')
self.insert_one_to_collection(article, self.articles_coll)
else:
break
count += 1
def run(self):
self.search_by_category('Economics')  # search_by_category builds the category list itself
self.search_by_text('Economics')
self.driver.quit()
if __name__ == '__main__':
scraper = Scraper()
scraper.run()

Scrapy is a solid Python framework that handles async/parallel crawling for you.
There's also multiprocessing, conveniently packaged in one standard-library module.
And then there's multithreading, likewise available as a single module.
With the multithreading library you can call the function you want to parallelize through map(), passing it the list of inputs: map(your_func, your_list).
I don't remember the exact link or structure for it, but it's a quick Google search away. It really makes things easier.
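For example, a minimal sketch of that map() idea using concurrent.futures from the standard library; fetch_page and category_urls are hypothetical names rather than parts of the scraper above, and a single Selenium driver instance is not safe to share across threads, so the sketch sticks to plain requests:
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_page(url):
    # HTTP requests are I/O-bound, so threads help despite the GIL
    return requests.get(url, timeout=10).text

category_urls = ['https://tsn.ua/politika', 'https://tsn.ua/prosport']  # placeholder URLs
with ThreadPoolExecutor(max_workers=8) as executor:
    pages = list(executor.map(fetch_page, category_urls))  # results come back in input order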

Related

How can I scrape product data from flexbox elements with python HTMLSession method?

I am facing an issue parsing product details because of the flexbox design. I use the same method on other websites and it works, but on some websites that use a flexbox layout I can't parse any data.
from requests_html import HTMLSession
import csv
url = 'https://gpltheme.com/product-category/themeforest/'
s = HTMLSession()
def get_links(url):
r = s.get(url)
items = r.html.find('div.product-element-top.wd-quick-shop')
links = []
for item in items:
links.append(item.find('a',first=True).attrs['href'])
return links
def get_productdata(link):
r = s.get(link)
title = r.html.find('h1', first=True).full_text
category = r.html.find('a.breadcrumb-link.breadcrumb-link-last', first=True).full_text
product = {
'title': title.strip(),
'category': category.strip(),
}
print(product)
return product
links = get_links(url)
results = []
for link in links:
results.append(get_productdata(link))
with open('gplproduct_py2.csv', 'w', encoding='utf8', newline='') as f:
wr = csv.DictWriter(f, fieldnames=results[0].keys(),)
wr.writeheader()
wr.writerows(results)
print('Fin.')
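If the missing products are injected by JavaScript (an assumption; flexbox itself is only CSS and would not hide markup from the parser), requests_html can execute the page's scripts with render() before querying. A minimal sketch under that assumption, reusing the selector from the code above:
from requests_html import HTMLSession

s = HTMLSession()
r = s.get('https://gpltheme.com/product-category/themeforest/')
r.html.render(timeout=20)  # runs the page's JavaScript (downloads Chromium on first use)
items = r.html.find('div.product-element-top.wd-quick-shop')
links = [item.find('a', first=True).attrs['href'] for item in items]
print(len(links), 'links found')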

concurrent.futures is not working properly with BeautifulSoup, not fetching all the links

In this code I want to extract content from newspaper links using BeautifulSoup, but it is not working properly: each link in the list "filtered_Final_LIST" points to a page with multiple articles, and the function ext_url does not return results for all the pages when I use the concurrent.futures library.
A normal for loop works fine. I used concurrent.futures to speed up extraction. Am I doing something wrong?
import concurrent.futures
import re
import time
import urllib.request
from bs4 import BeautifulSoup
MAX_THREADS = 30
filtered_Final_LIST = ['https://www.financialexpress.com/economy/finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/"',
'https://www.financialexpress.com/economy/uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/"',
'https://www.financialexpress.com/economy/economic-recovery-yet-to-attain-durability-says-report/2410690/"',
'https://www.financialexpress.com/economy/vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/"']
def ext_url(url):
global List_articles, List_header, List_date, List_month, List_year, List_source
## Lists to get dates and news articles
List_articles = []
List_header = []
List_date = []
List_month = []
List_year = []
List_source = []
# for i in range(len(filtered_Final_LIST)):
# if 'https://www.financialexpress.com/economy/' in str(Final_LIST[i]):
# opening the url for reading
html = urllib.request.urlopen(url , timeout = 10)
print(url)
# parsing the html file
htmlParse = BeautifulSoup(html, 'html.parser')
# getting all the paragraphs of articles
for para in htmlParse.find_all(['div'], class_='entry-content wp-block-post-content'):
List_articles.append(para.get_text())
# Getting respective month, date, year the article published
from datetime import datetime
date = htmlParse.find(itemprop="article:published_time").get("content")
match = re.search(r'\d{4}-\d{2}-\d{2}', date)
dt = datetime.strptime(match.group(), '%Y-%m-%d').date()
List_month.append(dt.month)
List_date.append(dt.day)
List_year.append(dt.year)
# getting all the headings of articles
for para in htmlParse.find_all(['h1'], class_='wp-block-post-title'):
List_header.append(para.get_text())
# getting all the source of articles
for para in htmlParse.find_all(['div'], class_='author-link ie_custom_theme_multiple_authors'):
List_source.append(para.get_text())
return List_articles, List_header, List_date, List_month, List_year, List_source
with concurrent.futures.ThreadPoolExecutor() as executor :
for i in range(len(filtered_Final_LIST)):
executor.submit(ext_url, (filtered_Final_LIST[i]))
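One thing worth checking, independent of the rewrite below: the futures returned by executor.submit() are never collected, and ext_url reassigns shared global lists on every call, so threads overwrite each other's results. A minimal sketch of gathering each call's return value instead, reusing ext_url, MAX_THREADS and filtered_Final_LIST from above:
import concurrent.futures

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = [executor.submit(ext_url, url) for url in filtered_Final_LIST]
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())  # re-raises any exception from the worker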
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
# pip install trio httpx
mainurl = 'https://www.financialexpress.com/economy/'
news = [
'finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/',
'uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/',
'economic-recovery-yet-to-attain-durability-says-report/2410690/',
'vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/'
]
allin = []
async def get_soup(content):
return BeautifulSoup(content, 'lxml')
async def worker(receiver):
async with receiver:
async for client, new in receiver:
r = await client.get(mainurl + new)
soup = await get_soup(r.text)
prs = [x.text for x in soup.select(
'.entry-content > p:not(:last-child)')]
title = soup.select_one('.wp-block-post-title').text
author = soup.select_one('div.author-link a').text
publish = soup.select_one(
'[itemprop="article:published_time"]')['content'].split('T')[0].split('-')
target = [title, author, *publish, prs]
allin.append(target)
async def main():
async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
sender, receiver = trio.open_memory_channel(0)
async with receiver:
for _ in range(5):
nurse.start_soon(worker, receiver.clone())
async with sender:
for new in news:
await sender.send([client, new])
if __name__ == "__main__":
trio.run(main)
df = pd.DataFrame(
allin, columns=['Title', 'Author', 'Year', 'Month', 'Day', 'Paragraphs'])
print(df)
df.to_csv('data.csv', index=False)

Script isn't retrieving all the info

I tried making a Python script that gets all the fighter names and their records from boxrec.com. The issue is that it doesn't retrieve them all (Floyd Mayweather is missing), and some appear several times (Success Tetteh, for example).
The output is too big to post here: https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/
Edit: for some fighters the records are wrong (Vasyl Lomachenko, for example, appears with 28 wins, but he has 14).
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
def main():
fighter_names = []
fighter_wins = []
fighter_losses = []
fighter_draws = []
username = "username"
password = "password"
site = "https://boxrec.com/en/login"
payload = {
'_username': username,
'_password': password,
'login[go]': None
}
with Session() as s:
s.get(site)
s.post(site, data=payload, headers={
"Content-Type": "application/x-www-form-urlencoded"
})
pages = numpy.arange(1, 19152, 20)
for page in pages:
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
if not names_a:
print("solving captcha")
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
pyautogui.click(x=118, y=1061)
time.sleep(1)
pyautogui.click(x=1035, y=619)
time.sleep(2)
pyautogui.click(x=97, y=59)
time.sleep(1)
pyautogui.click(x=834, y=247)
time.sleep(2)
if not names_a:
print("please solve captcha manually")
while not names_a:
page = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(page.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
wins_span = soup.find_all('span', class_='textWon')
loses_span = soup.find_all('span', class_='textLost')
draws_span = soup.find_all('span', class_='textDraw')
for container in names_a:
name = container.text
print(name)
fighter_names.append(name)
for container in wins_span:
wins = container.text
fighter_wins.append(wins)
for container in loses_span:
losses = container.text
fighter_losses.append(losses)
for container in draws_span:
draws = container.text
fighter_draws.append(draws)
fighters = {
"name": fighter_names,
"wins": fighter_wins,
"loses": fighter_losses,
"draws": fighter_draws
}
df = pd.DataFrame.from_dict(fighters, orient="index")
df = df.transpose()
df.to_csv("fighters.csv")
if __name__ == '__main__':
main()
I would refrain from using the same variable name for two separate things: you have page holding both the offset and the response object, which can be confusing.
As for the missing and duplicated fighters, I assume the lists get out of sync at some point, so the records stop lining up with the correct fighter names, or something is off with the site's actual data/HTML; I haven't debugged it. That said, have you considered using pandas to parse the table and then splitting the 'w-l-d' column? It would be far easier to let pandas do the parsing than to risk missing something across the 900+ pages you need to go through.
See if this helps:
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
import math
def main():
final_df = pd.DataFrame()
username = 'username'
password = 'password'
site = "https://boxrec.com/en/login"
payload = {
'_username': username,
'_password': password,
'login[go]': None
}
with Session() as s:
s.get(site)
s.post(site, data=payload, headers={
"Content-Type": "application/x-www-form-urlencoded"
})
pages = numpy.arange(1, 19152, 20)
for page in pages:
response = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(response.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
if not names_a:
print("solving captcha")
response = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(response.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
pyautogui.click(x=118, y=1061)
time.sleep(1)
pyautogui.click(x=1035, y=619)
time.sleep(2)
pyautogui.click(x=97, y=59)
time.sleep(1)
pyautogui.click(x=834, y=247)
time.sleep(2)
if not names_a:
print("please solve captcha manually")
while not names_a:
response = s.get(
"https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
"%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
+ str(page))
soup = BeautifulSoup(response.text, 'html.parser')
names_a = soup.find_all('a', class_='personLink')
df = pd.read_html(response.text)[-1]
df = df[['name','w-l-d']]
df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")] # <--- ADD THIS LINE
df[['wins','loses','draws']] = df['w-l-d'].str.split(expand=True)
df = df.drop('w-l-d', axis=1)
print('Page: %d of %d' %(((page-1)/20)+1,math.ceil(19152/20)))
final_df = final_df.append(df, sort=False).reset_index(drop=True)
final_df.to_csv("fighters.csv")
if __name__ == '__main__':
main()

Not getting multithreading to work for GTK 3 Python

I am trying to get the themes from gnome-look.org and create widgets by scraping the website.
I want to show the window first and then update the necessary GTK widgets in the background from another thread.
Here is my code:
#!/usr/bin/python3
import gi
gi.require_version('Gtk', '3.0')
from gi.repository import Gtk, GdkPixbuf
import requests
import sys
import gi
import shutil
from bs4 import BeautifulSoup
import dryscrape
import json
import urllib.parse
import concurrent.futures
import threading
class ReadGnomeLook:
def format_bytes(self, size):
# 2**10 = 1024
power = 2**10
n = 0
power_labels = {0 : '', 1: 'KB', 2: 'MB', 3: 'GB'}
while size > power:
size /= power
n += 1
return str("{:.2f}".format(size)) +' ' + str(power_labels[n])
def getDownloadLinks(self, childURL):
#childURL = "https://www.gnome-look.org/s/Gnome/p/1519633"
childURL = childURL+"#files-panel"
session = dryscrape.Session()
session.set_attribute('auto_load_images', False)
session.visit(childURL)
response = session.body()
soup = BeautifulSoup(response, features='lxml')
downloadlink = []
allscripts = soup.find_all('script', {"src":False})
for each_script in range(len(allscripts)):
content = str(allscripts[each_script]).split("var")
for indx in content:
if 'filesJson' in str(indx):
content = indx.replace('filesJson = ','').replace(';','')
content = json.loads(content)
links = []
for each_item in content:
if each_item['active'] == '1':
links.append({'name':each_item['name'],'type':each_item['type'],'size':format_bytes(int(each_item['size'])),'md5sum':each_item['md5sum'],'title':each_item['title'],'description':each_item['description'],'url':urllib.parse.unquote(each_item['url'])})
for each in links:
downloadlink.append(each)
return downloadlink
def readWebpage(self, URL):
myProducts = []
baseURL="https://www.gnome-look.org"
#URL = "https://www.gnome-look.org/browse/cat/132/order/latest/"
session = dryscrape.Session()
session.set_header('Host','www.gnome-look.org')
session.visit(URL)
response = session.body()
soup = BeautifulSoup(response, features='lxml')
#print(soup)
#soup.find(class="product-browse-item-info")
mydivs = soup.find_all("div", {"class": "product-browse-item picture"})
for mydiv in mydivs:
myProducts.append([
{'name' : mydiv.div.a.findAll("div",{"class":"product-browse-item-info"})[0].h2.text},
{'category' : mydiv.div.a.findAll("div",{"class":"product-browse-item-info"})[0].findAll("span")[0].text},
{'author' : mydiv.div.a.findAll("div",{"class":"product-browse-item-info"})[0].findAll("span")[1].b.text},
{'img' : mydiv.div.a.div.img['src']},
{'href' : mydiv.div.a['href']}
])
productCatalog = []
for elements in myProducts:
productCatalog.append([
{
'Name':elements[0]['name'],
'Category': elements[1]['category'],
'Author': elements[2]['author'],
'Image': elements[3]['img'],
'Link': baseURL + elements[4]['href'],
#'DownloadLinks': getDownloadLinks(baseURL + elements[4]['href'])
}
])
return productCatalog
class AppicationWindow(Gtk.Window):
def __init__(self):
Gtk.Window.__init__(self, title="Themes Manager")
# Main Application Window
# self.set_title("Themes Manager v1.0")
#self.set_default_size(400, 400)
self.set_position(Gtk.WindowPosition.CENTER)
self.connect("destroy",Gtk.main_quit)
# Create Image and Title Grid
#self.image = self.getImageFromWeb('https://media.wired.com/photos/592697678d4ebc5ab806acf7/master/w_2560%2Cc_limit/GooglePlay.jpg')
#self.image.set_from_file("android-download.png")
#image.set_size(200,200)
print("Before 1st Show all")
self.show_all()
z = threading.Thread(target=self.doProcessing(),daemon=True)
z.start()
print("Started Z thread")
def doProcessing(self):
# Grid for Full Icon Themes
self.gridfulliconthemes = Gtk.FlowBox(valign = Gtk.Align.START)
self.gridfulliconthemesscroll = Gtk.ScrolledWindow(hexpand=True, vexpand=True) # Create scroll window
self.gridfulliconthemesscroll.add(self.gridfulliconthemes) # Adds the TreeView to the scroll container
self.getProductCatalog()
## Start
self.URLs = []
self.labels = []
self.images = []
self.threads = []
for each_item in self.productCatalog:
image = Gtk.Image()
image.new_from_file('/tmp/82682596e6c89475b2f21221d5dc61927887.png')
self.images.append(image)
self.labels.append(Gtk.Label("loading"))
for each_item in range(0,len(self.productCatalog)):
#print(each_item[0]['Name'])
self.URLs.append(self.productCatalog[each_item][0]['Image'])
vertical_box = Gtk.Box()
vertical_box.set_homogeneous(True)
vertical_items = Gtk.FlowBox(valign = Gtk.Align.START)
vertical_items.set_max_children_per_line(1)
label = Gtk.Label()
label.set_text(self.productCatalog[each_item][0]['Name'])
label.set_line_wrap(True)
label.set_max_width_chars(10)
label.set_hexpand(True)
self.labels.append(label)
#image = Gtk.Image()
#self.images.append(image)
vertical_items.add(self.images[each_item])
vertical_items.add(self.labels[each_item])
vertical_box.add(vertical_items)
vertical_box.connect("button-press-event", self.do_anything)
self.gridfulliconthemes.add(vertical_box)
## End
# Create Notebook to add to the Window
self.notebook = Gtk.Notebook()
self.add(self.notebook)
self.fullicontheme = Gtk.Label()
self.fullicontheme.set_text("Full Icon Themes")
self.gtkthemes = Gtk.Label()
self.gtkthemes.set_text("Gtk 3/4 Themes")
self.gnomeshellthemes = Gtk.Label()
self.gnomeshellthemes.set_text("Gnome Shell Themes")
self.fulliconthemepage = Gtk.Label()
self.fulliconthemepage.set_text("Full Icon Themes Page")
self.gtkthemespage = Gtk.Label()
self.gtkthemespage.set_text("GTK themes Page")
self.gnomeshellthemespage = Gtk.Label()
self.gnomeshellthemespage.set_text("Gnome Shell Themes Page")
#notebook.append_page(fullicontheme, Gtk.Label("Icon Page"))
self.notebook.append_page(self.gridfulliconthemesscroll, self.fulliconthemepage)
self.notebook.append_page(self.gtkthemes, self.gtkthemespage)
self.notebook.append_page(self.gnomeshellthemes, self.gnomeshellthemespage)
self.notebook.set_tab_reorderable(self.gridfulliconthemesscroll, True)
#self.add(hb)
#self.show_all()
#threadtemp = threading.Thread(target=self.getImageFromWeb(each_item[0]['Image']))
#self.threads.append(threadtemp)
#self.getAllImages()
x = threading.Thread(target=self.getAllImages(),daemon=True)
x.start()
self.show_all()
def getProductCatalog(self):
# Download Links from GnomeLook.org
URL = "https://www.gnome-look.org/s/Gnome/browse/cat/132/page/2/ord/latest/"
readgnomelook = ReadGnomeLook()
self.productCatalog = readgnomelook.readWebpage(URL)
#print(json.dumps(productCatalog, sort_keys=False, indent=4))
def getAllImages(self):
for i in range(0,len(self.productCatalog)):
#self.images.append(self.getImageFromWeb(self.productCatalog[i][0]['Image']))
#self.images[i] = self.getImageFromWeb(self.productCatalog[i][0]['Image'],self.images[i])
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
future = executor.submit(self.getImageFromWeb, self.productCatalog[i][0]['Image'], self.images[i])
# #self.images.append(future.result())
self.images[i]= future.result()
# #print(type(self.images[i]))
def do_anything(self):
print("clicked on box")
def getImageFromWeb(self, URL,image):
filename = '/tmp/'+URL.split("/")[-1]
try:
f = open(filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file_at_scale(
filename=filename,
width=100,
height=100,
preserve_aspect_ratio=False)
Gtk.Image.set_from_pixbuf(image, pixbuf)
#image.set_from_file(filename)
#print("Got the image : " + filename)
#del r
return image
except IOError:
#print("File not accessible")
r = requests.get(URL,stream=True)
if r.status_code == 200:
with open(filename,'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
pixbuf = GdkPixbuf.Pixbuf.new_from_file_at_scale(
filename=filename,
width=200,
height=200,
preserve_aspect_ratio=False)
Gtk.Image.set_from_pixbuf(image, pixbuf)
#image.set_from_file(filename)
#print("Got the image : " + filename)
return image
else:
#print("Failed to get the image : " + filename)
return None
del r
window = AppicationWindow()
#window.connect("destroy",Gtk.main_quit)
#window.show_all()
Gtk.main()
The code works, but in the snippet below doProcessing() runs to completion before "Started Z thread" is printed:
print("Before 1st Show all")
self.show_all()
z = threading.Thread(target=self.doProcessing(),daemon=True)
z.start()
print("Started Z thread")
As I understand it, doProcessing should start in the background and "Started Z thread" should be printed immediately, but that's not happening.
Am I missing anything here? Any help is appreciated.
Thanks, Debasish
z = threading.Thread(target=self.doProcessing(),daemon=True)
threading.Thread expects a callable, not the result of calling one; target=self.doProcessing() runs the method immediately on the main thread and passes its return value (None) as the target. Try:
z = threading.Thread(target=self.doProcessing,daemon=True)
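Note also that GTK widgets should only be touched from the main thread, so even with the corrected target the usual pattern is to do the slow work in the worker thread and push widget updates back to the main loop with GLib.idle_add. A minimal, self-contained sketch of that pattern (a toy window, not the application above):
#!/usr/bin/python3
import threading
import time
import gi
gi.require_version('Gtk', '3.0')
from gi.repository import Gtk, GLib

win = Gtk.Window(title="Thread demo")
label = Gtk.Label(label="loading...")
win.add(label)
win.connect("destroy", Gtk.main_quit)
win.show_all()

def worker():
    time.sleep(2)                          # stands in for the slow scraping work
    GLib.idle_add(label.set_text, "done")  # schedule the widget update on the GTK main loop

threading.Thread(target=worker, daemon=True).start()
Gtk.main()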

Crawling Craigslist with Python (not Scrapy)

I am trying to crawl Craigslist jobs using Python (I am not using Scrapy). Can anyone please help fix the code below? Please don't suggest Scrapy.
This is the URL: https://chicago.craigslist.org/
First I extract the job categories, then the job listings, then the job details; I have also written code to crawl the next page.
import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text
def get_category_list(content):
return category_pat.findall(content)[90:121]
def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result
def get_job_list(content):
result = job_list_pat.findall(content)
return result
def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)
job_dict = {'jobname': job_name, 'category': category_name}
job_dict['JOBURL'] = job_url
print('scraping', job_name)
content = get_page_content(job_url)  # fetch the job page itself; get_category_list expects page HTML, not a URL
description = get_job_details(content)
job_dict['Description'] = description
print(job_dict)
def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)
for job_info in job_list:
scrape_job_info(job_info, category_name)
next_page = get_next_page(content)
if next_page is None:
break
category_url = next_page
def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)
for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)
if __name__ == '__main__':
url = 'https://chicago.craigslist.org'
response = requests.get(url)
content = response.text
category_pat = re.compile(r'<li><a href=\"(\/d\/[\w\-]+\/\w+\/\w+)\".+txt\">([\w\-\+\s+\/\<]+)<sup class')
next_page_pat = re.compile(
r'<a href=\"\/(.*)\" class=\"button next\" title=\"next\s+page\">next > <\/a>\s+<span class=\"button next\" title=\"next page\">\s+next >\s+<\/span>\s+<\/span>\s+<\/div>\s+<\/div>\s+.+\s+.+')
job_list_pat = re.compile(r'([\w\s*]+)')
desc_pat = re.compile(r'<\/div>\s*<section id=\"postingbody\">.+html\"><\/div>\s*<\/div>(.+)<\/section><ul')
img_pat = re.compile(r'<img src=\"(.*jpg)\" title')
crawl_website()
