I modified an old script and got it running. However, the old values weren't outputting anything to JSON. I am pretty new to scraping and am practicing on indeed.com. Also, how would I pull the keyword I am searching for, "remote", so I can list it as "Job Type"? I am also unsure whether I have the correct URL and rule. Thanks.
I know the script runs, but I need help with the response.css or response.xpath expressions. I can find all the XPath values, but some don't work: XPathing "jobtitle" gives me a bunch of code, like the onmouse click attributes, as well as the title. Code below.
class IndeedSpider(CrawlSpider):
name = "indeed"
allowed_domains = ["indeed.com"]
start_urls = [
"https://www.indeed.com/jobs?q=remote&l=",
]
rules = (
Rule(LinkExtractor(allow=('/jobs.q=linux&l=remote&l$','q=linux&l=remote&sort=l&start=[0-9]+$',),deny=('/my/mysearches', '/preferences', '/advanced_search','/my/myjobs')), callback='parse_item', follow=True),
)
def parse_next_site(self, response):
item = response.request.meta['item']
item['source_url'] = response.url
item['source_page_body'] = response.body
item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
def parse_item(self, response):
self.log('\n Crawling %s\n' % response.url)
hxs = Selector(response)
sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
#sites = hxs.select("//div[#class='row ']")
items = []
for site in sites:
item = IndeedItem(company='none')
# Producing output with onmouse click. etc. Gets title as well.
item['job_title'] = site.select("//a[contains(concat(' ', normalize-space(@class), ' '),' jobtitle ')]").extract()
# link not working
link_url= site.select('h2/a/@href').extract()
item['link_url'] = link_url
item['crawl_url'] = response.url
item['location'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' location ')]/text()").extract()
# salary returns ''
item['salary'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' salaryText ')]").extract()
# Not all entries have a company. got a lot of , '\n
if site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract() == []:
item['company'] = [u'']
else:
item['company'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract()
# Summary seems to work
item['summary'] = site.select("//div[contains(concat(' ', normalize-space(@class), ' '),' summary ')]").extract()
item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
item['found_date'] = site.select("table/tr/td/span[@class='date']/text()").extract()
#item['source_url'] = self.get_source(link_url)
request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
request.meta['item'] = item
yield request
items.append(item)
return
SPIDER=IndeedSpider()
Perhaps someone can test the existing code to see some of the output, and tell me what I need to do to fix what's not working. It would really help me move forward, figure out what I'm doing wrong, and understand how these things work. Again, thanks.
When iterating over Scrapy selectors with XPath, use './/myxpath' to make the path relative; you can look at the code example here.
Hope it helps :)
from scrapy.spiders import CrawlSpider
from scrapy.http import Request, Response
from scrapy.linkextractors import LinkExtractor
import time
from ..items import IndeedItem  # assuming IndeedItem is defined in the project's items.py
class IndeedSpider(CrawlSpider):
name = "indeed"
allowed_domains = ["indeed.com"]
start_urls = [
"https://www.indeed.com/jobs?q=remote&l=",
]
def start_requests(self):
for link in IndeedSpider.start_urls:
yield Request(url=link, callback=self.parse_site)
def parse_site(self, response: Response):
extracted_links = LinkExtractor(
allow=['/jobs.q=linux&l=remote&l$', 'q=linux&l=remote&sort=l&start=[0-9]+$'],
deny=['/my/mysearches', '/preferences', '/advanced_search', '/my/myjobs']) \
.extract_links(response)
for link in extracted_links:
yield Request(url=link.url, callback=self.parse_item)
def parse_item(self, response: Response):
self.log('\n Crawling %s\n' % response.url)
sites = response.xpath("//div[@class='row ' or @class='row lastRow']")
# sites = hxs.select("//div[#class='row ']")
items = []
for site in sites:
item = IndeedItem(company='none')
# Producing output with onmouse click. etc. Gets title as well.
# when Iterating over selectors use .// to use a relative xpath
item['job_title'] = site.xpath(".//a[has-class('jobtitle')]").get()
# link not working
link_url = site.xpath('.//h2/a/@href').get()
item['link_url'] = link_url
item['crawl_url'] = response.url
item['location'] = site.xpath(".//span[has-class('location')]/text()").get()
# salary returns ''
item['salary'] = site.xpath(".//span[has-class('salaryText')]").get()
# Not all entries have a company. got a lot of , '\n
if not site.xpath(".//span[has-class('company')]/text()").getall():
item['company'] = [u'']
else:
item['company'] = site.xpath(".//span[has-class('company')]/text()").get()
# Summary seems to work
item['summary'] = site.xpath(".//div[has-class('summary')]").get()
item['source'] = site.xpath(".//table/tr/td/span[@class='source']/text()").get()
item['found_date'] = site.xpath(".//table/tr/td/span[@class='date']/text()").get()
# item['source_url'] = self.get_source(link_url)
request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
request.meta['item'] = item
yield request
items.append(item)
def parse_next_site(self, response: Response):
item = response.request.meta['item']
item['source_url'] = response.url
item['source_page_body'] = response.body
item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
yield item
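As for pulling the keyword you searched for ("remote") so you can list it as "Job Type": one option, not shown in the answer above, is to read the q parameter back out of the crawled URL with the standard library. A minimal sketch (the job_type field is hypothetical; you would need to add it to IndeedItem first):
from urllib.parse import urlparse, parse_qs
def search_keyword_from_url(url):
    # "https://www.indeed.com/jobs?q=remote&l=" -> "remote"
    query = parse_qs(urlparse(url).query)
    return query.get('q', [''])[0]
Inside parse_item you could then set item['job_type'] = search_keyword_from_url(response.url).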
Related
I am trying to scrape the rank, name, and URL of a company from a website. This involves two pages, and I have nested functions to get all the information I need. However, when I try to print the details I get an error that the company_url variable is not defined. I thought that calling the company_button_url function within the main function would do the job, but something is wrong. I have tried calling company_button_url() at different points in the code, but cannot get it to work.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
# Handle 403- Forbidden Error
url = 'https://www.b.co.uk/the-lists/mid-companies/'
req = Request(url, headers={'User-Agent': 'Mozilla'})
html = urlopen(req).read()
html_page = html.decode('utf-8')
soup = BeautifulSoup(html_page, 'html.parser') # create soup object
'''Main Function'''
def company_details():
# find rank
rank = soup.find('div', class_="company-score-redesign").text
# find company name
company_name = soup.find('div', class_="company-name-redesign").text
# find company website
''' Find Button Url...Parse HTML from new Url...Find Company Website '''
def company_button_url():
comp = soup.find('div', class_="company-name-redesign-mobile")
comp_btn = comp.find('a', href = True)
comp_btn_url = comp_btn['href']
new_url = comp_btn_url
# Handle 403- Forbidden Error
new_req = Request(new_url, headers={'User-Agent': 'Mozilla'})
nhtml = urlopen(new_req).read() # Getting new page
nhtml_page = nhtml.decode('utf-8')
nsoup = BeautifulSoup(nhtml_page, 'html.parser') # create new soup object
div_company_url = nsoup.find('div', class_="profile-info")
href_company_url = div_company_url.find('a', href = True)
company_url = href_company_url['href']
return company_url
company_button_url()
print(rank, company_name, company_url)
return()
company_details()
Feel very free to pull my coding to pieces - I am very new to this!
Thanks in advance.
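For what it's worth, the NameError happens because company_url is local to company_button_url, and the call at the end discards the returned value, so the name never exists inside company_details. A minimal, self-contained sketch of the idea, using hypothetical names:
def outer():
    def inner():
        value = 'https://example.com'  # stands in for company_url
        return value
    # capture the nested function's return value; calling inner() on its own would discard it
    result = inner()
    print(result)
outer()
In the code above, the equivalent would be company_url = company_button_url() before the print.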
My spider saves the data from one page only; it doesn't go on to the second page and doesn't show any errors.
import scrapy
from ..items import QoutetutorialItem
class QouteSpider(scrapy.Spider):
name = 'qoute'
page_num =2;
allowed_domains = ['http://quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/page/1/']
def parse(self, response):
all_div_quote = response.css("div.quote")
items = QoutetutorialItem()
for x in all_div_quote:
title = x.css("span.text::text").extract();
author = x.css(".author::text").extract();
tag = x.css(".tag::text").extract();
items['title'] = title
items['author'] = author
items['tag'] = tag
yield items
next_page = 'http://quotes.toscrape.com/page/'+str(QouteSpider.page_num)+'/'
# if next_page is not None:
if QouteSpider.page_num <11:
QouteSpider.page_num+=1
yield response.follow(next_page , callback= self.parse)
Simply do this: fetch the next-page URL from the page source, since it is present there, and then make a request to it. This is how it will look:
next_page = response.css('.next ::attr(href)').get()
if next_page:
yield response.follow(next_page, callback=self.parse)
This will solve your issue, and you also no longer need to calculate the next-page URL yourself.
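Put together with your item extraction, the parse method could look roughly like this (a sketch that keeps your QoutetutorialItem fields as they are):
def parse(self, response):
    for quote in response.css("div.quote"):
        items = QoutetutorialItem()
        items['title'] = quote.css("span.text::text").extract()
        items['author'] = quote.css(".author::text").extract()
        items['tag'] = quote.css(".tag::text").extract()
        yield items
    # take the "Next" link from the page itself; the loop stops on the last page, where it is absent
    next_page = response.css('.next ::attr(href)').get()
    if next_page:
        yield response.follow(next_page, callback=self.parse)
Also note that allowed_domains should contain bare domain names (e.g. 'quotes.toscrape.com'), not full URLs, otherwise the follow-up requests may be filtered as off-site.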
I'm trying to use the Scrapy module in Python to scrape the doctor details, but I am currently stuck trying to get the pagination crawler to work. I'm getting the output partially right, but as I said previously, it is not scraping the following pages on the website.
import scrapy
from time import sleep
from ..items import SunwayscrapyItem
class SunwaySpider(scrapy.Spider):
name = "sunway"
page_number = 20
allowed_domains = ['https://www.sunwaymedical.com/find-a-doctor/']
start_urls = [
'https://www.sunwaymedical.com/find-a-doctor/search/0/?specialty=&doctor=&name='
]
def parse(self, response):
# all_details = response.css('.col-lg-9')
# for details in all_details:
for SunwaySpider.page_number in range(0, 220, 20):
items = SunwayscrapyItem()
next_page = "https://www.sunwaymedical.com/find-a-doctor/search/" + str(
SunwaySpider.page_number) + "/?specialty=&doctor=&name="
if SunwaySpider.page_number < 220:
name = response.css('.doctor_name a::text').extract()
specialty = response.css('.doc_label3:nth-child(4)::text').extract()
language = response.css('.doc_label3:nth-child(8)::text').extract()
gender = response.css('.doc_label3:nth-child(12)::text').extract()
qualifications = response.css('.doc_label3:nth-child(16)::text').extract()
location = response.css('.doc_label3:nth-child(20)::text').extract()
contact = response.css('.doc_label3 a::text').extract()
items['Name'] = name
items['Specialty'] = list(map(str.strip, specialty))
items['Languages'] = list(map(str.strip, language))
items['Gender'] = list(map(str.strip, gender))
items['Qualifications'] = list(map(str.strip, qualifications))
items['Location'] = list(map(str.strip, location))
items['Contact'] = list(map(str.strip, contact))
yield items
sleep(3)
yield response.follow(next_page, callback=self.parse)
You are not structuring the pagination properly. It is not advisable to implement pagination and the yielding of items in a single method. Take a look at the sample code below:
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from w3lib.url import add_or_replace_parameter
class AnswersMicrosoft(CrawlSpider):
name = 'answersmicrosoft'
allowed_domains = ['answers.microsoft.com']
start_urls = ['https://answers.microsoft.com/en-us']
listings_css = ['#categoryListGridMed', '.nav-links']
products_css = ['#threads .c-card .thread-title']
rules = (
Rule(LinkExtractor(restrict_css=products_css), callback='parse_item'),
Rule(LinkExtractor(restrict_css=listings_css), callback='parse_pagination'),
)
def parse_pagination(self, response):
forum_id_css = '#currentForumId::attr(value)'
forum_id = response.css(forum_id_css).get()
url = 'https://answers.microsoft.com/en-us/forum/forumthreadlist?forumId=' + forum_id
yield Request(url, callback=self.get_max_page, meta={'url': response.url})
def get_max_page(self, response):
max_page_css = '.currentMaxPage::attr(value)'
max_page = int(response.css(max_page_css).get())
url = response.url
for page in range(max_page):
updated_url = add_or_replace_parameter(url, 'page', page)
yield Request(updated_url, callback=self.parse)
def parse_item(self, response):
article = AnswersMicrosoftItem()
article["title"] = self.get_title(response).strip()
article["url"] = response.url
article["votes"] = self.get_votes(response)
article["replies"] = self.get_replies(response)
article["category"] = self.get_category(response)
article["views"] = self.get_views(response)
article["date"] = self.get_date(response).strip()
article["last_updated"] = self.get_last_updated(response).strip()
yield article
See how parse_pagination is implemented and how the rules are set up to call the methods. If you are a novice and don't know much about rules, I suggest you give them a look; they will help you a lot in your journey ahead. Also, try to take a modular approach.
The rules above do only two things: if they see a product, they call parse_item, and if they see a next page, they call parse_pagination.
I hope you understand my point. Best of luck!
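Applied to your spider, a rough sketch of that separation might look like this (assumptions: the site really does page through .../search/0/, .../search/20/, ... up to 220, and your SunwayscrapyItem fields stay as they are; the CSS selectors are copied from your code, not re-verified):
import scrapy
from ..items import SunwayscrapyItem
class SunwaySpider(scrapy.Spider):
    name = "sunway"
    allowed_domains = ['sunwaymedical.com']  # bare domain, no scheme or path
    start_urls = ['https://www.sunwaymedical.com/find-a-doctor/search/0/?specialty=&doctor=&name=']
    def parse(self, response):
        # pagination lives here and nowhere else: queue every results page once
        for offset in range(0, 220, 20):
            url = ('https://www.sunwaymedical.com/find-a-doctor/search/'
                   + str(offset) + '/?specialty=&doctor=&name=')
            yield scrapy.Request(url, callback=self.parse_doctors)
    def parse_doctors(self, response):
        # item extraction lives here, separate from pagination
        items = SunwayscrapyItem()
        items['Name'] = response.css('.doctor_name a::text').extract()
        items['Contact'] = [c.strip() for c in response.css('.doc_label3 a::text').extract()]
        # ... remaining fields (Specialty, Languages, etc.) as in your original parse
        yield items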
I've implemented a news website scraper that uses the Selenium web driver to access dynamic web pages and BeautifulSoup to retrieve the content. While parsing the websites, I also write the scraped data to MongoDB storage and download pictures. I want to implement full news search by a given category or by text that appears in the news content. What suggestions are there in terms of parallelization/adding async code to speed up the performance?
# -*- coding: utf-8 -*-
import os
import json
import requests
from bs4 import BeautifulSoup
from mongo_setup import Database
import gridfs
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time
import logging
import re
import pymongo
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
DRIVER_BIN = os.path.join(PROJECT_ROOT, "bin/chromedriver")
class Scraper:
tsn_resource = 'https://tsn.ua/'
ukrnet_resource = 'https://www.ukr.net/'
db_name = 'scraper_db'
category_coll = 'categories'
articles_coll = 'articles'
def __init__(self, limit=10):
self.limit = limit # max number of articles per category
self.db = Database(self.db_name).connect_db()
self.category_coll = self.init_collection(self.category_coll)
self.articles_coll = self.init_collection(self.articles_coll)
self.logger = self.init_logger()
self.driver = webdriver.Chrome(executable_path = DRIVER_BIN)
self.image_storage = os.path.join(PROJECT_ROOT, "image_storage/")
def init_logger(self):
'''
Initialize log file.
'''
logger = logging.getLogger('scraper_app')
logger.setLevel(logging.INFO)
# create a file handler
handler = logging.FileHandler('scraper_logfile.log')
handler.setLevel(logging.INFO)
# create a logging format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(handler)
return logger
def init_collection(self, name):
if name in self.db.collection_names():
self.db[name].drop()
return self.db[name]
def insert_one_to_collection(self, data, collection):
try:
collection.insert_one(data)
except pymongo.errors.DuplicateKeyError:
pass
def insert_many_to_collection(self, data, collection):
try:
collection.insert_many(data)
except pymongo.errors.DuplicateKeyError:
pass
def download_image(self, image_url):
'''
download images from news articles
to local storage
'''
if not image_url.startswith(("data:image", "javascript")):
local_filename = image_url.split('/')[-1].split("?")[0]
r = requests.get(image_url, stream=True, verify=False)
with open(self.image_storage + local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
f.write(chunk)
def upload_image_to_mongo(self, image_url):
response = requests.get(image_url, stream=True)
fs = gridfs.GridFS(self.db)
img = response.raw.read()
local_filename = image_url.split('/')[-1].split("?")[0]
fs.put(img, filename=local_filename)
def get_page_content(self, url):
try:
self.driver.get(url)
except WebDriverException:
self.driver = webdriver.Chrome(executable_path = DRIVER_BIN)
page = self.driver.page_source
return page
def parse_page_content(self, url, parser_lib):
page_obj = self.get_page_content(url)
soup = BeautifulSoup(page_obj, parser_lib)
return soup
def tsn_categories(self):
categories = self.gather_categories(self.tsn_resource, 'ul.c-app-nav-more-list li a')
return categories
def ukrnet_categories(self):
categories = self.gather_categories(self.ukrnet_resource, 'h2.feed__section--title a')
return categories
def gather_categories(self, url, selector):
categories = []
soup = self.parse_page_content(url, "html.parser")
all_categories = soup.select(selector)
for item in all_categories:
category = {}
link = str(item.attrs.get('href'))
if link.startswith('javascript'):
continue
if not link.startswith('https:'):
link = 'https:' + link
category['link'] = link
category['name'] = item.get_text().strip()
categories.append(category)
self.insert_many_to_collection(categories, self.category_coll)
return categories
def search_by_category(self, category_name):
category_name = category_name.decode('utf-8')
category_list = []
category_list += self.tsn_categories()
category_list += self.ukrnet_categories()
category_obj = next(item for item in category_list if item['name'] == category_name)
link = category_obj['link']
if 'ukr.net' in link:
articles = self.get_ukrnet_articles(category_name, link)
else:
articles = self.get_tsn_articles(category_name, link)
return articles
def get_ukrnet_articles(self, category_name, url):
'''
retrieve all articles from ukr.net by given category link
'''
count = 0
result = []
soup = self.parse_page_content(url, "html.parser")
all_articles = soup.select('div.im-tl a')
for item in all_articles:
if count <= self.limit:
article = {}
link = item.attrs.get('href')
article['link'] = link
article['category'] = category_name
article['content'] = item.contents[0].encode('utf-8')
result.append(article)
self.insert_one_to_collection(article, self.articles_coll)
else:
break
count += 1
return result
def get_tsn_articles(self, category_name, url):
'''
retrieve all articles from tsn.ua by given category link
'''
count = 0
result = []
data = [] # temporary storage
# first parse through the list of articles
soup = self.parse_page_content(url, "html.parser")
all_articles = soup.select('div.c-entry-embed a.c-post-img-wrap')
for item in all_articles:
# iterate limit amount of articles
if count <= self.limit:
article = {}
link = item.attrs.get('href')
img_src = item.find('img').get('src')
if link.endswith(".html"):
article['link'] = link
if img_src is not None:
article['img_src'] = img_src
self.download_image(img_src)
article['category'] = category_name
data.append(article)
count += 1
else:
break
# then iterate over each article
for article in data:
new_soup = self.parse_page_content(article['link'], "html5lib")
news_content = new_soup.select('div.e-content p')
text_content = [] # article content
for chunk in news_content:
text_content.append(chunk.get_text().strip(''))
article_text = ' '.join(text_content)
news_header = new_soup.select('div.c-post-meta h1') # article title
if news_header:
header_text = "".join(news_header[0].contents)
article_image = new_soup.find('figure', class_='js-lightgallery')
if article_image:
img_src = article_image.find('img').get('src') # articles image
self.download_image(img_src)
news_chunk = {}
news_chunk['category'] = article['category']
news_chunk['link'] = article['link']
news_chunk['title'] = header_text
# news_chunk['title'] = ''
news_chunk['content'] = article_text
news_chunk['images'] = []
if 'img_src' in article:
news_chunk['images'].append(article['img_src']) # caption image
if article_image:
news_chunk['images'].append(img_src) # article image
result.append(news_chunk)
self.insert_one_to_collection(news_chunk, self.articles_coll)
return result
def search_by_text(self, text):
category_links = []
category_links += self.ukrnet_categories()
category_links += self.tsn_categories()
result = self.website_search_by_text(text, category_links)
return result
def website_search_by_text(self, text_searched, category_links):
result = []
text_searched = text_searched.decode('utf-8')
for link in category_links:
article = {}
soup = self.parse_page_content(link['link'], "html.parser")
all_articles = soup.find_all('a', text=re.compile(text_searched))
for item in all_articles:
article['link'] = item.attrs.get('href')
article['category'] = link['name']
article['content'] = (item.contents[0].strip()).encode('utf-8')
self.insert_one_to_collection(article, self.articles_coll)
result.append(article)
return result
def collect_ukrnet_articles(self):
'''
outdated
'''
categories = self.ukrnet_categories()
for category in categories:
count = 0
soup = self.parse_page_content(category['link'], "html.parser")
all_articles = soup.select('div.im-tl a')
for item in all_articles:
# only 10 first articles
if count < self.limit:
article = {}
link = item.attrs.get('href')
article['link'] = link
article['category'] = category['name']
article['content'] = item.contents[0].encode('utf-8')
self.insert_one_to_collection(article, self.articles_coll)
else:
break
count += 1
def run(self):
self.search_by_category('Economics')
self.search_by_text('Economics')
self.driver.quit()
if __name__ == '__main__':
scraper = Scraper()
scraper.run()
Scrapy is a solid Python framework that automatically does things async/parallel.
There's also multiprocessing, conveniently packaged in the standard library.
And then there's multithreading, also conveniently packaged (for example via multiprocessing.dummy or concurrent.futures).
With the threading approach there's a way to call the function you're trying to run in parallel with map() and pass it the list of inputs: pool.map(your_func, your_list).
I don't remember the exact link or structure for it, but it's a quick Google search away. It really makes things easier.
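For example, with the standard library's thread pool (concurrent.futures; multiprocessing.dummy.Pool offers the same map interface), fetching several pages in parallel might look like the sketch below. The fetch_page helper is hypothetical; in the Scraper class above the equivalent would be whatever wraps get_page_content/parse_page_content:
from concurrent.futures import ThreadPoolExecutor
import requests
def fetch_page(url):
    # network-bound work: threads help here because the interpreter releases the GIL while waiting on I/O
    return requests.get(url, timeout=10).text
urls = ['https://tsn.ua/', 'https://www.ukr.net/']
with ThreadPoolExecutor(max_workers=8) as pool:
    # map() applies fetch_page to every URL, running up to 8 downloads at a time
    pages = list(pool.map(fetch_page, urls))
print([len(page) for page in pages])
Note that a single shared Selenium driver is not safe to use from multiple threads, so anything parallelized this way should either stick to plain requests (as here) or give each worker its own driver.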
I have created a simple Scrapy project in which I get the total page count from the initial page, example.com/full. Now I need to scrape all the pages, starting from example.com/page-2 up to that total (e.g. up to 100 if the total page count is 100). How can I do that?
Any advice would be helpful.
Code:
import scrapy
class AllSpider(scrapy.Spider):
name = 'all'
allowed_domains = ['example.com']
start_urls = ['https://example.com/full/']
total_pages = 0
def parse(self, response):
total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
#urls = ('https://example.com/page-{}'.format(i) for i in range(1,total_pages))
print(total_pages)
Update #1:
I tried using urls = ('https://example.com/page-{}'.format(i) for i in range(1, total_pages)), but it's not working; maybe I'm doing something wrong.
Update #2:
I have changed my code like this one
class AllSpider(scrapy.Spider):
name = 'all'
allowed_domains = ['sanet.st']
start_urls = ['https://sanet.st/full/']
total_pages = 0
def parse(self, response):
total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
for page in range(2, int(total_pages)):
url = 'https://sanet.st/page-'+str(page)
yield scrapy.Request(url)
title = response.xpath('//*[#class="list_item_title"]/h2/a/span/text()').extract()
print(title)
But the loop still shows only the first page's title repeatedly.
I need to extract the title from the different pages and print it in the prompt.
How can I do that?
You must search for the 'next_page' link and keep following it while it is present on the page.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class SanetSpider(scrapy.Spider):
name = 'sanet'
allowed_domains = ['sanet.st']
start_urls = ['https://sanet.st/full/']
def parse(self, response):
yield {
# Do something.
'result': response.xpath('//h3[#class="posts-results"]/text()').extract_first()
}
# next_page = /page-{}/ where {} number of page.
next_page = response.xpath('//a[#data-tip="Next page"]/#href').extract_first()
# next_page = https://sanet.st/page-{}/ where {} number of page.
next_page = response.urljoin(next_page)
# If next_page have value
if next_page:
# Recall parse with url https://sanet.st/page-{}/ where {} number of page.
yield scrapy.Request(url=next_page, callback=self.parse)
If you run this code with the -o sanet.json option, you will get the following result.
scrapy runspider sanet.py -o sanet.json
[
{"result": "results 1 - 15 from 651"},
{"result": "results 16 - 30 from 651"},
{"result": "results 31 - 45 from 651"},
...
etc.
...
{"result": "results 631 - 645 from 651"},
{"result": "results 646 - 651 from 651"}
]
from scrapy.http import Request
def parse(self, response):
total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
urls = ('https://example.com/page-{}'.format(i) for i in range(1, int(total_pages)))
for url in urls:
yield Request(url, callback=self.parse_page)
def parse_page(self, response):
# do the stuff
An alternative way, as shown in the tutorial, is to use yield response.follow(url, callback=self.parse_page); it supports relative URLs directly.
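For reference, a minimal sketch of that variant, reusing the XPath from the question and starting from page 2 as the question asks:
def parse(self, response):
    total_pages = int(response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first())
    for i in range(2, total_pages + 1):
        # response.follow resolves the relative URL against the current page
        yield response.follow('/page-{}'.format(i), callback=self.parse_page)
def parse_page(self, response):
    # do the stuff
    pass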