Scrapy not going to the second page to extract data - python-3.x

The spider saves data from the first page but never moves on to the second page, and no errors are shown.
import scrapy
from ..items import QoutetutorialItem


class QouteSpider(scrapy.Spider):
    name = 'qoute'
    page_num = 2
    allowed_domains = ['http://quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/page/1/']

    def parse(self, response):
        all_div_quote = response.css("div.quote")
        items = QoutetutorialItem()
        for x in all_div_quote:
            title = x.css("span.text::text").extract()
            author = x.css(".author::text").extract()
            tag = x.css(".tag::text").extract()
            items['title'] = title
            items['author'] = author
            items['tag'] = tag
            yield items

        next_page = 'http://quotes.toscrape.com/page/' + str(QouteSpider.page_num) + '/'
        # if next_page is not None:
        if QouteSpider.page_num < 11:
            QouteSpider.page_num += 1
            yield response.follow(next_page, callback=self.parse)

Simply do this: first fetch the next-page URL from the page source, since it is present there, and then make a request to it. This is how it will look:
next_page = response.css('.next ::attr(href)').get()
if next_page:
    yield response.follow(next_page, callback=self.parse)
This will solve your issue, and you no longer need to calculate the next-page URL yourself.
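For context, here is a minimal sketch of the whole spider with that change applied. It reuses the QoutetutorialItem and selectors from the question, and is untested; note also that, per the Scrapy docs, allowed_domains should contain bare domains rather than full URLs.

import scrapy
from ..items import QoutetutorialItem


class QouteSpider(scrapy.Spider):
    name = 'qoute'
    allowed_domains = ['quotes.toscrape.com']  # domain only, no scheme
    start_urls = ['http://quotes.toscrape.com/page/1/']

    def parse(self, response):
        for quote in response.css("div.quote"):
            items = QoutetutorialItem()
            items['title'] = quote.css("span.text::text").extract()
            items['author'] = quote.css(".author::text").extract()
            items['tag'] = quote.css(".tag::text").extract()
            yield items

        # Follow the "Next" link taken from the page itself.
        next_page = response.css('.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)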

Related

Runtime Request URL change not working scrapy

I have written a script in Python using Scrapy. The code is meant to fetch all the pages that exist. It works fine on the first page load when Scrapy is started and, as per the script logic, gets us to page 2. But after loading page 2 I am unable to get the XPath of the newly loaded page, so I cannot move ahead this way and get all the page numbers.
Sharing the code snippet.
import scrapy
from scrapy import Spider


class PostsSpider(Spider):
    name = "posts"
    start_urls = [
        'https://www.boston.com/category/news/'
    ]

    def parse(self, response):
        print("first time")
        print(response)
        results = response.xpath("//*[contains(@id, 'load-more')]/@data-next-page").extract_first()
        print(results)
        if results is not None:
            for result in results:
                page_number = 'page/' + result
                new_url = self.start_urls[0] + page_number
                print(new_url)
                yield scrapy.Request(url=new_url, callback=self.parse)
        else:
            print("last page")
It is because the page doesn't create new GET requests when it loads the next page; it makes an AJAX call to an API that returns JSON.
I made some adjustments to your code so it should work properly now. I am assuming that there is something other than the next page number you are trying to extract from each page, so I wrapped the HTML string in a scrapy Selector so you can use XPath and the like on it. This script will crawl a lot of pages very fast, so you might want to adjust your settings to slow it down too (see the settings sketch after the code).
import scrapy
from scrapy import Spider
from scrapy.selector import Selector


class PostsSpider(Spider):
    name = "posts"
    ajaxurl = "https://www.boston.com/wp-json/boston/v1/load-more?taxonomy=category&term_id=779&search_query=&author=&orderby=&page=%s&_wpnonce=f43ab1aae4&ad_count=4&redundant_ids=25129871,25130264,25129873,25129799,25128140,25126233,25122755,25121853,25124456,25129584,25128656,25123311,25128423,25128100,25127934,25127250,25126228,25126222"
    start_urls = [
        'https://www.boston.com/category/news/'
    ]

    def parse(self, response):
        new_url = None
        try:
            # Subsequent responses come from the JSON API.
            json_result = response.json()
            html = json_result['data']['html']
            selector = Selector(text=html, type="html")
            # ... do some xpath stuff with selector.xpath .....
            new_url = self.ajaxurl % json_result["data"]["nextPage"]
        except:
            # The first response is the regular HTML page.
            results = response.xpath("//*[contains(@id, 'load-more')]/@data-next-page").extract_first()
            if results is not None:
                for result in results:
                    new_url = self.ajaxurl % result
        if new_url:
            print(new_url)
            yield scrapy.Request(url=new_url, callback=self.parse)
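On the throttling point: one hedged way to slow the crawl down is per-spider custom_settings. The setting names below are standard Scrapy settings, but the values are only illustrative.

from scrapy import Spider


class PostsSpider(Spider):
    name = "posts"
    # Illustrative throttling values; tune them for the target site.
    custom_settings = {
        'DOWNLOAD_DELAY': 1.0,          # minimum delay (seconds) between requests
        'CONCURRENT_REQUESTS': 4,       # cap on parallel requests
        'AUTOTHROTTLE_ENABLED': True,   # let Scrapy adapt the delay automatically
    }
    # ... ajaxurl, start_urls and parse() as in the code above ...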

Unable to get the pagination crawler to work Python3

I'm trying to use the Scrapy module in Python to scrape the details, but I am currently stuck trying to get the pagination crawler to work. The output is partially right but, as I said, it is not scraping the following pages of the website.
import scrapy
from time import sleep
from ..items import SunwayscrapyItem


class SunwaySpider(scrapy.Spider):
    name = "sunway"
    page_number = 20
    allowed_domains = ['https://www.sunwaymedical.com/find-a-doctor/']
    start_urls = [
        'https://www.sunwaymedical.com/find-a-doctor/search/0/?specialty=&doctor=&name='
    ]

    def parse(self, response):
        # all_details = response.css('.col-lg-9')
        # for details in all_details:
        for SunwaySpider.page_number in range(0, 220, 20):
            items = SunwayscrapyItem()
            next_page = "https://www.sunwaymedical.com/find-a-doctor/search/" + str(
                SunwaySpider.page_number) + "/?specialty=&doctor=&name="
            if SunwaySpider.page_number < 220:
                name = response.css('.doctor_name a::text').extract()
                specialty = response.css('.doc_label3:nth-child(4)::text').extract()
                language = response.css('.doc_label3:nth-child(8)::text').extract()
                gender = response.css('.doc_label3:nth-child(12)::text').extract()
                qualifications = response.css('.doc_label3:nth-child(16)::text').extract()
                location = response.css('.doc_label3:nth-child(20)::text').extract()
                contact = response.css('.doc_label3 a::text').extract()
                items['Name'] = name
                items['Specialty'] = list(map(str.strip, specialty))
                items['Languages'] = list(map(str.strip, language))
                items['Gender'] = list(map(str.strip, gender))
                items['Qualifications'] = list(map(str.strip, qualifications))
                items['Location'] = list(map(str.strip, location))
                items['Contact'] = list(map(str.strip, contact))
                yield items
                sleep(3)
                yield response.follow(next_page, callback=self.parse)
You are not structuring the pagination properly. It is not advisable to implement pagination and the yielding of items in a single method. Take a look at the sample code below:
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from w3lib.url import add_or_replace_parameter


class AnswersMicrosoft(CrawlSpider):
    name = 'answersmicrosoft'
    allowed_domains = ['answers.microsoft.com']
    start_urls = ['https://answers.microsoft.com/en-us']

    listings_css = ['#categoryListGridMed', '.nav-links']
    products_css = ['#threads .c-card .thread-title']

    rules = (
        Rule(LinkExtractor(restrict_css=products_css), callback='parse_item'),
        Rule(LinkExtractor(restrict_css=listings_css), callback='parse_pagination'),
    )

    def parse_pagination(self, response):
        forum_id_css = '#currentForumId::attr(value)'
        forum_id = response.css(forum_id_css).get()
        url = 'https://answers.microsoft.com/en-us/forum/forumthreadlist?forumId=' + forum_id
        yield Request(url, callback=self.get_max_page, meta={'url': response.url})

    def get_max_page(self, response):
        max_page_css = '.currentMaxPage::attr(value)'
        max_page = int(response.css(max_page_css).get())
        url = response.url
        for page in range(max_page):
            updated_url = add_or_replace_parameter(url, 'page', page)
            yield Request(updated_url, callback=self.parse)

    def parse_item(self, response):
        article = AnswersMicrosoftItem()
        article["title"] = self.get_title(response).strip()
        article["url"] = response.url
        article["votes"] = self.get_votes(response)
        article["replies"] = self.get_replies(response)
        article["category"] = self.get_category(response)
        article["views"] = self.get_views(response)
        article["date"] = self.get_date(response).strip()
        article["last_updated"] = self.get_last_updated(response).strip()
        yield article
See how parse_pagination is implemented and how the rules are set up to call it. If you are a novice and don't know much about rules, I suggest you give them a look; they will help you a lot on the journey ahead. Also, try to follow a modular approach.
The rules above do only two things: if they see a product, they call parse_item, and if they see the next page, they call parse_pagination.
I hope you understand my point. Best of luck!
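Applied to the site from the question, a rough sketch of the same separation might look like the following. The offset-based URL pattern, the 0-220 range, the item class and the selectors are all taken from the question and are untested assumptions here.

import scrapy
from ..items import SunwayscrapyItem


class SunwaySpider(scrapy.Spider):
    name = "sunway"
    allowed_domains = ['sunwaymedical.com']  # bare domain, so follow-up requests are not filtered

    def start_requests(self):
        # Generate every results page up front instead of mixing pagination
        # into the item-parsing callback.
        base = "https://www.sunwaymedical.com/find-a-doctor/search/{}/?specialty=&doctor=&name="
        for offset in range(0, 220, 20):
            yield scrapy.Request(base.format(offset), callback=self.parse)

    def parse(self, response):
        # Same page-level selectors as in the question, now with no pagination logic here.
        items = SunwayscrapyItem()
        items['Name'] = response.css('.doctor_name a::text').extract()
        items['Specialty'] = [s.strip() for s in response.css('.doc_label3:nth-child(4)::text').extract()]
        items['Contact'] = [c.strip() for c in response.css('.doc_label3 a::text').extract()]
        yield items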

Scrapy with multiple pages

I have created a simple Scrapy project in which I get the total page count from the initial site example.com/full. Now I need to scrape all the pages, starting from example.com/page-2 up to 100 (if the total page count is 100). How can I do that?
Any advice would be helpful.
Code:
import scrapy


class AllSpider(scrapy.Spider):
    name = 'all'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/full/']
    total_pages = 0

    def parse(self, response):
        total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
        # urls = ('https://example.com/page-{}'.format(i) for i in range(1, total_pages))
        print(total_pages)
Update #1:
I tried using urls = ('https://example.com/page-{}'.format(i) for i in range(1, total_pages)), but it's not working; maybe I'm doing something wrong.
Update #2:
I have changed my code to this:
class AllSpider(scrapy.Spider):
    name = 'all'
    allowed_domains = ['sanet.st']
    start_urls = ['https://sanet.st/full/']
    total_pages = 0

    def parse(self, response):
        total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
        for page in range(2, int(total_pages)):
            url = 'https://sanet.st/page-' + str(page)
            yield scrapy.Request(url)
            title = response.xpath('//*[@class="list_item_title"]/h2/a/span/text()').extract()
            print(title)
But the loop still prints only the first page's titles repeatedly.
I need to extract the titles from the different pages and print them in the prompt.
How can I do that?
You must look for the 'next_page' link and keep looping while it is present on the page.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class SanetSpider(scrapy.Spider):
    name = 'sanet'
    allowed_domains = ['sanet.st']
    start_urls = ['https://sanet.st/full/']

    def parse(self, response):
        yield {
            # Do something.
            'result': response.xpath('//h3[@class="posts-results"]/text()').extract_first()
        }
        # next_page = /page-{}/ where {} is the page number.
        next_page = response.xpath('//a[@data-tip="Next page"]/@href').extract_first()
        # next_page = https://sanet.st/page-{}/ where {} is the page number.
        next_page = response.urljoin(next_page)
        # If next_page has a value
        if next_page:
            # Recall parse with the url https://sanet.st/page-{}/ where {} is the page number.
            yield scrapy.Request(url=next_page, callback=self.parse)
If you run this code with the -o sanet.json option, you will get the following result.
scrapy runspider sanet.py -o sanet.json
[
{"result": "results 1 - 15 from 651"},
{"result": "results 16 - 30 from 651"},
{"result": "results 31 - 45 from 651"},
...
etc.
...
{"result": "results 631 - 645 from 651"},
{"result": "results 646 - 651 from 651"}
]
from scrapy.http import Request


def parse(self, response):
    # Cast the extracted page count to int so range() accepts it.
    total_pages = int(response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first())
    urls = ('https://example.com/page-{}'.format(i) for i in range(1, total_pages))
    for url in urls:
        yield Request(url, callback=self.parse_page)


def parse_page(self, response):
    # do the stuff
    pass
An alternative, as shown in the tutorial, is to use yield response.follow(url, callback=self.parse_page), which supports relative URLs directly.
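For example, a minimal sketch of that variant, assuming (as in the question) that the page-N paths hang off the site root:

def parse(self, response):
    total_pages = int(response.xpath(
        "//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first())
    for i in range(2, total_pages + 1):
        # response.follow resolves the relative URL against the current page.
        yield response.follow('/page-{}'.format(i), callback=self.parse_page)


def parse_page(self, response):
    # do the stuff
    pass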

Getting info from the links that were scraped

I am trying to get the links of the books first, then follow each link and grab the title of the book. In the end, I want to store the titles in one column and the links in another column of a CSV file. This is what I have written so far; I only get the links, not the titles.
import scrapy


class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            yield {'Book Urls': link}
            yield scrapy.Request(link, callback=self.book_title)

    def book_title(self, response):
        title = response.xpath('//*[@id="productTitle"]/text()').extract_first()
        yield {'Title': title}
I solved it with response.meta.
import scrapy


class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            title = response.meta.get('title')
            yield scrapy.Request(link, callback=self.book_title, meta={'title': title, 'Link': link})

    def book_title(self, response):
        title = response.xpath('//*[@id="productTitle"]/text()').extract()
        response.meta['title'] = title
        yield response.meta
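As a side note, a hedged alternative for newer Scrapy versions (1.7+) is cb_kwargs, which passes your own values to the callback without mixing them into the crawl metadata kept in response.meta. A sketch of the same idea:

import scrapy


class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            # Pass the link to the callback as a keyword argument.
            yield scrapy.Request(link, callback=self.book_title, cb_kwargs={'link': link})

    def book_title(self, response, link):
        title = response.xpath('//*[@id="productTitle"]/text()').extract_first()
        # One row per book: title in one column, link in the other.
        yield {'Title': title, 'Book Url': link}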

How to use Scrapy to crawl data on the second level of a Page

I want to use a Scrapy spider to get data (question title + content & answer) from all the posts of the following website:
https://forums.att.com/t5/custom/page/page-id/latest-activity/category-id/Customer_Care/page/1?page-type=latest-solutions-topics
The problem is that I just don't know how to make it first follow the link of a post and then crawl the data of all 15 posts per page.
import scrapy


class ArticleSpider(scrapy.Spider):
    name = "post"
    start_urls = ['https://forums.att.com/t5/Data-Messaging-Features-Internet/Throttling-for-unlimited-data/m-p/4805201#M73235']

    def parse(self, response):
        SET_SELECTOR = 'body'
        for post in response.css(SET_SELECTOR):
            # Selector for title, content and answer
            TITLE_SELECTOR = '.lia-message-subject h5 ::text'
            CONTENT_SELECTOR = '.lia-message-body-content'
            ANSWER_SELECTOR = '.lia-message-body-content'
            yield {
                # [0].extract() = extract_first()
                'Qtitle': post.css(TITLE_SELECTOR)[0].extract(),
                'Qcontent': post.css(CONTENT_SELECTOR)[0].extract(),
                'Answer': post.css(ANSWER_SELECTOR)[1].extract(),
            }

        # Running through all 173 pages
        NEXT_PAGE_SELECTOR = '.lia-paging-page-next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
I hope you can help me out. Thanks in advance!
You need to add a method for scraping the post content. You can rewrite your spider code like this (I use XPath selectors):
# -*- coding: utf-8 -*-
import scrapy


class ArticleSpider(scrapy.Spider):
    name = "post"
    start_urls = ['https://forums.att.com/t5/custom/page/page-id/latest-activity/category-id/Customer_Care/page/1?page-type=latest-solutions-topics']

    def parse(self, response):
        for post_link in response.xpath('//h2//a/@href').extract():
            link = response.urljoin(post_link)
            yield scrapy.Request(link, callback=self.parse_post)

        # Check if the main page has a link to the next page; if so, keep parsing.
        next_page = response.xpath('(//a[@rel="next"])[1]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_post(self, response):
        # Scrape the title and content from the post.
        for post in response.xpath('//div[contains(@class, "lia-quilt-forum-message")]'):
            item = dict()
            item['title'] = post.xpath('.//h5/text()').extract_first()
            item['content'] = post.xpath('.//div[@class="lia-message-body-content"]//text()').extract()
            yield item

        # If the post page has a link to the next page, keep parsing.
        next_page = response.xpath('(//a[@rel="next"])[1]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse_post)
This code parses all the links from the main page and calls the parse_post method to scrape each post's content. Both the parse and parse_post methods check whether there is a next-page link and, if so, continue scraping.
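If it helps, the spider can then be run and its items exported in the same way as in the earlier example; the filename post_spider.py here is just a placeholder for wherever you saved the code.

scrapy runspider post_spider.py -o posts.json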
