Scrapy not crawling all links recursively - python-3.x

I need all internal links from all pages of the website for analysis. I have searched and found a lot of similar questions.
I found this code by Mithu, which gives the closest possible answer. However, it does not provide all possible links from the second level of page depth.
The generated CSV file has only 676 records; however, the website has 1000 records.
Working Codes
import csv

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from eylinks.items import LinkscrawlItem

# newline='' is done to avoid line gaps in the generated csv file.
outfile = open("data.csv", "w", newline='')
writer = csv.writer(outfile)


class ToscrapeSpider(scrapy.Spider):
    """Collect product name, price, URL and status from books.toscrape.com.

    ``parse`` extracts the links found on the start page and requests each
    of them with ``collect_data`` as the callback; ``collect_data`` yields
    one item and writes one CSV row per ``.product_pod`` element.
    """

    name = "toscrapesp"
    start_urls = ["http://books.toscrape.com/"]
    # NOTE(review): ``rules`` is a CrawlSpider feature, while this class
    # derives from scrapy.Spider, so this rule is presumably never applied
    # -- confirm against the Scrapy documentation.
    rules = ([Rule(LinkExtractor(allow=r".*"), callback='parse', follow=True)])

    def parse(self, response):
        """Request every toscrape.com link found in *response*."""
        extractor = LinkExtractor(allow_domains='toscrape.com')
        links = extractor.extract_links(response)
        for link in links:
            # Responses to these requests go to collect_data, not back to
            # parse, so links on those pages are not extracted again here.
            yield scrapy.Request(link.url, callback=self.collect_data)

    def collect_data(self, response):
        """Yield one record per product on the page and mirror it to the CSV."""
        global writer
        for item in response.css('.product_pod'):
            product = item.css('h3 a::text').extract_first()
            value = item.css('.price_color::text').extract_first()
            lnk = response.url
            stats = response.status
            print(lnk)
            yield {'Name': product, 'Price': value, "URL": lnk, "Status": stats}
            writer.writerow([product, value, lnk, stats])

To extract the links, try this:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import csv
# newline='' lets the csv module control line endings itself, which avoids
# extra blank lines between rows on platforms that write \r\n.
outfile = open("data.csv", "w", newline='')
writer = csv.writer(outfile)


class BooksScrapySpider(scrapy.Spider):
    """Walk the paginated book listing and record details of every book."""

    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Request each book linked from a listing page, then the next page."""
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

        next_page_url = response.xpath(
            "//a[text()='next']/@href").extract_first()
        # extract_first() returns None when no "next" link is found; skip the
        # follow-up request in that case instead of re-requesting this page.
        if next_page_url:
            absolute_next_page = response.urljoin(next_page_url)
            # No callback given, so the response is handled by parse() again.
            yield Request(absolute_next_page)

    def parse_book(self, response):
        """Yield title, price, URL and status of one book and write a CSV row."""
        title = response.css("h1::text").extract_first()
        price = response.xpath(
            "//*[@class='price_color']/text()").extract_first()
        url = response.request.url
        yield {'title': title,
               'price': price,
               'url': url,
               'status': response.status}
        writer.writerow([title, price, url, response.status])

Related

How to use css selector in object from HtmlResponse

I'm currently developing an application using Scrapy.
I want to get some values using a CSS selector outside of def parse, so I created an HtmlResponse object first and tried to get the values using css(), but I can't get any value...
Within def parse, I can get the value in the same way.
What should I do if it is outside of def parse?
Here is the code:
import scrapy
from scrapy.http import HtmlResponse
class SampleSpider(scrapy.Spider):
    """Compare a hand-built ``HtmlResponse`` with the one passed to ``parse``."""

    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    # This response is constructed from a URL only -- no body is supplied --
    # and these statements run when the class body is executed.
    my_response = HtmlResponse(url=start_urls[0])
    print('HtmlResponse')
    print(my_response)
    h3s = my_response.css('h3')
    print(str(len(h3s)))
    print('----------')

    def parse(self, response, **kwargs):
        """Print the response and the number of ``h3`` elements it contains."""
        print('def parse')
        print(response)
        h3s = response.css('h3')
        print(str(len(h3s)))
Console display:
HtmlResponse
<200 https://sample.com/search>
0 # <- I want to show '3' here
----------
def parse
<200 https://sample.com/search>
3
update
The program I want to finally create is the following code:
[ (Note) The code below does not work for reference ]
import scrapy
from scrapy.http import HtmlResponse
class SampleSpider(scrapy.Spider):
    """Sketch of the intended spider: seed ``start_urls`` from category links."""

    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = []
    response_url = 'https://sample.com/search'

    # Built from a URL only (no body is supplied), at class-definition time;
    # the category hrefs selected from it are appended to start_urls.
    my_response = HtmlResponse(url=response_url)
    categories = my_response.css('.categories a::attr(href)').getall()
    for category in categories:
        start_urls.append(category)

    def parse(self, response, **kwargs):
        """Print the text of every ``h3`` element in the response."""
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
Python 3.8.5
Scrapy 2.5.0
I know what you mean: your start URL is the base domain, but you also want to fetch all the category pages to extract their h3 elements.
In Scrapy you can extract data and follow new links in the same parse method; here is an example.
import scrapy
class SampleSpider(scrapy.Spider):
    """Extract ``h3`` texts and follow category links from the same callback."""

    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    def parse(self, response, **kwargs):
        """Yield one item per ``h3`` element, then a request per category link."""
        print('def parse')
        print(response)
        # extract data here
        for page in response.css('h3'):
            text = page.css('::text').get()
            print(text)
            # A callback should yield items (e.g. dicts) or requests, not
            # bare strings, so the text is wrapped in a dict.
            yield {'text': text}
        # follow new links here
        for category in response.css('.categories a::attr(href)').getall():
            # response.follow resolves relative hrefs against the current
            # page before building the request.
            yield response.follow(category, callback=self.parse)
You can read the Scrapy documentation for more information.

How do I scrape multiple table rows from this url using scrapy?

import scrapy
class SsoSpider(scrapy.Spider):
    """Scrape act titles and links from the SSO "current acts" browse page."""

    name = 'sso'
    allowed_domains = ['www.sso.agc.gov.sg']
    start_urls = ['https://sso.agc.gov.sg/Browse/Act/Current']

    def parse(self, response):
        """Yield one item per selected ``tbody`` element."""
        acts = response.xpath("//table[@class='table browse-list']/tbody")
        # The loop runs once per tbody, and .get() returns only the first
        # match inside it, so each tbody contributes a single item.
        for act in acts:
            yield {
                #'Act title': act.xpath(".//tr[@class='alternate']/td/a/text()").get(),
                'Act title': act.xpath(".//tr/td/a/text()").get(),
                #'Short-hand code': act.xpath(".//tr[@class='alternate']/td/a/@href").get()
                'Short-hand code': act.xpath(".//tr/td/a/@href").get()
            }
So this is my code above for scraping. After running it, I only get one scraped result.
I think the issue has to do with how the table rows are created: some have a class while others do not.
I'm new to scraping so any help would be appreciated!
Try this I hope it will work.
import scrapy
class SsoSpider(scrapy.Spider):
    """Scrape act titles and links from the SSO "current acts" browse page."""

    name = 'sso'
    allowed_domains = ['www.sso.agc.gov.sg']
    start_urls = ['https://sso.agc.gov.sg/Browse/Act/Current']

    def parse(self, response):
        """Yield one item per table row under the ``listPanel`` element."""
        # Selecting the <tr> elements (rather than their parent) makes the
        # loop run once per row.
        acts = response.xpath("//*[@id='listPanel']/table/tbody/tr")
        for act in acts:
            yield {
                'Act title': act.xpath("td[1]/a/text()").extract_first(),
                'Short-hand code': act.xpath("td/a/@href").extract_first()
            }
Let me know.

Extract articles from its corresponding links from a webpage using scrapy

Hi, I am new to Scrapy and I am trying to extract text from the links in a given webpage. Here is the code I wrote for this; after running scrapy crawl article, it reports "no module named article". Can you help me find where I went wrong? Thanks in advance.
import scrapy
from urllib.parse import urljoin
class ArticleSpider(scrapy.Spider):
    """Follow article links from the business page and yield article text."""

    name = 'article'
    allowed_domains = ['www.timesofindia.indiatimes.com/business']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        """Request every link selected by ``span.w_tle a`` on the page."""
        links = response.css('span.w_tle a::attr(href)').extract()
        for link in links:
            # hrefs may be relative, so resolve them against the page URL.
            url = urljoin(response.url, link)
            yield scrapy.Request(url, callback=self.parse_article)

    def parse_article(self, response):
        """Yield the ``div.Normal`` text of each matched article container."""
        for info in response.css('div.article_content clearfix'):
            yield {'Article': info.css('div.Normal::text').extract()}
If you take a look at your log you'll see 'offsite/filtered': 211, and that is the cause of not getting anything. To avoid this, you can do one of two things:
Remove allowed_domains field
Add dont_filter=True in your request like:
yield scrapy.Request(url,callback=self.parse_article, dont_filter=True)
I tested your code, and it does not seem to work properly if you want to get the text body, so I rewrote it with XPath, which I am more comfortable with.
import scrapy
from urllib.parse import urljoin
class ArticleSpider(scrapy.Spider):
    """Follow article links from the business page and yield article text."""

    name = 'article'
    allowed_domains = ['www.timesofindia.indiatimes.com']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        """Request every article link found in the listing widget."""
        links = response.xpath(
            '//*[@id="c_listing_wdt_1"]//span[1]/a/@href').getall()
        for link in links:
            url = urljoin(response.url, link)
            # dont_filter=True keeps these requests from being dropped by
            # request filtering (see the 'offsite/filtered' log entry).
            yield scrapy.Request(
                url, callback=self.parse_article, dont_filter=True)

    def parse_article(self, response):
        """Print all article text nodes, then yield one item per node."""
        # Evaluate the XPath once and reuse the result for both the debug
        # print and the item loop.
        texts = response.xpath(
            '//*[@id="content"]//arttextxml//div//text()').getall()
        print(texts)
        for info in texts:
            yield {'Article': info}
getall() can be used instead of extract(), they are almost equal.

Getting info from the links that were scraped

I am trying to get the links of the books first, and then go into each link and grab the title of the book. At the end, I want to store the titles in one column and the links in another column of a CSV file. This is the code I wrote. I only get links, not titles.
import scrapy
class AmazonSpiderSpider(scrapy.Spider):
    """Collect book links from a listing page and titles from each book page."""

    name = 'amazon_spider'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        """Yield each book URL as an item and request the book page."""
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            # The link and the title are emitted as two separate items, one
            # here and one from book_title.
            yield {'Book Urls': link}
            yield scrapy.Request(link, callback=self.book_title)

    def book_title(self, response):
        """Yield the first text node of the ``productTitle`` element."""
        title = response.xpath('//*[@id="productTitle"]/text()').extract_first()
        yield {'Title': title}
I solved it with response.meta.
import scrapy
class AmazonSpiderSpider(scrapy.Spider):
    """Collect each book's link and title together in a single item."""

    name = 'amazon_spider'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        """Request every book page, carrying its link along in ``meta``."""
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            # The title is not known yet; only the link needs to travel with
            # the request so book_title can pair it with the title.
            yield scrapy.Request(
                link, callback=self.book_title, meta={'Link': link})

    def book_title(self, response):
        """Yield the title text nodes together with the originating link."""
        title = response.xpath('//*[@id="productTitle"]/text()').extract()
        # Build the item explicitly instead of yielding response.meta, which
        # can also hold keys added by Scrapy itself.
        yield {'title': title, 'Link': response.meta['Link']}

Count word on the page

How can I scrape any site, search for a given word, and display how many times it occurs?
class LinkedinScraper(scrapy.Spider):
    """Download the start page(s) and save each response body to an HTML file."""

    name = "linked"

    def start_requests(self):
        """Yield one request per URL in the hard-coded list."""
        urls = ['https://www.linkedin.com/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the raw response body to ``linkedin-<page>.html``."""
        # Second-to-last "/"-separated piece of the URL.
        page = response.url.split("/")[-2]
        # The format string needs a placeholder for ``page``; without one the
        # ``%`` operator raises TypeError.
        filename = 'linkedin-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
You can use a regex with response.body to find all occurrences anywhere in the page,
ie.
import re
# \b marks a word boundary, so only the standalone word "cat" matches;
# "catalog" and "cattering" are not counted.
word_pattern = re.compile('\\bcat\\b')
r = word_pattern.findall("cat catalog cattering")
print(len(r), 'cat(s)')
Gives "1 cat(s)", not "3 cat(s)"
If you need word only in some tags then you use first response.css(), response.xpath(), etc.
EDIT:
Example which shows how to use
re.findall(pattern, response.text)
but it can find text inside tag too.
It also shows how to use
response.css('body').re(pattern)
It counts 'view', '\\bviews\\b' and '\d+ views' on Stack Overflow and displays the first three matches of each.
You can run it without creating project.
import scrapy
import re
class MySpider(scrapy.Spider):
    """Count regex matches on a page using both ``re`` and Scrapy selectors."""

    name = 'myspider'
    start_urls = ['https://stackoverflow.com/']

    def parse(self, response):
        """Print the match count and first three matches for each pattern."""
        print('url:', response.url)
        # Raw strings keep the backslashes literal, so \b and \d reach the
        # regex engine without relying on invalid string escapes.
        for pattern in ['view', r'\bviews\b', r'\d+ views']:
            print('>>> pattern:', pattern)
            # Search the whole response text, markup included.
            result = re.findall(pattern, response.text)
            print('>>> re:', len(result), result[0:3])
            # Search only within the selected <body> element.
            result = response.css('body').re(pattern)
            print('>>> response.re:', len(result), result[0:3])
# --- run the spider directly with CrawlerProcess; no Scrapy project needed ---
from scrapy.crawler import CrawlerProcess

process_settings = {'USER_AGENT': 'Mozilla/5.0'}
c = CrawlerProcess(process_settings)
c.crawl(MySpider)
c.start()

Resources