How do I scrape multiple table rows from this url using scrapy? - python-3.x

import scrapy


class SsoSpider(scrapy.Spider):
    name = 'sso'
    allowed_domains = ['www.sso.agc.gov.sg']
    start_urls = ['https://sso.agc.gov.sg/Browse/Act/Current']

    def parse(self, response):
        acts = response.xpath("//table[@class='table browse-list']/tbody")
        for act in acts:
            yield {
                # 'Act title': act.xpath(".//tr[@class='alternate']/td/a/text()").get(),
                'Act title': act.xpath(".//tr/td/a/text()").get(),
                # 'Short-hand code': act.xpath(".//tr[@class='alternate']/td/a/@href").get()
                'Short-hand code': act.xpath(".//tr/td/a/@href").get()
            }
So this is my code above for scraping. After running it, I only get 1 scraped result.
I think the issue has to do with how the table rows are created: some have a class while some do not.
I'm new to scraping, so any help would be appreciated!

Try this; I hope it will work.
import scrapy


class SsoSpider(scrapy.Spider):
    name = 'sso'
    allowed_domains = ['www.sso.agc.gov.sg']
    start_urls = ['https://sso.agc.gov.sg/Browse/Act/Current']

    def parse(self, response):
        acts = response.xpath("//*[@id='listPanel']/table/tbody/tr")
        for act in acts:
            yield {
                'Act title': act.xpath("td[1]/a/text()").extract_first(),
                'Short-hand code': act.xpath("td/a/@href").extract_first()
            }
Let me know.
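For context on why the original spider returned only one item: the XPath //table[@class='table browse-list']/tbody matches a single tbody node, so the loop runs once and .get() returns only the first matching title and href inside it. Iterating over the individual tr rows (as in the answer above) fixes that. Another minimal sketch, assuming the same listPanel table markup, pairing whole columns with getall() instead (the spider name here is hypothetical):

import scrapy


class SsoActsSpider(scrapy.Spider):
    # Hypothetical variant that collects whole columns and zips them together.
    name = 'sso_acts'
    allowed_domains = ['sso.agc.gov.sg']
    start_urls = ['https://sso.agc.gov.sg/Browse/Act/Current']

    def parse(self, response):
        rows = response.xpath("//*[@id='listPanel']/table/tbody/tr")
        titles = rows.xpath("./td[1]/a/text()").getall()
        links = rows.xpath("./td[1]/a/@href").getall()
        for title, link in zip(titles, links):
            yield {'Act title': title, 'Short-hand code': link}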

Related

How to use css selector in object from HtmlResponse

I'm currently developing an application using Scrapy.
I want to get some values using a CSS selector outside of def parse, so I created an HtmlResponse object first and tried to get the values using css(), but I can't get any value...
Within def parse, I can get the values in the same way.
What should I do to make this work outside of def parse?
Here is the code:
import scrapy
from scrapy.http import HtmlResponse


class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    my_response = HtmlResponse(url=start_urls[0])
    print('HtmlResponse')
    print(my_response)
    h3s = my_response.css('h3')
    print(str(len(h3s)))
    print('----------')

    def parse(self, response, **kwargs):
        print('def parse')
        print(response)
        h3s = response.css('h3')
        print(str(len(h3s)))
Console display:
HtmlResponse
<200 https://sample.com/search>
0 # <- I want to show '3' here
----------
def parse
<200 https://sample.com/search>
3
Update
The program I ultimately want to create is the following code:
[ Note: the code below is for reference and does not work ]
import scrapy
from scrapy.http import HtmlResponse


class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = []

    response_url = 'https://sample.com/search'
    my_response = HtmlResponse(url=response_url)
    categories = my_response.css('.categories a::attr(href)').getall()
    for category in categories:
        start_urls.append(category)

    def parse(self, response, **kwargs):
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
Python 3.8.5
Scrapy 2.5.0
I know what you mean: your start URL is the base domain, but you also want to fetch every category page and extract the h3 elements from them. Note that an HtmlResponse built only from a URL has an empty body (Scrapy never downloaded that page for you), which is why css() finds nothing outside of def parse.
In Scrapy you can extract data and follow new links in the same parse method; here is an example.
import scrapy


class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    def parse(self, response, **kwargs):
        print('def parse')
        print(response)

        # extract data here (yield items as dicts, not bare strings)
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
            yield {'title': page.css('::text').get()}

        # follow new links here
        categories = response.css('.categories a::attr(href)').getall()
        for category in categories:
            yield scrapy.Request(category, callback=self.parse)
You can read the Scrapy documentation for more information.
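If the category hrefs turn out to be relative, response.follow() joins them against the current page automatically, so you don't have to build absolute URLs yourself. A minimal sketch of that variant, assuming the same '.categories a' markup (the spider name is hypothetical):

import scrapy


class SampleFollowSpider(scrapy.Spider):
    # Hypothetical variant using response.follow(), which accepts relative hrefs.
    name = 'sample_follow'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    def parse(self, response, **kwargs):
        # Extract the h3 text on the current page.
        for page in response.css('h3'):
            yield {'title': page.css('::text').get()}
        # Follow each category link; relative URLs are resolved automatically.
        for href in response.css('.categories a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)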

Extract articles from its corresponding links from a webpage using scrapy

Hi, I am new to Scrapy and I am trying to extract text from links on a given webpage. Here is the code I wrote for that; after running scrapy crawl article, it gives "no module named article". Can you help me find where I went wrong? Thanks in advance.
import scrapy
from urllib.parse import urljoin


class ArticleSpider(scrapy.Spider):
    name = 'article'
    allowed_domains = ['www.timesofindia.indiatimes.com/business']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        links = response.css('span.w_tle a::attr(href)').extract()
        for link in links:
            url = urljoin(response.url, link)
            yield scrapy.Request(url, callback=self.parse_article)

    def parse_article(self, response):
        for info in response.css('div.article_content clearfix'):
            yield {'Article': info.css('div.Normal::text').extract()}
If you take a look at your log you'll see 'offsite/filtered': 211, and that's the cause of not getting anything: the article requests you yield are filtered as off-site because they don't match allowed_domains (which should contain a bare domain, not a domain plus a path). In order to dodge this you can do two things:
Remove the allowed_domains field
Add dont_filter=True to your request, like:
yield scrapy.Request(url, callback=self.parse_article, dont_filter=True)
I tested your code and it does not seem to work properly if you want the text body, so I rewrote it with XPath, which I am more comfortable with.
import scrapy
from urllib.parse import urljoin


class ArticleSpider(scrapy.Spider):
    name = 'article'
    allowed_domains = ['www.timesofindia.indiatimes.com']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        links = response.xpath('//*[@id="c_listing_wdt_1"]//span[1]/a/@href').getall()
        for link in links:
            url = urljoin(response.url, link)
            yield scrapy.Request(url, callback=self.parse_article, dont_filter=True)

    def parse_article(self, response):
        print(response.xpath('//*[@id="content"]//arttextxml//div//text()').getall())
        for info in response.xpath('//*[@id="content"]//arttextxml//div//text()').getall():
            yield {'Article': info}
getall() can be used instead of extract(); they are almost equivalent.
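A tiny standalone illustration of that equivalence, using a made-up HTML snippet rather than the question's page:

from scrapy.selector import Selector

# get()/getall() are the newer names for extract_first()/extract().
sel = Selector(text='<a href="/a">one</a><a href="/b">two</a>')
links = sel.css('a::attr(href)')
assert links.getall() == links.extract()        # ['/a', '/b']
assert links.get() == links.extract_first()     # '/a'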

Scrapy not crawling all links recursively

I need all internal links from all pages of the website for analysis. I have searched and found a lot of similar questions.
I found this code by Mithu which gives the closest possible answer. However, it is not providing all possible links from the second level of depth of pages.
The generated csv file has only 676 records, but the website has 1000 records.
Working code
import csv  # Done to avoid line gaps in the generated csv file
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from eylinks.items import LinkscrawlItem

outfile = open("data.csv", "w", newline='')
writer = csv.writer(outfile)


class ToscrapeSpider(scrapy.Spider):
    name = "toscrapesp"
    start_urls = ["http://books.toscrape.com/"]

    rules = ([Rule(LinkExtractor(allow=r".*"), callback='parse', follow=True)])

    def parse(self, response):
        extractor = LinkExtractor(allow_domains='toscrape.com')
        links = extractor.extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.collect_data)

    def collect_data(self, response):
        global writer
        for item in response.css('.product_pod'):
            product = item.css('h3 a::text').extract_first()
            value = item.css('.price_color::text').extract_first()
            lnk = response.url
            stats = response.status
            print(lnk)
            yield {'Name': product, 'Price': value, "URL": lnk, "Status": stats}
            writer.writerow([product, value, lnk, stats])
To extract the links, try this:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import csv

outfile = open("data.csv", "w", newline='')
writer = csv.writer(outfile)


class BooksScrapySpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

        next_page_url = response.xpath(
            "//a[text()='next']/@href").extract_first()
        absolute_next_page = response.urljoin(next_page_url)
        yield Request(absolute_next_page)

    def parse_book(self, response):
        title = response.css("h1::text").extract_first()
        price = response.xpath(
            "//*[@class='price_color']/text()").extract_first()
        url = response.request.url
        yield {'title': title,
               'price': price,
               'url': url,
               'status': response.status}
        writer.writerow([title, price, url, response.status])
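A side note on the original spider: the rules attribute only takes effect when the spider subclasses CrawlSpider; on a plain scrapy.Spider it is silently ignored, which is one reason deeper pages are never reached. A minimal CrawlSpider sketch, assuming you still want every internal toscrape.com link followed (the spider name and callback are hypothetical):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ToscrapeCrawlSpider(CrawlSpider):
    # Hypothetical CrawlSpider version; rules are honoured because of the base class.
    name = 'toscrape_crawl'
    allowed_domains = ['toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    rules = (
        # Follow every internal link and hand each downloaded page to parse_page.
        # Note: a CrawlSpider must not override parse itself.
        Rule(LinkExtractor(allow_domains='toscrape.com'),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        for item in response.css('.product_pod'):
            yield {
                'Name': item.css('h3 a::text').get(),
                'Price': item.css('.price_color::text').get(),
                'URL': response.url,
                'Status': response.status,
            }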

Scrapy with multiple pages

I have created a simple Scrapy project in which I get the total page count from the initial site example.com/full. Now I need to scrape all the pages, starting from example.com/page-2 up to that count (e.g. up to page 100 if the total page count is 100). How can I do that?
Any advice would be helpful.
Code:
import scrapy


class AllSpider(scrapy.Spider):
    name = 'all'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/full/']

    total_pages = 0

    def parse(self, response):
        total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
        # urls = ('https://example.com/page-{}'.format(i) for i in range(1, total_pages))
        print(total_pages)
Update #1:
I tried using urls = ('https://example.com/page-{}'.format(i) for i in range(1, total_pages)), but it's not working; maybe I'm doing something wrong.
Update #2:
I have changed my code to this:
class AllSpider(scrapy.Spider):
    name = 'all'
    allowed_domains = ['sanet.st']
    start_urls = ['https://sanet.st/full/']

    total_pages = 0

    def parse(self, response):
        total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
        for page in range(2, int(total_pages)):
            url = 'https://sanet.st/page-' + str(page)
            yield scrapy.Request(url)
            title = response.xpath('//*[@class="list_item_title"]/h2/a/span/text()').extract()
            print(title)
But the loop still shows only the first page's title repeatedly.
I need to extract the titles from the different pages and print them in the prompt.
How can I do that?
You must search for the 'next page' link and keep following it while it is present on the page.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class SanetSpider(scrapy.Spider):
    name = 'sanet'
    allowed_domains = ['sanet.st']
    start_urls = ['https://sanet.st/full/']

    def parse(self, response):
        yield {
            # Do something.
            'result': response.xpath('//h3[@class="posts-results"]/text()').extract_first()
        }

        # next_page = /page-{}/ where {} is the page number.
        next_page = response.xpath('//a[@data-tip="Next page"]/@href').extract_first()

        # next_page = https://sanet.st/page-{}/ where {} is the page number.
        next_page = response.urljoin(next_page)

        # If next_page has a value
        if next_page:
            # Recall parse with url https://sanet.st/page-{}/ where {} is the page number.
            yield scrapy.Request(url=next_page, callback=self.parse)
If you run this code with the "-o sanet.json" option you will get the following result.
scrapy runspider sanet.py -o sanet.json
[
{"result": "results 1 - 15 from 651"},
{"result": "results 16 - 30 from 651"},
{"result": "results 31 - 45 from 651"},
...
etc.
...
{"result": "results 631 - 645 from 651"},
{"result": "results 646 - 651 from 651"}
]
from scrapy.http import Request

def parse(self, response):
    total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
    # extract_first() returns a string, so convert it before building the range
    urls = ('https://example.com/page-{}'.format(i) for i in range(1, int(total_pages)))
    for url in urls:
        yield Request(url, callback=self.parse_page)

def parse_page(self, response):
    # do the stuff
    pass
An alternative way, as shown in the tutorial, is to use yield response.follow(url, callback=self.parse_page), which supports relative URLs directly.
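A minimal sketch of that response.follow approach, assuming the same li[6] element holds the total page count as text (the spider name and parse_page callback are hypothetical):

import scrapy


class AllPagesSpider(scrapy.Spider):
    # Hypothetical spider; the page-count XPath is taken from the question.
    name = 'all_pages'
    allowed_domains = ['sanet.st']
    start_urls = ['https://sanet.st/full/']

    def parse(self, response):
        total_pages = int(response.xpath(
            "//body/section/div/section/div/div/ul/li[6]/a/text()").get())
        for i in range(2, total_pages + 1):
            # response.follow also accepts relative URLs such as '/page-2'
            yield response.follow('https://sanet.st/page-{}'.format(i),
                                  callback=self.parse_page)

    def parse_page(self, response):
        for title in response.xpath('//*[@class="list_item_title"]/h2/a/span/text()').getall():
            yield {'title': title}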

Getting info from the links that were scraped

I am trying to get the links to the books first, then go into each link and grab the title of the book. In the end, I want to store the titles in one column and the links in another column of a csv file. This is the code I wrote; I only get the links, not the titles.
import scrapy


class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            yield {'Book Urls': link}
            yield scrapy.Request(link, callback=self.book_title)

    def book_title(self, response):
        title = response.xpath('//*[@id="productTitle"]/text()').extract_first()
        yield {'Title': title}
I solved it with response.meta.
import scrapy


class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            title = response.meta.get('title')
            yield scrapy.Request(link, callback=self.book_title, meta={'title': title, 'Link': link})

    def book_title(self, response):
        title = response.xpath('//*[@id="productTitle"]/text()').extract()
        response.meta['title'] = title
        yield response.meta
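On Scrapy 1.7 and later, cb_kwargs is a cleaner way to pass data between callbacks than meta, since it keeps your own fields separate from Scrapy's internal meta keys. A minimal sketch of the same idea with cb_kwargs, reusing the selectors from the code above (the spider name is hypothetical):

import scrapy


class AmazonBooksSpider(scrapy.Spider):
    # Hypothetical name; selectors and start URL are taken from the answer above.
    name = 'amazon_books'
    allowed_domains = ['www.amazon.com']
    start_urls = ['https://www.amazon.com/s/ref=dp_bc_3?ie=UTF8&node=468216&rh=n%3A283155%2Cn%3A%212349030011%2Cn%3A465600%2C']

    def parse(self, response):
        links = response.xpath('//*[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href').extract()
        for link in links:
            # Pass the link to the next callback as a keyword argument.
            yield scrapy.Request(link, callback=self.book_title, cb_kwargs={'link': link})

    def book_title(self, response, link):
        title = response.xpath('//*[@id="productTitle"]/text()').extract_first()
        yield {'Title': title, 'Link': link}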
