How to use css selector in object from HtmlResponse - python-3.x

I'm currently developing an application using Scrapy.
I want to get some values using CSS selectors outside of def parse, so I create an HtmlResponse object first and try to get the values using css(), but I can't get any value...
Within def parse, I can get the value in the same way.
What should I do if it is outside of def parse?
Here is the code:
import scrapy
from scrapy.http import HtmlResponse
class SampleSpider(scrapy.Spider):
    # Demonstrates the asker's problem: css() at class-definition time
    # returns nothing, while the same selector inside parse() works.
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    # NOTE(review): HtmlResponse is constructed with a URL only -- no
    # body= is passed and nothing is downloaded here, so the response
    # wraps an empty document; presumably that is why css('h3') finds 0
    # nodes below.
    my_response = HtmlResponse(url=start_urls[0])
    print('HtmlResponse')
    print(my_response)
    h3s = my_response.css('h3')
    print(str(len(h3s)))  # prints 0: the hand-built response has no body
    print('----------')

    def parse(self, response, **kwargs):
        # Here `response` was produced by Scrapy's downloader and carries
        # the real page HTML, so the same selector succeeds.
        print('def parse')
        print(response)
        h3s = response.css('h3')
        print(str(len(h3s)))  # prints 3 on the asker's page
Console display:
HtmlResponse
<200 https://sample.com/search>
0 # <- I want to show '3' here
----------
def parse
<200 https://sample.com/search>
3
update
The program I want to finally create is the following code:
[ (Note) The code below does not work for reference ]
import scrapy
from scrapy.http import HtmlResponse
class SampleSpider(scrapy.Spider):
    # Asker's goal (explicitly marked non-working): build start_urls from
    # category links scraped at class-definition time.
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = []
    response_url = 'https://sample.com/search'
    # NOTE(review): again no body= is passed, so my_response wraps an
    # empty document; .css(...) therefore yields no hrefs and start_urls
    # stays empty.
    my_response = HtmlResponse(url=response_url)
    categories = my_response.css('.categories a::attr(href)').getall()
    for category in categories:
        start_urls.append(category)

    def parse(self, response, **kwargs):
        # Print the text of every h3 on each crawled page.
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
Python 3.8.5
Scrapy 2.5.0

I know what you mean: your start URL is the base domain, but you also want to fetch every category page to extract the h3 elements.
In Scrapy you can extract data and follow new links in the same parse method; here is an example.
import scrapy
class SampleSpider(scrapy.Spider):
    """Extract h3 text from the search page and follow category links.

    Everything happens inside parse(): extract data from the current
    response, then schedule one request per category link with the same
    callback, so every category page is processed identically.
    """
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    def parse(self, response, **kwargs):
        print('def parse')
        print(response)
        # extract data at here
        pages = response.css('h3')
        for page in pages:
            text = page.css('::text').get()
            print(text)
            # Yield a dict item: Scrapy rejects bare strings from
            # callbacks ("Spider must return request, item, or None").
            yield {'title': text}
        # follow new links here
        categories = response.css('.categories a::attr(href)').getall()
        for category in categories:
            # response.follow resolves relative hrefs against the current
            # URL; scrapy.Request would fail on a relative link.
            yield response.follow(category, callback=self.parse)
you can read scrapy document for more information

Related

Scrapy script not scraping items

I'm not sure why my script isn't scraping any items; it is the same script that I'm using for another website, so maybe the classes I'm using are wrong.
`
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime
# Run date (dd_mm_YYYY), used to name the CSV feed file below.
date = datetime.now().strftime("%d_%m_%Y")
class stiendaSpider(scrapy.Spider):
    # Asker's spider: scrape TV names and prices from stienda.uy.
    name = 'stienda'
    start_urls = ['https://stienda.uy/tv']

    def parse(self, response):
        # One .grp778 container per product card.
        for products in response.css('.grp778'):
            # NOTE(review): per the answer below, .precioSantander
            # elements are empty in the page source (filled by AJAX),
            # so `price` is None and the `if` never fires.
            price = products.css('.precioSantander::text').get()
            name = products.css('#catalogoProductos .tit::text').get()
            if price and name:
                yield {'name': name.strip(),
                       'price': price.strip()}
# Write the feed into the raw-data folder (hard-coded Windows path).
os.chdir('C:\\Users\\cabre\\Desktop\\scraping\\stienda\\data\\raw')
process = CrawlerProcess(
    # settings={"FEEDS": {"items.csv": {"format": "csv"}}}
    settings={"FEEDS": {f"stienda_{date}.csv": {"format": "csv"}}}
)
process.crawl(stiendaSpider)
process.start()  # blocks until the crawl finishes
`
I tried several things but I don't understand why it is not working...
I was able to get the name field, but the price attribute is rendered empty and filled in later from an ajax call. That is why it's not being extracted.
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime
# Run date (dd_mm_YYYY); kept from the asker's script for the feed name.
date = datetime.now().strftime("%d_%m_%Y")
class stiendaSpider(scrapy.Spider):
    """Scrape product names from stienda.uy/tv.

    Prices are filled in later by an AJAX call, so only the name (which
    is present in the page source) is extracted here.
    """
    name = 'stienda'
    start_urls = ['https://stienda.uy/tv']

    def parse(self, response):
        # XPath attribute tests use the '@' axis; '#data-disp' is not
        # valid XPath (transcription artifact).  data-disp="1" marks the
        # visible product containers.
        for products in response.xpath('//div[@data-disp="1"]'):
            name = products.css('.tit::text').get()
            if name:
                yield {'name': name.strip()}
You can see it if you look at the page source... all of the elements with the class 'precioSantander' are empty.

Runtime Request URL change not working scrapy

I have written a script in Python using Scrapy. The code runs to fetch all the pages that exist containing the code. It works fine on the first page load when scrapy is started and as per the script logic gets us page no. 2. But after loading page 2 I am unable to get xpath of the new page loaded so I can move ahead this way and get all the web-page numbers.
Sharing the code snippet.
import scrapy
from scrapy import Spider
class PostsSpider(Spider):
    """Asker's spider: page through boston.com news via load-more."""
    name = "posts"
    start_urls = [
        'https://www.boston.com/category/news/'
    ]

    def parse(self, response):
        print("first time")
        print(response)
        # XPath attribute tests use the '@' axis; '#id' is not valid
        # XPath (transcription artifact).
        results = response.xpath(
            "//*[contains(@id, 'load-more')]/@data-next-page"
        ).extract_first()
        print(results)
        if results is not None:
            # extract_first() returns a single string (e.g. "2"), not a
            # list -- iterating it would walk characters and break once
            # the page number reaches two digits.
            page_number = 'page/' + results
            new_url = self.start_urls[0] + page_number
            print(new_url)
            yield scrapy.Request(url=new_url, callback=self.parse)
        else:
            print("last page")
It is because the page doesn't create new get requests when it loads the next page, it makes an ajax call to an api that returns json.
I made some adjustments to your code so it should work properly now. I am assuming that there is something other than the next page number you are trying to extract from each page, so I wrapped the html string into a scrapy.Selector class so you can use XPath and such on it. This script will crawl a lot of pages really fast, so you might want to adjust your settings to slow it down too.
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
class PostsSpider(Spider):
    """Crawl boston.com news pages through the load-more JSON API.

    The first response is plain HTML and seeds the loop with the page
    number found in the load-more element; every later response is JSON
    from the API and carries its own nextPage pointer.
    """
    name = "posts"
    ajaxurl = "https://www.boston.com/wp-json/boston/v1/load-more?taxonomy=category&term_id=779&search_query=&author=&orderby=&page=%s&_wpnonce=f43ab1aae4&ad_count=4&redundant_ids=25129871,25130264,25129873,25129799,25128140,25126233,25122755,25121853,25124456,25129584,25128656,25123311,25128423,25128100,25127934,25127250,25126228,25126222"
    start_urls = [
        'https://www.boston.com/category/news/'
    ]

    def parse(self, response):
        new_url = None
        try:
            json_result = response.json()
            html = json_result['data']['html']
            # Wrap the embedded HTML so XPath/CSS can be run against it.
            selector = Selector(text=html, type="html")
            # ... do some xpath stuff with selector.xpath.....
            new_url = self.ajaxurl % json_result["data"]["nextPage"]
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed.  We land here on the first (HTML) response,
            # where .json() fails; fall back to scraping the next-page
            # number out of the load-more element ('@', not '#', is the
            # XPath attribute axis).
            results = response.xpath(
                "//*[contains(@id, 'load-more')]/@data-next-page"
            ).extract_first()
            if results is not None:
                # extract_first() returns one string; don't iterate its
                # characters.
                new_url = self.ajaxurl % results
        if new_url:
            print(new_url)
            yield scrapy.Request(url=new_url, callback=self.parse)

How do I scrape multiple table rows from this url using scrapy?

import scrapy
class SsoSpider(scrapy.Spider):
    """Asker's spider: scrape Act titles and links from the listing table."""
    name = 'sso'
    allowed_domains = ['www.sso.agc.gov.sg']
    start_urls = ['https://sso.agc.gov.sg/Browse/Act/Current']

    def parse(self, response):
        # XPath attribute tests use the '@' axis; '#class' / '#href' are
        # not valid XPath (transcription artifacts).
        # NOTE(review): selecting tbody yields one node per table, and
        # .get() then returns only the first matching descendant --
        # presumably why only one result comes back; iterate the <tr>
        # rows instead (see answer below).
        acts = response.xpath("//table[@class='table browse-list']/tbody")
        for act in acts:
            yield {
                # 'Act title': act.xpath(".//tr[@class='alternate']/td/a/text()").get(),
                'Act title': act.xpath(".//tr/td/a/text()").get(),
                # 'Short-hand code': act.xpath(".//tr[@class='alternate']/td/a/@href").get()
                'Short-hand code': act.xpath(".//tr/td/a/@href").get()
            }
So this is my code above for scraping. After running it, I only get 1 scraped result.
I think the issue got to do with how the table rows are created? Like some has class while some does not.
I'm new to scraping so any help would be appreciated!
Try this I hope it will work.
import scrapy
class SsoSpider(scrapy.Spider):
    """Yield one record per Act row in the current-Acts listing table."""
    name = 'sso'
    allowed_domains = ['www.sso.agc.gov.sg']
    start_urls = ['https://sso.agc.gov.sg/Browse/Act/Current']

    def parse(self, response):
        # Select every <tr> (not the tbody) so each row becomes one item.
        # XPath attribute tests use the '@' axis; '#id' / '#href' are not
        # valid XPath (transcription artifacts).
        acts = response.xpath("//*[@id='listPanel']/table/tbody/tr")
        for act in acts:
            yield {
                'Act title': act.xpath("td[1]/a/text()").extract_first(),
                'Short-hand code': act.xpath("td/a/@href").extract_first()
            }
Let me know.

Extract articles from its corresponding links from a webpage using scrapy

Hi I am new to scrapy and I am Trying to extract text from links in a given webpage. Here is the code I wrote for the same and after running scrapy crawl article, it gives no module named article. Can you help me find where I am wrong? Thanks in advance.
import scrapy
from urllib.parse import urljoin
class ArticleSpider(scrapy.Spider):
    # Asker's spider: follow headline links and extract article text.
    name = 'article'
    # NOTE(review): including the '/business' path here makes the offsite
    # middleware filter every followed article URL (the accepted answer
    # points at 'offsite/filtered': 211 in the log).
    allowed_domains = ['www.timesofindia.indiatimes.com/business']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        # Collect headline hrefs and schedule one article request each.
        links = response.css('span.w_tle a::attr(href)').extract()
        for link in links:
            url = urljoin(response.url, link)
            yield scrapy.Request(url, callback=self.parse_article)

    def parse_article(self, response):
        # NOTE(review): 'div.article_content clearfix' selects a
        # <clearfix> descendant element; 'div.article_content.clearfix'
        # was presumably intended -- confirm against the page markup.
        for info in response.css('div.article_content clearfix'):
            yield {'Article': info.css('div.Normal::text').extract()}
If you take a look at your log you'll see 'offsite/filtered': 211, and that's the cause of not getting anything. In order to dodge this you can do two things:
Remove allowed_domains field
Add dont_filter=True in your request like:
yield scrapy.Request(url,callback=self.parse_article, dont_filter=True)
I tested your code and it does not seem to work properly if you want to get the text body, so I rewrote it with XPath, which I am more comfortable with.
import scrapy
from urllib.parse import urljoin
class ArticleSpider(scrapy.Spider):
    """Follow business-section headline links and yield article text."""
    name = 'article'
    # Domain only -- no path -- so the offsite filter behaves sanely.
    allowed_domains = ['www.timesofindia.indiatimes.com']
    start_urls = ['https://timesofindia.indiatimes.com/business']

    def parse(self, response):
        # XPath attribute tests use the '@' axis; '#id' is not valid
        # XPath (transcription artifact).
        links = response.xpath('//*[@id="c_listing_wdt_1"]//span[1]/a/@href').getall()
        for link in links:
            url = urljoin(response.url, link)
            # dont_filter stops the offsite filter from dropping these.
            yield scrapy.Request(url, callback=self.parse_article, dont_filter=True)

    def parse_article(self, response):
        # Run the selector once and reuse the result instead of
        # evaluating the same XPath twice per page.
        paragraphs = response.xpath('//*[@id="content"]//arttextxml//div//text()').getall()
        print(paragraphs)
        for info in paragraphs:
            yield {'Article': info}
getall() can be used instead of extract(), they are almost equal.

Export scraped data to CSV

Can someone please explain to me how to export the scraped data from this script to a csv through a python script? It seems that I am successfully scraping the data through the output I am seeing, but I am not sure how to put this into a csv efficiently. Thanks.
import scrapy
import scrapy.crawler as crawler
class RedditbotSpider(scrapy.Spider):
    # Asker's spider: scrape post metadata from r/gameofthrones.
    name = 'redditbot'
    # NOTE(review): allowed_domains conventionally holds bare domains;
    # the '/r/gameofthrones/' path portion is presumably ignored by the
    # offsite filter -- confirm.
    allowed_domains = ['www.reddit.com/r/gameofthrones/']
    start_urls = ['https://www.reddit.com/r/gameofthrones/']
    # Plain class attribute, not a Scrapy feed setting; unused below.
    output = 'output.csv'

    def parse(self, response):
        yield {'a': 'b'}  # debug item, presumably left in by accident
        # Extracting the content using css selectors
        titles = response.css('.title.may-blank::text').extract()
        votes = response.css('.score.unvoted::text').extract()
        times = response.css('time::attr(title)').extract()
        comments = response.css('.comments::text').extract()
        # Give the extracted content row wise
        for item in zip(titles, votes, times, comments):
            # create a dictionary to store the scraped info
            scraped_info = {
                'title': item[0],
                'vote': item[1],
                'created_at': item[2],
                'comments': item[3],
            }
            # yield or give the scraped info to scrapy
            yield scraped_info
def run_crawler(spider_cls):
    """Start a crawl of `spider_cls` on a fresh CrawlerRunner.

    Returns the Twisted Deferred that fires when the crawl finishes.
    """
    return crawler.CrawlerRunner().crawl(spider_cls)
def test_scrapy_crawler():
    """Run RedditbotSpider once and re-raise any crawl failure.

    Returns the Deferred so a caller (or test harness) can wait on it.
    """
    deferred = run_crawler(RedditbotSpider)

    # '@' restores the decorator syntax: it had been transcribed as '#',
    # which commented the decorators out so success/error were never
    # attached to the Deferred.
    @deferred.addCallback
    def success(results):
        """
        After crawler completes, this function will execute.
        Do your assertions in this function.
        """

    @deferred.addErrback
    def error(failure):
        raise failure.value

    return deferred


test_scrapy_crawler()
you can include the Feed Exporter configuration on the settings before running the spider. So for your code try changing:
runner = crawler.CrawlerRunner()
with
runner = crawler.CrawlerRunner({
'FEED_URI': 'output_file.csv',
'FEED_FORMAT': 'csv',
})
The output items should be inside the output_file.csv file in the same directory you run this script.

Resources