I get "Twisted.internet.error.ReactorNotRestartable" when I run this spider. Actually, I get this error when I run any spider in the project directory. It was working fine when I last checked But since today it gives me this error. I am not familiar with twisted. Please help!
import scrapy
from quo.items import QuoItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from twisted.internet import reactor


class ISpider(CrawlSpider):
    name = 'iShopE'
    handle_httpstatus_list = [400]

    def start_requests(self):
        yield scrapy.Request('https://www.ishopping.pk/electronics/home-theatres.html')

    def parse(self, response):
        for href in response.xpath('//div[@class="category-products-"]').extract():
            for product_page in response.xpath('//h2[@class="product-name"]/a/@href').extract():
                url = response.urljoin(product_page)
                yield scrapy.Request(url, callback=self.parse_productPage)
        next_page = response.xpath('(//a[@class="next i-next"]/@href)[1]').extract()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page[0]), callback=self.parse)

    def parse_productPage(self, response):
        url_ProductPage = response.url
        item = QuoItem()
        if url_ProductPage:
            item['url_ProductPage'] = url_ProductPage
        for rev in response.xpath('//div[@class="product-essential"]'):
            price = response.xpath('//div[@class="price-box"]/span[@class="regular-price"]/meta[@itemprop="price"]/@content').extract()
            if price:
                item['price'] = price
            newPrice = response.xpath('(//div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text())[1]').extract()
            if newPrice:
                newPrice = " ".join(str(x) for x in newPrice)
                newPrice = newPrice.strip('\n')
                item['price'] = newPrice.replace(" ", "")
                item['newPrice'] = newPrice.replace(" ", "")
            oldPrice = response.xpath('(//div[@class="price-box"]/p[@class="old-price"]/span[@class="price"]/text())[1]').extract()
            if oldPrice:
                oldPrice = " ".join(str(x) for x in oldPrice)
                oldPrice = oldPrice.strip('\n')
                item['oldPrice'] = oldPrice.replace(" ", "")
            Availability = response.xpath('//p[@class="availability in-stock"]/span[@class="value"]/text()').extract()
            if Availability:
                item['Availability'] = Availability
            Brand = response.xpath('(//div[@class="box-p-attr"]/span)[1]/text()').extract()
            if Brand:
                item['Brand'] = Brand
            deliveryTime = response.xpath('(//div[@class="box-p-attr"]/span)[2]/text()').extract()
            if deliveryTime:
                item['deliveryTime'] = deliveryTime
            Waranty = response.xpath('(//div[@class="box-p-attr"]/span)[3]/text()').extract()
            if Waranty:
                item['Waranty'] = Waranty
            title = response.xpath('//div[@class="product-name"]/h1[@itemprop="name"]/text()').extract()
            if title:
                item['Title'] = title
            yield item
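For context (this is general background, not code from the project above): Twisted raises ReactorNotRestartable when its reactor is started a second time in the same Python process. A minimal sketch that reproduces the error with Scrapy's own API, using the ISpider class above, looks like this:

# Minimal reproduction sketch (hypothetical, not the project's actual code):
# Twisted's reactor can only be started once per Python process, so a second
# process.start() raises twisted.internet.error.ReactorNotRestartable.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(ISpider)
process.start()   # runs the crawl and stops the reactor when it finishes

process.crawl(ISpider)
process.start()   # raises twisted.internet.error.ReactorNotRestartable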
I'm not sure why my script isn't scraping any items; it is the same script that I'm using for another website, so maybe the classes I'm using are wrong.
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime

date = datetime.now().strftime("%d_%m_%Y")


class stiendaSpider(scrapy.Spider):
    name = 'stienda'
    start_urls = ['https://stienda.uy/tv']

    def parse(self, response):
        for products in response.css('.grp778'):
            price = products.css('.precioSantander::text').get()
            name = products.css('#catalogoProductos .tit::text').get()
            if price and name:
                yield {'name': name.strip(),
                       'price': price.strip()}


os.chdir('C:\\Users\\cabre\\Desktop\\scraping\\stienda\\data\\raw')

process = CrawlerProcess(
    # settings={"FEEDS": {"items.csv": {"format": "csv"}}}
    settings={"FEEDS": {f"stienda_{date}.csv": {"format": "csv"}}}
)
process.crawl(stiendaSpider)
process.start()
I tried several things, but I don't understand why it is not working.
I was able to get the name field, but the price element is rendered empty and filled in later by an AJAX call; that is why it's not being extracted.
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime

date = datetime.now().strftime("%d_%m_%Y")


class stiendaSpider(scrapy.Spider):
    name = 'stienda'
    start_urls = ['https://stienda.uy/tv']

    def parse(self, response):
        for products in response.xpath('//div[@data-disp="1"]'):
            name = products.css('.tit::text').get()
            if name:
                yield {'name': name.strip()}
You can see it if you look at the page source... all of the elements with the class 'precioSantander' are empty.
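One possible way around this, sketched below under the assumption that the price comes back from an XHR/JSON endpoint visible in the browser's network tab; the endpoint URL and the data-id attribute used here are placeholders, not the site's real names:

import scrapy


class StiendaPriceSpider(scrapy.Spider):
    # Sketch only: 'https://stienda.uy/api/price' and 'data-id' are hypothetical
    # placeholders -- replace them with the real XHR request and attribute the
    # product page actually uses (check the browser's network tab).
    name = 'stienda_price'
    start_urls = ['https://stienda.uy/tv']

    def parse(self, response):
        for product in response.xpath('//div[@data-disp="1"]'):
            name = product.css('.tit::text').get()
            product_id = product.attrib.get('data-id')  # hypothetical attribute
            if name and product_id:
                yield scrapy.Request(
                    f'https://stienda.uy/api/price?id={product_id}',  # placeholder URL
                    callback=self.parse_price,
                    cb_kwargs={'name': name.strip()},
                )

    def parse_price(self, response, name):
        data = response.json()  # assumes the endpoint returns JSON
        yield {'name': name, 'price': data.get('price')}

Another option would be rendering the page with something like scrapy-playwright, but requesting the AJAX endpoint directly is usually lighter.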
I wanted to scrape all items from each card. The first rule works fine, but the second rule, the pagination rule, is not working.
This is my code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class RealtorListSpider(CrawlSpider):
    name = 'realtor_list'
    allowed_domains = ['www.realtor.com']
    start_urls = ['https://www.realtor.com/realestateagents/New-Orleans_LA/pg-1']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@data-testid="component-agentCard"]'), callback='parse_item', follow=False),
        Rule(LinkExtractor(restrict_xpaths='//a[@aria-label="Go to next page"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'name': response.xpath('(//*[@class="jsx-3130164309 profile-Tiltle-main"]/text())[2]').get()
        }
The problem is in the element selection of your pagination LinkExtractor, not in the Rule itself: that XPath expression does not select a link. Your item selection is correct, so I've moved the pagination into the start URLs instead, and it works fine.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class RealtorListSpider(CrawlSpider):
    name = 'realtor_list'
    allowed_domains = ['www.realtor.com']
    start_urls = ['https://www.realtor.com/realestateagents/New-Orleans_LA/pg-' + str(x) for x in range(1, 6)]

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@data-testid="component-agentCard"]'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        yield {
            'name': response.xpath('(//*[@class="jsx-3130164309 profile-Tiltle-main"]/text())[2]').get()
        }
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scraper_api import ScraperAPIClient


class Spider(CrawlSpider):
    allowed_domains = ['example.com']
    client = ScraperAPIClient('xyz')

    # Initialize method for taking arguments from the user
    def __init__(self, category=None, location=None, **kwargs):
        # self.start_urls = [client.scrapyGet(url='http://example.com')]
        super().__init__(**kwargs)  # python3

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[contains(@class,'on-click-container')]/a[contains(@href, '/biz/')]"), callback='parse_item', follow=True, process_request='set_proxy'),
        # for next page
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@class,"next-link")]'), process_request='set_proxy'),
    )

    def set_proxy(self, request):
        pass

    def parse_item(self, response):
        # contains data
        yield {
            # BUSINESS INFORMATION
            "Company Name": response.xpath('//title').extract_first(),
        }
Here I don't understand how to write the set_proxy function that sends the request to the ScraperAPI server. Please help with this.
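For what it's worth, here is a hedged sketch of one way set_proxy could be written. It assumes the scraper_api SDK's scrapyGet() helper (which builds a proxied URL for Scrapy to fetch) and the pre-2.0 Rule process_request signature the code above already uses, where only the request is passed in:

    def set_proxy(self, request):
        # Sketch only: rewrite the request so it is fetched through the ScraperAPI
        # proxy URL returned by scrapyGet(); keep the original target URL in meta
        # so parse_item can still tell which page was actually scraped.
        proxied_url = self.client.scrapyGet(url=request.url)
        return request.replace(
            url=proxied_url,
            meta={**request.meta, 'original_url': request.url},
            dont_filter=True,  # the proxied URL no longer matches the extracted one
        )

On Scrapy 2.0 and later the process_request callable receives (request, response), so the signature would need an extra parameter. The proxy's domain (whatever scrapyGet returns) may also need to be added to allowed_domains so the off-site middleware does not drop the rewritten requests.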
I am following code from these previous Stack Overflow posts:
How to schedule Scrapy crawl execution programmatically
Running Scrapy multiple times in the same process
The following script works well while using one spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
# from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode

with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
    PList = yaml.load(file, Loader=yaml.FullLoader)

Purl = []
for k, v in PList.items():
    arg = v['M_title']
    args = {"k": arg}
    amazon_url = 'https://www.amazon.com/s?{}'.format(urlencode(args))
    Purl.append(amazon_url)

print(Purl)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings=get_project_settings())


@inlineCallbacks
def loop_urls(urls):
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()


loop_urls(Purl)
reactor.run()
But this script doesn't even scrape successfully using the first spider, and can't access the second spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode

# def crawl_job():
#     """
#     Job to start spiders.
#     Return Deferred, which will execute after crawl has completed.
#     """
#     settings = get_project_settings()
#     runner = CrawlerRunner(settings)
#     return runner.crawl(AmazonfeedSpider)


def CrawlProduct():
    settings = get_project_settings()
    runner2 = CrawlerRunner(settings)
    yield runner2.crawl(ProductfeedSpider)
    reactor.stop()


def schedule_next_crawl(null, sleep_time):
    """
    Schedule the next crawl
    """
    reactor.callLater(sleep_time, CrawlProduct)


@inlineCallbacks
def loop_urls(urls):
    """
    Job to start spiders.
    Return Deferred, which will execute after crawl has completed.
    """
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()


def crawl(Purl):
    """
    A function that schedules a crawl 30 seconds after
    each successful crawl.
    """
    # loop_urls() returns a Deferred
    d = loop_urls(Purl)
    # call schedule_next_crawl(<scrapy response>, n) after crawl job is complete
    d.addCallback(schedule_next_crawl, 30)
    d.addErrback(catch_error)


def catch_error(failure):
    print(failure.value)


if __name__ == "__main__":
    with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
        PList = yaml.load(file, Loader=yaml.FullLoader)
    Purl = []
    for k, v in PList.items():
        arg = v['M_title']
        args = {"k": arg}
        amazon_url = 'https://www.amazon.com/s?{}'.format(urlencode(args))
        Purl.append(amazon_url)
    print(Purl)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawl(Purl)
    reactor.run()
Is this failing because the inlineCallbacks function is not being executed properly? I am looking forward to suggestions and solutions from experts; please look over the aforementioned Stack Overflow questions and their solutions first, before answering my question.
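For reference, a minimal sketch of the sequential-crawl pattern from the Scrapy docs (running several crawls one after another on a single CrawlerRunner and stopping the reactor exactly once at the end), adapted to the names used above; it assumes AmazonfeedSpider accepts the url argument the way the script already passes it:

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from AmazonScrap.spiders.Productfeed import ProductfeedSpider


configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings=get_project_settings())


@defer.inlineCallbacks
def crawl(urls):
    # Each yield waits for the previous crawl to finish before starting the
    # next one; the reactor is stopped once, after the last crawl completes.
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    yield runner.crawl(ProductfeedSpider)
    reactor.stop()


crawl(Purl)      # Purl built from the YAML file as in the script above
reactor.run()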
I am very happy to have discovered the Scrapy CrawlSpider class with its Rule objects. However, when I try to exclude URLs which contain the word "login" with process_links, it doesn't work. The solution I implemented comes from here: Example code for Scrapy process_links and process_request, but it doesn't exclude the pages I want.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from accenture.items import AccentureItem


class AccentureSpiderSpider(CrawlSpider):
    name = 'accenture_spider'
    start_urls = ['https://www.accenture.com/us-en/internet-of-things-index']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@href, "insight")]'), callback='parse_item', process_links='process_links', follow=True),
    )

    def process_links(self, links):
        for link in links:
            if 'login' in link.text:
                continue  # skip all links that have "login" in their text
            yield link

    def parse_item(self, response):
        loader = ItemLoader(item=AccentureItem(), response=response)
        url = response.url
        loader.add_value('url', url)
        yield loader.load_item()
My mistake was to use link.text. When using link.url it works fine :)
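For completeness, a sketch of the corrected process_links, filtering on the Link object's url attribute instead of its anchor text:

    def process_links(self, links):
        for link in links:
            # skip every extracted link whose URL (not its anchor text) contains "login"
            if 'login' in link.url:
                continue
            yield link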