How to use a proxy in a Scrapy crawler? - python-3.x

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scraper_api import ScraperAPIClient

class Spider(CrawlSpider):
    allowed_domains = ['example.com']
    client = ScraperAPIClient('xyz')

    # Initialize method for taking arguments from the user
    def __init__(self, category=None, location=None, **kwargs):
        # self.start_urls = [client.scrapyGet(url='http://example.com')]
        super().__init__(**kwargs)  # python3

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[contains(@class, 'on-click-container')]/a[contains(@href, '/biz/')]"),
             callback='parse_item', follow=True, process_request='set_proxy'),
        # for the next page
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@class, "next-link")]'),
             process_request='set_proxy'),
    )

    def set_proxy(self, request):
        pass

    def parse_item(self, response):
        # contains data
        yield {
            # BUSINESS INFORMATION
            "Company Name": response.xpath('//title').extract_first(),
        }
What I don't understand is how to write the set_proxy function so that requests are sent through the ScraperAPI server. Please help with this.
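One possible shape for set_proxy, sketched here as an assumption rather than a confirmed answer: reuse the client.scrapyGet() call from the commented-out start_urls line to rewrite each extracted request so it goes through ScraperAPI.
    def set_proxy(self, request, response=None):
        # Sketch only: build the proxied URL with ScraperAPI's scrapyGet() and
        # return a copy of the request pointing at it (callback and meta are kept).
        # Caveat: the rewritten URL lives on the ScraperAPI host, so allowed_domains
        # may need to include that host (or be removed) to avoid offsite filtering.
        # In Scrapy 2.0+ the process_request callable also receives the response.
        return request.replace(url=self.client.scrapyGet(url=request.url))
ScraperAPI also offers a proxy-port mode; with that approach set_proxy would instead set request.meta['proxy'] and return the request unchanged.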

Related

How to use a CSS selector on an HtmlResponse object

I'm currently developing an application using Scrapy.
I want to get some values with a CSS selector outside of def parse, so I created an HtmlResponse object first and tried to get some values using css(), but I can't get any value...
Within def parse, I can get the values in the same way.
What should I do when the code is outside of def parse?
Here is the code:
import scrapy
from scrapy.http import HtmlResponse

class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    my_response = HtmlResponse(url=start_urls[0])
    print('HtmlResponse')
    print(my_response)
    h3s = my_response.css('h3')
    print(str(len(h3s)))
    print('----------')

    def parse(self, response, **kwargs):
        print('def parse')
        print(response)
        h3s = response.css('h3')
        print(str(len(h3s)))
Console display:
HtmlResponse
<200 https://sample.com/search>
0 # <- I want to show '3' here
----------
def parse
<200 https://sample.com/search>
3
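The 0 at class level is expected here: HtmlResponse(url=...) only constructs a response object, it does not download anything, so its body is empty and css() has nothing to match. A minimal illustration (the HTML string below is made up for the example):
from scrapy.http import HtmlResponse

# constructing a response does not fetch the page; the body must be supplied
empty = HtmlResponse(url='https://sample.com/search')
print(len(empty.css('h3')))   # 0 -> empty body

html = '<html><body><h3>a</h3><h3>b</h3><h3>c</h3></body></html>'
filled = HtmlResponse(url='https://sample.com/search', body=html, encoding='utf-8')
print(len(filled.css('h3')))  # 3 -> selectors work once a body exists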
Update
The program I ultimately want to create is the following code:
[(Note) The code below does not work; it is shown for reference only]
import scrapy
from scrapy.http import HtmlResponse

class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = []

    response_url = 'https://sample.com/search'
    my_response = HtmlResponse(url=response_url)
    categories = my_response.css('.categories a::attr(href)').getall()
    for category in categories:
        start_urls.append(category)

    def parse(self, response, **kwargs):
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
Python 3.8.5
Scrapy 2.5.0
I know what you mean: your start URL is the base domain, but you also want to fetch every category page and extract the h3 elements from it.
In Scrapy you can extract data and follow new links in the same parse method. Here is an example.
import scrapy

class SampleSpider(scrapy.Spider):
    name = 'sample'
    allowed_domains = ['sample.com']
    start_urls = ['https://sample.com/search']

    def parse(self, response, **kwargs):
        print('def parse')
        print(response)

        # extract data here
        pages = response.css('h3')
        for page in pages:
            print(page.css('::text').get())
            # yield an item (a dict) rather than a bare string, which Scrapy rejects
            yield {'title': page.css('::text').get()}

        # follow new links here
        categories = response.css('.categories a::attr(href)').getall()
        for category in categories:
            yield scrapy.Request(category, callback=self.parse)
You can read the Scrapy documentation for more information.
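One small variation worth noting (an assumption, since the category hrefs may be relative): response.follow() resolves a relative href against the current page, which a bare scrapy.Request() does not do.
    # drop-in replacement for the Request loop above, inside parse()
    for category in response.css('.categories a::attr(href)').getall():
        yield response.follow(category, callback=self.parse)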

Scrapy CrawlSpider next page isn't working

I wanted to scrape all items from each card. The first rule is working fine, but the second rule, the pagination rule, is not working.
This is my code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class RealtorListSpider(CrawlSpider):
    name = 'realtor_list'
    allowed_domains = ['www.realtor.com']
    start_urls = ['https://www.realtor.com/realestateagents/New-Orleans_LA/pg-1']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@data-testid="component-agentCard"]'), callback='parse_item', follow=False),
        Rule(LinkExtractor(restrict_xpaths='//a[@aria-label="Go to next page"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'name': response.xpath('(//*[@class="jsx-3130164309 profile-Tiltle-main"]/text())[2]').get()
        }
The problem is in your element selection in the LinkExtractor, not in the pagination rule itself. The XPath expression doesn't end up selecting a followable link, although the selection is otherwise correct; that's why I've moved the pagination into the start URLs, and it works fine.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class RealtorListSpider(CrawlSpider):
    name = 'realtor_list'
    allowed_domains = ['www.realtor.com']
    start_urls = ['https://www.realtor.com/realestateagents/New-Orleans_LA/pg-' + str(x) for x in range(1, 6)]

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@data-testid="component-agentCard"]'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        yield {
            'name': response.xpath('(//*[@class="jsx-3130164309 profile-Tiltle-main"]/text())[2]').get()
        }
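If you want to see for yourself why the pagination Rule never fires, one way to check (a sketch for scrapy shell, not part of the answer above) is to run the rule's LinkExtractor by hand and inspect what it actually extracts:
from scrapy.linkextractors import LinkExtractor

# run inside `scrapy shell <listing-page-url>`; an empty list means a Rule
# built from this extractor would never schedule a request
le = LinkExtractor(restrict_xpaths='//a[@aria-label="Go to next page"]')
print(le.extract_links(response))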

Scrapy Rules: Exclude certain URLs with process_links

I am very happy to have discovered the Scrapy crawl class with its Rule objects. However, when I try to exclude URLs which contain the word "login" with process_links, it doesn't work. The solution I implemented comes from here: Example code for Scrapy process_links and process_request, but it doesn't exclude the pages I want.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from accenture.items import AccentureItem

class AccentureSpiderSpider(CrawlSpider):
    name = 'accenture_spider'
    start_urls = ['https://www.accenture.com/us-en/internet-of-things-index']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@href, "insight")]'), callback='parse_item', process_links='process_links', follow=True),
    )

    def process_links(self, links):
        for link in links:
            if 'login' in link.text:
                continue  # skip all links that have "login" in their text
            yield link

    def parse_item(self, response):
        loader = ItemLoader(item=AccentureItem(), response=response)
        url = response.url
        loader.add_value('url', url)
        yield loader.load_item()
My mistake was to use link.text
When using link.url it works fine :)
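For completeness, the corrected method then looks like this (the same loop as above with link.text swapped for link.url):
    def process_links(self, links):
        for link in links:
            if 'login' in link.url:
                continue  # skip all links whose URL contains "login"
            yield link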

Downloading files with ItemLoaders() in Scrapy

I created a crawl spider to download files. However, the spider downloaded only the URLs of the files and not the files themselves. I uploaded a question here: Scrapy crawl spider does not download files? While the basic yield spider kindly suggested in the answers works perfectly, when I attempt to download the files with items or item loaders the spider does not work! The original question does not include items.py, so here it is:
ITEMS
import scrapy
from scrapy.item import Item, Field

class DepositsusaItem(Item):
    # main fields
    name = Field()
    file_urls = Field()
    files = Field()
    # housekeeping fields
    url = Field()
    project = Field()
    spider = Field()
    server = Field()
    date = Field()
EDIT: added original code
EDIT: further corrections
SPIDER
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import datetime
import socket
from us_deposits.items import DepositsusaItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
from urllib.parse import urljoin

class DepositsSpider(CrawlSpider):
    name = 'deposits'
    allowed_domains = ['doi.org']
    start_urls = ['https://minerals.usgs.gov/science/mineral-deposit-database/#products']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@id="products"][1]/p/a'),
             callback='parse_x'),
    )

    def parse_x(self, response):
        i = ItemLoader(item=DepositsusaItem(), response=response)
        i.add_xpath('name', '//*[@class="container"][1]/header/h1/text()')
        i.add_xpath('file_urls', '//span[starts-with(@data-url, "/catalog/file/get/")]/@data-url',
                    MapCompose(lambda i: urljoin(response.url, i)))
        i.add_value('url', response.url)
        i.add_value('project', self.settings.get('BOT_NAME'))
        i.add_value('spider', self.name)
        i.add_value('server', socket.gethostname())
        i.add_value('date', datetime.datetime.now())
        return i.load_item()
SETTINGS
BOT_NAME = 'us_deposits'
SPIDER_MODULES = ['us_deposits.spiders']
NEWSPIDER_MODULE = 'us_deposits.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'us_deposits.pipelines.UsDepositsPipeline': 1,
    'us_deposits.pipelines.FilesPipeline': 2,
}
FILES_STORE = 'C:/Users/User/Documents/Python WebCrawling Learning Projects'
PIPELINES
class UsDepositsPipeline(object):
    def process_item(self, item, spider):
        return item

class FilesPipeline(object):
    def process_item(self, item, spider):
        return item
It seems to me that using items and/or item loaders has nothing to do with your problem.
The only problems I see are in your settings file:
- FilesPipeline is not activated (only us_deposits.pipelines.UsDepositsPipeline is)
- FILES_STORE should be a string, not a set (an exception is raised when you activate the files pipeline)
- ROBOTSTXT_OBEY = True will prevent the downloading of files
If I correct all of those issues, the file download works as expected.
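A sketch of what the corrected settings could look like, following the three points above (the priorities shown are illustrative, and Scrapy's built-in pipeline replaces the empty FilesPipeline stub from pipelines.py):
ITEM_PIPELINES = {
    'us_deposits.pipelines.UsDepositsPipeline': 1,
    # activate Scrapy's built-in files pipeline, not the stub of the same name
    'scrapy.pipelines.files.FilesPipeline': 2,
}
# a plain string, not a set
FILES_STORE = 'C:/Users/User/Documents/Python WebCrawling Learning Projects'
# per the answer above, obeying robots.txt prevents the file downloads here
ROBOTSTXT_OBEY = False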

How to get the count of matched words on google.com using Scrapy

If I give a word to the Scrapy application, it has to search on Google and print the count of the matched words. Don't hardcode the word in the application; it should be taken from the console.
import scrapy

class GogleSpider(scrapy.Spider):
    name = 'gogle'
    allowed_domains = ['google.co.in']
    start_urls = ['https://www.google.co.in/?gfe_rd=cr/']

    def parse(self, response):
        pass  # not implemented yet
As in the documentation:
import scrapy

class GogleSpider(scrapy.Spider):
    name = 'gogle'
    allowed_domains = ['google.co.in']
    start_urls = ['https://www.google.co.in/?gfe_rd=cr/']

    def __init__(self, word=None, *args, **kwargs):
        super(GogleSpider, self).__init__(*args, **kwargs)
        self.word = word

    def parse(self, response):
        print("word:", self.word)
And now you can run it in the console as
scrapy crawl gogle -a word=electronics
and you get the word "electronics" in parse() via self.word.
import scrapy
import re

class GogleSpider(scrapy.Spider):
    name = 'gogle'
    allowed_domains = ['google.co.in']
    start_urls = ['https://www.google.co.in/?gfe_rd=cr/']

    def __init__(self, word=None):
        super(GogleSpider, self).__init__()
        self.word = word

    def parse(self, response):
        string = response.xpath('//div[@class="sbqs_c"]/text()').extract()
        string = ''.join(string)
        print(len(re.findall(self.word, string.lower())))
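One small hardening worth considering (not part of the original answer): the word comes straight from the command line, so escaping it keeps regex metacharacters from being interpreted, and lower-casing both sides makes the count case-insensitive.
    # drop-in for the last line of parse() above
    print(len(re.findall(re.escape(self.word.lower()), string.lower())))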
