How to scrape anonymously using Scrapy, Tor, Privoxy & UserAgent? (Windows 10) - python-3.x

The answer to this question was quite difficult to find because the information is scattered and the titles of related questions are sometimes misleading. The answer below gathers all the information needed in one place.

Your spider should look like this:
# based on https://doc.scrapy.org/en/latest/intro/tutorial.html
import scrapy
import requests
class QuotesSpider(scrapy.Spider):
    """Tutorial spider used to check that requests go out through the proxy.

    By default it downloads two pages of quotes.toscrape.com and prints the
    request metadata and User-Agent so the effect of the downloader
    middleware can be inspected. Swapping the ``yield`` in
    ``start_requests`` lets it query icanhazip.com instead, to see which IP
    address the remote side observes.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per start URL."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            print('\n\nurl:', url)
            ## use one of the yield below
            # middleware will process the request
            yield scrapy.Request(url=url, callback=self.parse)
            # check if Tor has changed IP
            #yield scrapy.Request('http://icanhazip.com/', callback=self.is_tor_and_privoxy_used)

    def parse(self, response):
        """Save the page body to ``quotes-<page>.html`` and print debug info."""
        # The URL ends with ".../page/<n>/", so [-2] is the page number.
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        print('\n\nSpider: Start')
        print('Is proxy in response.meta?: ', response.meta)
        print("user_agent is: ", response.request.headers['User-Agent'])
        print('\n\n Spider: End')
        self.log('Saved file --- %s' % filename)

    def is_tor_and_privoxy_used(self, response):
        """Print the IP address reported by the remote service."""
        print('\n\nSpider: Start')
        print("My IP is : " + str(response.body))
        print("Is proxy in response.meta?: ", response.meta)  # header not available here
        print('\n\nSpider: End')
        # Nothing is written to disk in this callback, so there is no
        # filename to report (the original logged an undefined `filename`,
        # which raised NameError).
        self.log('Checked outgoing IP via %s' % response.url)
You will also need to add settings in middlewares.py and settings.py. If you don't know how to do that, this will help you.

Related

Scrapy TabError: inconsistent use of tabs and spaces in indentation

import scrapy
class QuotesSpider(scrapy.Spider):
    """Download two pages of quotes.toscrape.com and save each one to disk.

    The body is indented with four spaces throughout; mixing tabs and spaces
    inside one block is what makes Python raise ``TabError``.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per start URL."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/'
        ]
        for url in urls:
            # The class is `scrapy.Request` (singular); `scrapy.Requests`
            # does not exist and raises AttributeError.
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the response body to ``quotes-<page>.html``."""
        # The URL ends with ".../page/<n>/", so [-2] is the page number.
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
When I execute this file with "scrapy crawl quotes", I get an error on line 7, which points at my "[" and says: TabError: inconsistent use of tabs and spaces in indentation
But I wonder why, because I assume that I correctly copied the code from a video, and I tested several indentations.
Currently I'm working on an Ubuntu 18.04 LTS server.

AttributeError: 'bytes' object has no attribute 'get'

i am trying to extract all the brand names from https://www.gizbot.com/mobile-brands-in-india/.
Below is the code for mobiles_spiders.py file
# Spider from the question: requests the brand listing page and tries to write
# the text of the brand <div> elements to a local file.
class MobilesSpider(scrapy.Spider):
name = "mobiles"
# Yields one request per URL in the list, handled by parse().
def start_requests(self):
urls = [
'https://www.gizbot.com/mobile-brands-in-india/',
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
# Builds the output filename from the URL and writes the extracted text to it.
def parse(self, response):
# Second-to-last "/"-separated piece of the URL.
page = response.url.split("/")[-2]
filename = 'mobiles-%s.html' % page
with open(filename, 'wb') as f:
# NOTE: .get() is chained onto the bytes object returned by str.encode(...),
# not onto the result of response.xpath(...); bytes has no .get() method,
# which is the AttributeError quoted in the question.
f.write(response.xpath(str.encode('.//div[has-class("all-brands-block-desc-brand")]/text()').get()))
self.log('Saved file %s' % filename)
but the code is giving me error as
AttributeError: 'bytes' object has no attribute 'get'
I need a suggestion as to which function to use instead of get() to extract all the div elements that contain a brand name.
any help is appreciated.
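This may help you: call the selector method on the result of response.xpath(...), and encode the extracted text afterwards.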
import scrapy
class MobilesSpider(scrapy.Spider):
    """Save the brand names listed on the gizbot brands page to a file."""

    name = "mobiles"

    def start_requests(self):
        """Yield one request per start URL."""
        urls = [
            'https://www.gizbot.com/mobile-brands-in-india/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write every matched brand text node, one per line, to a file."""
        # Second-to-last "/"-separated piece of the URL.
        page = response.url.split("/")[-2]
        filename = 'mobiles-%s.html' % page
        # getall() returns a list of *all* matches (empty when nothing
        # matches). get() would return only the first match, or None, on
        # which .encode() would fail.
        brands = response.xpath(
            './/div[has-class("all-brands-block-desc-brand")]/text()'
        ).getall()
        with open(filename, 'wb') as f:
            # The selector yields str; encode once when writing in binary mode.
            f.write('\n'.join(brands).encode('utf-8'))
        self.log('Saved file %s' % filename)

Scrapy with multiple pages

I have created a simple Scrapy project in which I get the total page count from the initial page, example.com/full. Now I need to scrape all the pages from example.com/page-2 up to example.com/page-100 (if the total page count is 100). How can I do that?
Any advice would be helpful.
Code:
import scrapy
class AllSpider(scrapy.Spider):
    """Read the pager text from the start page and print it."""

    name = 'all'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/full/']
    # Class-level attribute; parse() below assigns a local of the same name
    # and never updates this one.
    total_pages = 0

    def parse(self, response):
        """Print the text of the sixth pager item's link."""
        pager_text = response.xpath(
            "//body/section/div/section/div/div/ul/li[6]/a/text()"
        )
        total_pages = pager_text.extract_first()
        #urls = ('https://example.com/page-{}'.format(i) for i in range(1,total_pages))
        print(total_pages)
Update #1:
I tried using urls = ('https://example.com/page-{}'.format(i) for i in range(1,total_pages)), but it's not working; maybe I'm doing something wrong.
Update #2:
I have changed my code like this one
class AllSpider(scrapy.Spider):
    """Second attempt from the question: request pages 2..N and print titles."""

    name = 'all'
    allowed_domains = ['sanet.st']
    start_urls = ['https://sanet.st/full/']
    # Class-level attribute; parse() below assigns a local of the same name
    # and never updates this one.
    total_pages = 0

    def parse(self, response):
        """Schedule the numbered pages and print titles found in `response`."""
        total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
        for page in range(2, int(total_pages)):
            url = 'https://sanet.st/page-'+str(page)
            yield scrapy.Request(url)
            # NOTE(review): the snippet's indentation was lost; these two
            # lines are placed inside the loop because that matches the
            # symptom described (the same titles printed repeatedly) -
            # confirm against the original post.
            # XPath attribute tests use "@"; "#class" is not valid XPath.
            title = response.xpath('//*[@class="list_item_title"]/h2/a/span/text()').extract()
            print(title)
But the loop still shows only the first page's titles, repeatedly.
I need to extract the titles from the different pages and print them in the prompt.
How can I do that?
You must search for the 'next_page' object and continue to loop while it is on the page.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class SanetSpider(scrapy.Spider):
    """Walk the listing by following the "Next page" link on every page."""

    name = 'sanet'
    allowed_domains = ['sanet.st']
    start_urls = ['https://sanet.st/full/']

    def parse(self, response):
        """Yield one item for this page, then a request for the next page."""
        yield {
            # Do something.
            # XPath attribute tests use "@" ("#class" is not valid XPath).
            'result': response.xpath('//h3[@class="posts-results"]/text()').extract_first()
        }
        # next_page = /page-{}/ where {} number of page.
        next_page = response.xpath('//a[@data-tip="Next page"]/@href').extract_first()
        # Test the raw href *before* joining: urljoin() with an empty value
        # returns the current URL, which would make the check always true.
        if next_page:
            # next_page = https://sanet.st/page-{}/ where {} number of page.
            next_page = response.urljoin(next_page)
            # Recall parse with url https://sanet.st/page-{}/ where {} number of page.
            yield scrapy.Request(url=next_page, callback=self.parse)
If you run this code with the "-o sanet.json" option, you will get the following result.
scrapy runspider sanet.py -o sanet.json
[
{"result": "results 1 - 15 from 651"},
{"result": "results 16 - 30 from 651"},
{"result": "results 31 - 45 from 651"},
...
etc.
...
{"result": "results 631 - 645 from 651"},
{"result": "results 646 - 651 from 651"}
]
from scrapy.http import Request
def parse(self, response):
    """Read the total page count and schedule pages 2..N for parse_page."""
    total_pages = response.xpath("//body/section/div/section/div/div/ul/li[6]/a/text()").extract_first()
    if total_pages is None:
        # The pager link was not found; nothing to schedule.
        return
    # extract_first() returns text, and range() needs an int.
    last_page = int(total_pages)
    # Page 1 is the page already being parsed; include the last page.
    for page_number in range(2, last_page + 1):
        url = 'https://example.com/page-{}'.format(page_number)
        yield Request(url, callback=self.parse_page)


def parse_page(self, response):
    """Handle one numbered page (placeholder)."""
    # do the stuff
    pass
An alternative, as shown in the tutorial, is to use yield response.follow(url, callback=self.parse_page), which supports relative URLs directly.

Count word on the page

How can I scrape any site, search for a given word, and display how many times it occurs?
class LinkedinScraper(scrapy.Spider):
    """Download the LinkedIn front page and save it as ``linkedin.html``."""

    name = "linked"

    def start_requests(self):
        """Yield one request per start URL."""
        urls = ['https://www.linkedin.com/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the response body to ``linkedin.html``."""
        # The filename has no placeholder, so applying "% page" to it raised
        # TypeError ("not all arguments converted"); use the constant name.
        filename = 'linkedin.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
You can use a regex on response.text to find all occurrences anywhere in the page,
for example:
import re
# \b matches only at word boundaries, so "cat" inside "catalog" or
# "cattering" is not counted.
whole_word = re.compile('\\bcat\\b')
r = whole_word.findall("cat catalog cattering")
print(len(r), 'cat(s)')
Gives "1 cat(s)", not "3 cat(s)"
If you need word only in some tags then you use first response.css(), response.xpath(), etc.
EDIT:
Example which shows how to use
re.findall(pattern, response.text)
but it can find text inside tag too.
It also shows how to use
response.css('body').re(pattern)
It counts 'view', '\\bviews\\b' and '\d+ views' on Stackoverflow and display first three elements
You can run it without creating project.
import scrapy
import re
class MySpider(scrapy.Spider):
    """Count three regex patterns on the Stack Overflow front page."""

    name = 'myspider'
    start_urls = ['https://stackoverflow.com/']

    def parse(self, response):
        """Print match counts and the first three matches per pattern."""
        print('url:', response.url)
        # Raw strings pass \b and \d to the regex engine unchanged; in a
        # normal literal "\d" is an invalid escape sequence and triggers a
        # warning on recent Python versions.
        for pattern in ['view', r'\bviews\b', r'\d+ views']:
            print('>>> pattern:', pattern)
            # Search the whole decoded page source, markup included.
            result = re.findall(pattern, response.text)
            print('>>> re:', len(result), result[0:3])
            # Same pattern applied through the selector for <body>.
            result = response.css('body').re(pattern)
            print('>>> response.re:', len(result), result[0:3])
# --- run the spider straight from this script, no Scrapy project needed ---
# NOTE(review): no feed export is configured below, so nothing is written
# to `output.csv` as-is.
from scrapy.crawler import CrawlerProcess

crawler_settings = {'USER_AGENT': 'Mozilla/5.0'}
process = CrawlerProcess(crawler_settings)
process.crawl(MySpider)
process.start()

Scrapy as a task

I recently discovered Scrapy and I want to write a spider that gets the URLs of PDF files from my database, downloads them, and removes the records. The issue is that my database gets new records irregularly. That's why I want to run my crawler as a task every 6 hours.
Any ideas how I can accomplish that?
Here's some code
class PDFSpider(scrapy.Spider):
    """Download every URL queued in ``PDFDownloadQueue`` into ``OUTPUT_PATH``."""

    name = "pdf"

    def __init__(self, *args, **kwargs):
        # Let the base Spider run its own initialisation and accept any
        # spider arguments passed on construction.
        super().__init__(*args, **kwargs)
        # Most recently scheduled queue record (not necessarily the one
        # whose response is currently being parsed).
        self.lastUrl = None

    def start_requests(self):
        """Yield one request per queued record."""
        # get urls from database using django models
        for url in PDFDownloadQueue.objects.all():
            self.lastUrl = url
            # Carry the record along with its own request so parse() can
            # identify exactly which record a response belongs to.
            yield scrapy.Request(
                url=url.url,
                callback=self.parse,
                meta={'queue_item': url},
            )

    def parse(self, response):
        """Write the response body to a file named after the URL's last segment."""
        filename = response.url.split("/")[-1]
        output = os.path.join(OUTPUT_PATH, filename)
        with open(output, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        # Report the record attached to *this* response. self.lastUrl only
        # tracks the latest scheduled record and can point at a different one.
        print("Parsed {}".format(response.meta['queue_item']))

Resources