Scrapy TabError: inconsistent use of tabs and space in identation - python-3.x

import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
urls = [
'http://quotes.toscrape.com/page/1/',
'http://quotes.toscrape.com/page/2/'
]
for url in urls:
yield scrapy.Requests(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
when i execute this file with "scrapy crawl quotes", i got error in line 7 which marks my "[" and says: TabError: inconsistent use of tabs and spaces in identation
but i wonder why, because i assume that i correctly "copied" the code from a video, and i tested several identations.
Currently im working on 18.04 ubuntu lts server

Related

Runtime Request URL change not working scrapy

I have written a script in Python using Scrapy. The code runs to fetch all the pages that exist containing the code. It works fine on the first page load when scrapy is started and as per the script logic gets us page no. 2. But after loading page 2 I am unable to get xpath of the new page loaded so I can move ahead this way and get all the web-page numbers.
Sharing the code snippet.
import scrapy
from scrapy import Spider
class PostsSpider(Spider):
name = "posts"
start_urls = [
'https://www.boston.com/category/news/'
]
def parse(self, response):
print("first time")
print(response)
results = response.xpath("//*[contains(#id, 'load-more')]/#data-next-page").extract_first()
print(results)
if results is not None:
for result in results:
page_number = 'page/' + result
new_url = self.start_urls[0] + page_number
print(new_url)
yield scrapy.Request(url=new_url, callback=self.parse)
else:
print("last page")
It is because the page doesn't create new get requests when it loads the next page, it makes an ajax call to an api that returns json.
I made some adjustments to your code so it should work properly now. I am assuming that there is something other than the next page number you are trying to extract from each page, so I wrapped the html string into a scrapy.Slector class so you can use Xpath and such on it. This script will crawl alot of pages really fast, so you might want to adjust your settings to slow it down too.
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
class PostsSpider(Spider):
name = "posts"
ajaxurl = "https://www.boston.com/wp-json/boston/v1/load-more?taxonomy=category&term_id=779&search_query=&author=&orderby=&page=%s&_wpnonce=f43ab1aae4&ad_count=4&redundant_ids=25129871,25130264,25129873,25129799,25128140,25126233,25122755,25121853,25124456,25129584,25128656,25123311,25128423,25128100,25127934,25127250,25126228,25126222"
start_urls = [
'https://www.boston.com/category/news/'
]
def parse(self, response):
new_url = None
try:
json_result = response.json()
html = json_result['data']['html']
selector = Selector(text=html, type="html")
# ... do some xpath stuff with selector.xpath.....
new_url = self.ajaxurl % json_result["data"]["nextPage"]
except:
results = response.xpath("//*[contains(#id, 'load-more')]/#data-next-page").extract_first()
if results is not None:
for result in results:
new_url = self.ajaxurl % result
if new_url:
print(new_url)
yield scrapy.Request(url=new_url, callback=self.parse)

response.body is returning an empty file using scrapy in python

I am trying to make a web crawler, using scrapy from python, that extracts the information that google shows in the right side when you make a search
The URL I am using is: https://www.google.com/search?q=la%20cuarta
In this other question I asked the same (question), someone suggested me to write response.body to a file, but I am getting an empty file, when I tried a different URL this does not happend
This is my code:
import scrapy
class google1(scrapy.Spider):
name = 'google1'
def start_requests(self):
urls = ['http://quotes.toscrape.com/page/1/',
'http://quotes.toscrape.com/page/2/',
'https://www.google.com/search?q=la%20cuarta',
'https://docs.scrapy.org/en/latest/intro/tutorial.html']
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'page-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
It does not even write the file from the google search, but in the scrapy shell response.body is not empty
Ok, I tested your code and it works, that is the spiders downloads all the pages, including the googel page.
Your problem might be in the settings, add these to your settings:
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0'
ROBOTSTXT_OBEY = False

how to scrape anonymously using Scrapy Tor Privoxy & UserAgent? (Windows 10)

The answer of this question was quite difficult to find since informations are scattered, and the title of the questions are sometime misleading. The answer below regroup all informations needed in one place.
Your spider should look like.
# based on https://doc.scrapy.org/en/latest/intro/tutorial.html
import scrapy
import requests
class QuotesSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
urls = [
'http://quotes.toscrape.com/page/1/',
'http://quotes.toscrape.com/page/2/',
]
for url in urls:
print('\n\nurl:', url)
## use one of the yield below
# middleware will process the request
yield scrapy.Request(url=url, callback=self.parse)
# check if Tor has changed IP
#yield scrapy.Request('http://icanhazip.com/', callback=self.is_tor_and_privoxy_used)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
print('\n\nSpider: Start')
print('Is proxy in response.meta?: ', response.meta)
print ("user_agent is: ",response.request.headers['User-Agent'])
print('\n\n Spider: End')
self.log('Saved file --- %s' % filename)
def is_tor_and_privoxy_used(self, response):
print('\n\nSpider: Start')
print("My IP is : " + str(response.body))
print("Is proxy in response.meta?: ", response.meta) # not header dispo
print('\n\nSpider: End')
self.log('Saved file %s' % filename)
You will also need to add stuff in middleware.py and settings.py . If you don't know how to do it this will help you

Count word on the page

How to scrap any site and search for the given word and displays how many times it occurred
class LinkedinScraper(scrapy.Spider):
name = "linked"
def start_requests(self):
urls = ['https://www.linkedin.com/']
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'linkedin.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
You can use regex with response.body to find all occurrances in any places
ie.
import re
r = re.findall('\\bcat\\b', "cat catalog cattering")
print(len(r), 'cat(s)')
Gives "1 cat(s)", not "3 cat(s)"
If you need word only in some tags then you use first response.css(), response.xpath(), etc.
EDIT:
Example which shows how to use
re.findall(pattern, response.text)
but it can find text inside tag too.
It also shows how to use
response.css('body').re(pattern)
It counts 'view', '\\bviews\\b' and '\d+ views' on Stackoverflow and display first three elements
You can run it without creating project.
import scrapy
import re
class MySpider(scrapy.Spider):
name = 'myspider'
start_urls = ['https://stackoverflow.com/']
def parse(self, response):
print('url:', response.url)
for pattern in ['view', '\\bviews\\b', '\d+ views']:
print('>>> pattern:', pattern)
result = re.findall(pattern, response.text)
print('>>> re:', len(result), result[0:3])
result = response.css('body').re(pattern)
print('>>> response.re:', len(result), result[0:3])
# --- it runs without project and saves in `output.csv` ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
c.crawl(MySpider)
c.start()

Scrapy as a task

I recently discovered scrapy and I want to wrote a spider to get urls from my database containing pdf files and download them and remove the record. The issue is that my database will get new records irregularly. That's why I want to run my Crawler as as task every 6 hours.
Any ideas how I can accomplish that?
Here's some code
class PDFSpider(scrapy.Spider):
name = "pdf"
def __init__(self):
self.lastUrl = None
def start_requests(self):
# get urls from database using django models
for url in PDFDownloadQueue.objects.all():
self.lastUrl = url
yield scrapy.Request(url=url.url, callback=self.parse)
def parse(self, response):
# write httpresponse as a html file
filename = response.url.split("/")[-1]
output = os.path.join(OUTPUT_PATH,filename)
with open(output, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
print("Parsed {}".format(self.lastUrl))

Resources