I have written a spider to crawl a website.
I am able to generate all the page URLs (pagination).
I need help to crawl all these pages and then print the response.
import csv

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import Spider

url_string = "http://website.com/ct-50658/page-"

class SpiderName(Spider):
    name = "website"
    allowed_domains = ["website.com"]
    start_urls = ["http://website.com/page-2"]

    def printer(self, response):
        hxs = HtmlXPathSelector(response)
        x = hxs.select("//span/a/@title").extract()
        with open('website.csv', 'wb') as csvfile:
            spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in x:
                spamwriter.writerow(i)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #sel = Selector(response)
        pages = hxs.select("//div[@id='srchpagination']/a/@href").extract()
        total_pages = int(pages[-2][-2:])
        j = 0
        url_list = []
        while (j < total_pages):
            j = j + 1
            urls = url_string + str(j)
            url_list.append(urls)
        for one_url in url_list:
            request = Request(one_url, callback=self.printer)
        return request
You're recreating the 'website.csv' file for every one_url Request's response. You should probably create it once (in __init__, for example) and save a CSV writer reference to it in an attribute of your spider, referencing something like self.csvwriter in printer.
Also, in the for one_url in url_list: loop, you should use yield Request(one_url, callback=self.printer); as written, you're returning only the last Request.
Here's a sample spider with these modifications and some code simplifications:
class SpiderName(Spider):
    name = "website"
    allowed_domains = ["website.com"]
    start_urls = ["http://website.com/page-2"]

    def __init__(self, category=None, *args, **kwargs):
        super(SpiderName, self).__init__(*args, **kwargs)
        self.spamwriter = csv.writer(open('website.csv', 'wb'),
                                     delimiter=' ',
                                     quotechar='|',
                                     quoting=csv.QUOTE_MINIMAL)

    def printer(self, response):
        hxs = HtmlXPathSelector(response)
        for i in hxs.select("//span/a/@title").extract():
            # wrap the value in a list so the whole title goes into one column
            self.spamwriter.writerow([i])

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #sel = Selector(response)
        pages = hxs.select("//div[@id='srchpagination']/a/@href").extract()
        total_pages = int(pages[-2][-2:])
        for j in range(1, total_pages + 1):
            yield Request(url_string + str(j), callback=self.printer)
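As an aside (not part of the original answer): if printer yields items instead of driving csv.writer by hand, Scrapy's built-in feed exports can write the CSV for you. A minimal sketch of that variant:

    def printer(self, response):
        hxs = HtmlXPathSelector(response)
        for title in hxs.select("//span/a/@title").extract():
            # each yielded dict becomes one row in the exported feed
            yield {'title': title}

Then run the spider with scrapy crawl website -o website.csv and no manual file handling is needed.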
I wanted to create a wrapper function over pandas.read_csv to change the default separator and format the file a specific way. This is the code I had:
def custom_read(path, sep="|", **kwargs):
    if not kwargs.get("chunksize", False):
        df_ = pd.read_csv(path, sep=sep, **kwargs)
        return format_df(df_, path)
    else:
        with pd.read_csv(path, sep=sep, **kwargs) as reader:
            return (format_df(chunk, path) for chunk in reader)
It turns out that this segfaults when used like so:
L = [chunk.iloc[:10, :] for chunk in custom_read(my_file)]
From what I understood of the backtrace, the generator is created, then the file is closed, and the segfault happens when the generator tries to read from the now-closed file.
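The same lifetime problem can be reproduced with a plain file object (illustration only, hypothetical file name): returning a generator expression from inside a with block means the file is closed as soon as the function returns, before the generator is ever consumed.

def lines_upper(path):
    with open(path) as f:
        # the with block exits (and closes f) when this return executes,
        # long before anyone iterates over the generator
        return (line.upper() for line in f)

gen = lines_upper("some_file.txt")  # the file is already closed here
next(gen)  # raises "ValueError: I/O operation on closed file" (pandas segfaults instead)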
I could avoid the segfault with a minor refactoring:
def custom_read(path, sep="|", **kwargs):
    if not kwargs.get("chunksize", False):
        df_ = pd.read_csv(path, sep=sep, **kwargs)
        return format_df(df_, path)
    else:
        reader = pd.read_csv(path, sep=sep, **kwargs)
        return (format_df(chunk, path) for chunk in reader)
I couldn't find anything on this particular use case of generators inside with clauses. Is it something to avoid? Is this supposed not to work, or is this a bug of some kind?
Is there a way to avoid this error but still use the encouraged with statement?
You could use a generator which keeps the file open. See the following example:
import os
def lines_format(lines):
    return "\n".join(f"*{line.strip()}*" for line in lines)

def chunk_gen(file, chunksize):
    with open(file, mode='r') as f:
        while True:
            lines = f.readlines(chunksize)
            if not lines:
                break
            yield lines_format(lines)

def get_formatted_pages(file, chunksize=0):
    if chunksize > 0:
        return chunk_gen(file, chunksize)
    else:
        with open(file, mode='r') as f:
            lines = f.readlines()
            return [lines_format(lines)]

with open("abc.txt", mode='w') as f:
    f.write(os.linesep.join('abc'))

pages = get_formatted_pages("abc.txt")
for i, page in enumerate(pages, start=1):
    print(f"Page {i}")
    print(page)

pages = get_formatted_pages("abc.txt", chunksize=2)
for i, page in enumerate(pages, start=1):
    print(f"Page {i}")
    print(page)
Edit:
In your pandas.read_csv use case, this would look like:
import pandas as pd

df = pd.DataFrame({'char': list('abc'), "num": range(3)})
df.to_csv('abc.csv')

def gen_chunk(file, chunksize):
    with pd.read_csv(file, chunksize=chunksize, index_col=0) as reader:
        for chunk in reader:
            yield format_df(chunk)

def format_df(df):
    # do something
    df['char'] = df['char'].str.capitalize()
    return df

def get_formatted_pages(file, chunksize=0):
    if chunksize > 0:
        return gen_chunk(file, chunksize)
    else:
        return [format_df(pd.read_csv(file, index_col=0))]

list(get_formatted_pages('abc.csv', chunksize=2))
I am scraping multiple pages with Scrapy. It is working fine, but I am getting 2 dictionaries in my output; instead, I would like to get the result of both pages in one output row.
In this particular case, I want to return the output of the get_image function from the second page back with the rest of the data (artist and album), but I don't know how to feed that information back to the main dictionary.
Thanks!
import scrapy

class OsmoseSpider(scrapy.Spider):
    name = "osmose"

    def start_requests(self):
        baseurl = 'https://www.osmoseproductions.com/liste/?lng=2&categ_rech=0&alpha=0&fmt=990001&srt=2&page='
        urls = []
        for x in range(1, 2):
            urls.append(baseurl + str(x))
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def get_image(self, response):
        for im in response.xpath('//*[@id="img_product_page_osmose"]/img[@id="id_photo_principale"]/@src').getall():
            yield {'image': im}

    def parse(self, response):
        artist, album, link, images = [], [], [], []
        for a in response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/span[1]/text()').getall():
            artist.append(a)
        for b in response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/span[2]/text()').getall():
            album.append(b)
        for l in response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/@href').getall():
            link.append(l)
        for x in link:
            next_page = x
            if next_page is not None:
                yield response.follow(next_page, callback=self.get_image)
        for i, j in zip(artist, album):
            yield {'artist': i,
                   'album': j,
                   }
        page = response.url.split("/")[-2]
        filename = 'osmose-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
I'd pass the extra data along in meta. Check this example:
def parse(self, response):
    artists = response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/span[1]/text()').getall()
    albums = response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/span[2]/text()').getall()
    links = response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/@href').getall()

    for artist, album, link in zip(artists, albums, links):
        if not link:
            continue
        yield response.follow(link, self.get_image, meta={'artist': artist, 'album': album})

def get_image(self, response):
    artist = response.meta['artist']
    album = response.meta['album']
    for im in response.xpath('//*[@id="img_product_page_osmose"]/img[@id="id_photo_principale"]/@src').getall():
        yield {'image': im, 'album': album, 'artist': artist}
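As a side note (not part of the original answer): newer Scrapy versions also support cb_kwargs, which delivers the extra data as callback arguments rather than through response.meta. A sketch of the same idea, reusing the artists/albums/links lists from above:

def parse(self, response):
    # same artists/albums/links extraction as above
    for artist, album, link in zip(artists, albums, links):
        if not link:
            continue
        yield response.follow(link, self.get_image,
                              cb_kwargs={'artist': artist, 'album': album})

def get_image(self, response, artist, album):
    # artist and album arrive directly as keyword arguments
    for im in response.xpath('//*[@id="img_product_page_osmose"]/img[@id="id_photo_principale"]/@src').getall():
        yield {'image': im, 'album': album, 'artist': artist}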
The answer to this question was quite difficult to find, since the information is scattered and question titles are sometimes misleading. The answer below gathers everything needed in one place.
Your spider should look like this:
# based on https://doc.scrapy.org/en/latest/intro/tutorial.html
import scrapy
import requests

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            print('\n\nurl:', url)
            ## use one of the yields below
            # middleware will process the request
            yield scrapy.Request(url=url, callback=self.parse)
            # check if Tor has changed IP
            #yield scrapy.Request('http://icanhazip.com/', callback=self.is_tor_and_privoxy_used)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        print('\n\nSpider: Start')
        print('Is proxy in response.meta?: ', response.meta)
        print("user_agent is: ", response.request.headers['User-Agent'])
        print('\n\nSpider: End')
        self.log('Saved file --- %s' % filename)

    def is_tor_and_privoxy_used(self, response):
        print('\n\nSpider: Start')
        print("My IP is: " + str(response.body))
        print("Is proxy in response.meta?: ", response.meta)  # header not available here
        print('\n\nSpider: End')
You will also need to add a few things in middlewares.py and settings.py. If you don't know how to do that, this will help you.
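For reference, and as an assumption about the setup being described rather than its exact code: the usual approach is a small downloader middleware that routes every request through a local Privoxy instance forwarding to Tor, enabled in settings.py. A sketch, assuming Privoxy listens on its default 127.0.0.1:8118 and the project package is named myproject:

# middlewares.py -- sketch; Privoxy must be running and forwarding to Tor
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # HttpProxyMiddleware picks this up and sends the request through the proxy
        request.meta['proxy'] = 'http://127.0.0.1:8118'

# settings.py -- enable the custom middleware (the number is its priority)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 400,
}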
How to scrape any site, search for a given word, and display how many times it occurred?
import scrapy

class LinkedinScraper(scrapy.Spider):
    name = "linked"

    def start_requests(self):
        urls = ['https://www.linkedin.com/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'linkedin-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
You can use a regex with response.body to find all occurrences anywhere on the page, e.g.:
import re
r = re.findall('\\bcat\\b', "cat catalog cattering")
print(len(r), 'cat(s)')
Gives "1 cat(s)", not "3 cat(s)"
If you need word only in some tags then you use first response.css(), response.xpath(), etc.
EDIT:
An example which shows how to use
re.findall(pattern, response.text)
but note that it can also find text inside tags and attributes.
It also shows how to use
response.css('body').re(pattern)
It counts 'view', '\\bviews\\b' and '\d+ views' on Stack Overflow and displays the first three matches.
You can run it without creating a project.
import scrapy
import re

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://stackoverflow.com/']

    def parse(self, response):
        print('url:', response.url)
        for pattern in ['view', '\\bviews\\b', '\d+ views']:
            print('>>> pattern:', pattern)
            result = re.findall(pattern, response.text)
            print('>>> re:', len(result), result[0:3])
            result = response.css('body').re(pattern)
            print('>>> response.re:', len(result), result[0:3])

# --- it runs without creating a project ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
c.crawl(MySpider)
c.start()
I recently discovered Scrapy and I want to write a spider that gets URLs of PDF files from my database, downloads the files, and removes each record. The issue is that my database gets new records irregularly. That's why I want to run my crawler as a task every 6 hours.
Any ideas how I can accomplish that?
Here's some code:
import os

import scrapy

# PDFDownloadQueue (a Django model) and OUTPUT_PATH come from the surrounding project
class PDFSpider(scrapy.Spider):
    name = "pdf"

    def __init__(self):
        self.lastUrl = None

    def start_requests(self):
        # get urls from database using django models
        for url in PDFDownloadQueue.objects.all():
            self.lastUrl = url
            yield scrapy.Request(url=url.url, callback=self.parse)

    def parse(self, response):
        # write the http response body to an html file
        filename = response.url.split("/")[-1]
        output = os.path.join(OUTPUT_PATH, filename)
        with open(output, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        print("Parsed {}".format(self.lastUrl))