I am trying to extract all the brand names from https://www.gizbot.com/mobile-brands-in-india/.
Below is the code for the mobiles_spiders.py file:
class MobilesSpider(scrapy.Spider):
    """Spider from the question: requests the gizbot mobile-brands page and
    tries to write the brand-name text to a local file."""

    name = "mobiles"

    def start_requests(self):
        """Yield one request per start URL, each handled by ``parse``."""
        urls = [
            'https://www.gizbot.com/mobile-brands-in-india/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Try to write the brand ``div`` text to ``mobiles-<page>.html``."""
        # Second-to-last "/"-separated piece of the URL is used in the file name.
        page = response.url.split("/")[-2]
        filename = 'mobiles-%s.html' % page
        with open(filename, 'wb') as f:
            # NOTE: kept exactly as posted. str.encode(...) returns bytes and
            # .get() is then called on those bytes (before xpath() ever runs),
            # which raises the AttributeError this question is about.
            f.write(response.xpath(str.encode('.//div[has-class("all-brands-block-desc-brand")]/text()').get()))
        self.log('Saved file %s' % filename)
But the code gives me this error:
AttributeError: 'bytes' object has no attribute 'get'
I need a suggestion as to which function I should use instead of get() to extract all the div elements that contain a brand name.
Any help is appreciated.
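This may help you. Call the extraction method on the result of response.xpath() first, and only then encode the extracted text: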
import scrapy
class MobilesSpider(scrapy.Spider):
    """Save every brand name found on the gizbot mobile-brands page to a file."""

    name = "mobiles"

    def start_requests(self):
        """Yield one request per start URL, each handled by ``parse``."""
        urls = [
            'https://www.gizbot.com/mobile-brands-in-india/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write all brand names, one per line, to ``mobiles-<page>.html``."""
        page = response.url.split("/")[-2]
        filename = 'mobiles-%s.html' % page
        # Run the XPath on the response first; getall() returns every matching
        # text node as a list of str (an empty list when nothing matches), so
        # all brands are captured and there is no None to call .encode() on.
        brands = response.xpath(
            './/div[has-class("all-brands-block-desc-brand")]/text()'
        ).getall()
        with open(filename, 'wb') as f:
            # The file is opened in binary mode, so encode once at the boundary.
            f.write('\n'.join(brands).encode('utf-8'))
        self.log('Saved file %s' % filename)
Related
I'm building my first web scraper. I'm simply trying to get a list of names and append them to a CSV file. The scraper seems to work, but not as intended: the output file contains only one name, which is always the last name scraped. It's a different name every time I rerun the scraper. In this case the name written to the CSV file was Ola Aina.
#Create the spider class
class premSpider(scrapy.Spider):
    """Crawl Premier League player pages and write player names to a CSV file."""

    name = "premSpider"

    def start_requests(self):
        """Yield the initial request(s) for the player index page."""
        # List of URLs we wish to scrape
        urls = ['https://www.premierleague.com/players']
        # Iterate through each URL and send it to be parsed
        for url in urls:
            # yield hands each request to Scrapy without ending the method
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Follow every link found inside a table row to a player page."""
        # extract links to player pages
        plinks = response.xpath('//tr').css('a::attr(href)').extract()
        # follow links to specific player pages
        for plink in plinks:
            yield response.follow(url=plink, callback=self.parse2)

    def parse2(self, response):
        """Follow the player's "stats" link(s)."""
        # XPath attribute tests use "@": [@href="stats"]
        plinks2 = response.xpath('//a[@href="stats"]').css('a::attr(href)').extract()
        for link2 in plinks2:
            yield response.follow(url=link2, callback=self.parse3)

    def parse3(self, response):
        """Write the name text found on the stats page to ``playerlinks.csv``."""
        names = response.xpath('//div[@class="name t-colour"]/text()').extract()
        filepath = 'playerlinks.csv'
        # NOTE: left as posted. Mode 'w' truncates the file each time parse3
        # runs, so only the names from the most recently parsed page remain;
        # this is the behaviour the question describes.
        with open(filepath, 'w') as f:
            f.writelines([name + '\n' for name in names])
# Run the spider from this script using a CrawlerProcess with default settings.
crawler_process = CrawlerProcess()
crawler_process.crawl(premSpider)
crawler_process.start()
You could also use Scrapy's own "FEEDS" export.
Add this just below your spider name:
# Per-spider settings: the FEEDS entry names the output file and its format.
custom_settings = {'FEEDS': {'results1.csv': {'format': 'csv'}}}
And modify parse3 to read as below:
def parse3(self, response):
    """Yield the player name as an item instead of writing the file by hand."""
    # XPath attribute tests use "@": [@class="..."].
    # .get() returns the first matching text node, or None when nothing matches.
    names = response.xpath('.//div[@class="name t-colour"]/text()').get()
    yield {'names': names}
I am scraping multiple pages with Scrapy. It is working fine, but I am getting two dictionaries in my output; instead, I would like to get the results of both pages in one output row.
In this particular case, I want to return the output of the get_image function from the second page together with the rest of the data (artist and album), but I don't know how to feed that information back into the main dictionary.
Thanks!
import scrapy
class OsmoseSpider(scrapy.Spider):
    """Scrape artist/album listings from osmoseproductions.com and, from each
    linked product page, the main image URL."""

    name = "osmose"

    def start_requests(self):
        """Yield one request per listing page."""
        baseurl = 'https://www.osmoseproductions.com/liste/?lng=2&categ_rech=0&alpha=0&fmt=990001&srt=2&page='
        # range(1, 2) produces only page 1; widen the range to crawl more pages.
        for x in range(1, 2):
            yield scrapy.Request(url=baseurl + str(x), callback=self.parse)

    def get_image(self, response):
        """Yield one ``{'image': src}`` item per main product image."""
        # XPath attribute access uses "@": [@id="..."] and /@src.
        for im in response.xpath('//*[@id="img_product_page_osmose"]/img[@id="id_photo_principale"]/@src').getall():
            yield {'image': im}

    def parse(self, response):
        """Follow every product link, yield artist/album items, save the page.

        NOTE: image items (from ``get_image``) and artist/album items are
        yielded separately, which is the behaviour the question asks about;
        that is left as posted.
        """
        # All three queries share the same path down to the product anchor.
        anchor = '//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a'
        artist = response.xpath(anchor + '/span[1]/text()').getall()
        album = response.xpath(anchor + '/span[2]/text()').getall()
        link = response.xpath(anchor + '/@href').getall()

        for next_page in link:
            yield response.follow(next_page, callback=self.get_image)

        for i, j in zip(artist, album):
            yield {'artist': i,
                   'album': j,
                   }

        # Keep a raw copy of the listing page on disk.
        page = response.url.split("/")[-2]
        filename = 'osmose-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
I'd pass the extra values along in the request's meta. Check this example:
def parse(self, response):
    """Follow each product link, carrying its artist and album in ``meta``."""
    # XPath attribute access uses "@": [@id="..."] and /@href.
    artists = response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/span[1]/text()').getall()
    albums = response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/span[2]/text()').getall()
    links = response.xpath('//*[@id="paginCorpA1"]/div[*]/div/div[2]/div[1]/div[2]/a/@href').getall()
    # zip pairs the three lists by position and stops at the shortest one.
    for artist, album, link in zip(artists, albums, links):
        if not link:
            continue
        # meta travels with the request and is available again on its response.
        yield response.follow(link, self.get_image, meta={'artist': artist, 'album': album})
def get_image(self, response):
    """Yield one combined item per image: image URL plus album and artist."""
    # Values attached to the request's meta by the callback that scheduled it.
    artist = response.meta['artist']
    album = response.meta['album']
    # XPath attribute access uses "@": [@id="..."] and /@src.
    for im in response.xpath('//*[@id="img_product_page_osmose"]/img[@id="id_photo_principale"]/@src').getall():
        yield {'image': im, 'album': album, 'artist': artist}
The answer to this question was quite difficult to find, since the information is scattered and the titles of related questions are sometimes misleading. The answer below gathers all the information needed in one place.
Your spider should look like this:
# based on https://doc.scrapy.org/en/latest/intro/tutorial.html
import scrapy
import requests
class QuotesSpider(scrapy.Spider):
    """Tutorial-style spider with extra prints for checking proxy and
    user-agent handling on each response."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per quotes page."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            print('\n\nurl:', url)
            ## use one of the yields below
            # middleware will process the request
            yield scrapy.Request(url=url, callback=self.parse)
            # check if Tor has changed IP
            #yield scrapy.Request('http://icanhazip.com/', callback=self.is_tor_and_privoxy_used)

    def parse(self, response):
        """Save the page body to ``quotes-<page>.html`` and print request details."""
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        print('\n\nSpider: Start')
        print('Is proxy in response.meta?: ', response.meta)
        print("user_agent is: ", response.request.headers['User-Agent'])
        print('\n\n Spider: End')
        self.log('Saved file --- %s' % filename)

    def is_tor_and_privoxy_used(self, response):
        """Print the body returned by the IP-echo request and the response meta."""
        print('\n\nSpider: Start')
        print("My IP is : " + str(response.body))
        # original note: the proxy is not available in the headers
        print("Is proxy in response.meta?: ", response.meta)
        print('\n\nSpider: End')
        # No file is written in this callback. The 'Saved file' log line that
        # used to follow referenced an undefined ``filename`` (NameError), so
        # it has been removed.
You will also need to add code to middleware.py and settings.py. If you don't know how to do it, this will help you.
How to scrape any site, search for a given word, and display how many times it occurs
class LinkedinScraper(scrapy.Spider):
    """Download the LinkedIn front page and save it to ``linkedin.html``."""

    name = "linked"

    def start_requests(self):
        """Yield one request per start URL, each handled by ``parse``."""
        urls = ['https://www.linkedin.com/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the raw response body to ``linkedin.html``."""
        # The name has no %s placeholder, so applying "% page" to it raised
        # TypeError ("not all arguments converted"); use the fixed name as-is.
        filename = 'linkedin.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
You can use a regex on the response body (response.text) to find all occurrences anywhere in the page,
e.g.
import re

# \b anchors the match at word boundaries, so only the standalone word "cat"
# is counted, not the "cat" prefix of "catalog" or "cattering".
whole_word = re.compile('\\bcat\\b')
r = whole_word.findall("cat catalog cattering")
print(len(r), 'cat(s)')
This gives "1 cat(s)", not "3 cat(s)", because \b matches only at word boundaries.
If you need the word only inside certain tags, first select them with response.css(), response.xpath(), etc.
EDIT:
Here is an example which shows how to use
re.findall(pattern, response.text)
but note that it can also match text inside tags (the markup itself), not only the visible text.
It also shows how to use
response.css('body').re(pattern)
It counts matches of the patterns view, \bviews\b and \d+ views on Stack Overflow and displays the first three matches of each.
You can run it without creating a project.
import scrapy
import re
class MySpider(scrapy.Spider):
    """Count regex matches on the Stack Overflow front page in two ways."""

    name = 'myspider'
    start_urls = ['https://stackoverflow.com/']

    def parse(self, response):
        """Print the match count and first three matches for each pattern."""
        print('url:', response.url)
        # Raw strings keep the regex escapes (\b, \d) intact; '\d' in a normal
        # string literal is an invalid escape sequence and triggers a warning.
        for pattern in ['view', r'\bviews\b', r'\d+ views']:
            print('>>> pattern:', pattern)
            # Plain regex over the whole page text.
            result = re.findall(pattern, response.text)
            print('>>> re:', len(result), result[0:3])
            # Same regex applied through the selector for the <body> element.
            result = response.css('body').re(pattern)
            print('>>> response.re:', len(result), result[0:3])
# --- runs as a plain script, without a Scrapy project ---
# NOTE: the settings below configure no feed export, so as written nothing
# is saved to `output.csv`.
from scrapy.crawler import CrawlerProcess

crawler_settings = {'USER_AGENT': 'Mozilla/5.0'}
crawler_process = CrawlerProcess(crawler_settings)
crawler_process.crawl(MySpider)
crawler_process.start()
I recently discovered Scrapy and I want to write a spider that gets the URLs of PDF files from my database, downloads them, and removes the records. The issue is that my database gets new records irregularly. That's why I want to run my crawler as a task every 6 hours.
Any ideas how I can accomplish that?
Here's some code:
class PDFSpider(scrapy.Spider):
    """Download every URL queued in ``PDFDownloadQueue`` and save it to disk."""

    name = "pdf"

    def __init__(self, *args, **kwargs):
        # Forward spider arguments to the base class so its own
        # initialisation still runs.
        super().__init__(*args, **kwargs)
        self.lastUrl = None

    def start_requests(self):
        """Yield one request per queued record."""
        # get urls from database using django models
        for url in PDFDownloadQueue.objects.all():
            self.lastUrl = url
            # Attach the record to its own request: self.lastUrl only holds
            # the most recently yielded record, which need not be the one
            # whose response is being parsed.
            yield scrapy.Request(url=url.url, callback=self.parse,
                                 meta={'queue_item': url})

    def parse(self, response):
        """Write the response body under ``OUTPUT_PATH`` and report the record."""
        # File name is the last "/"-separated piece of the response URL.
        filename = response.url.split("/")[-1]
        output = os.path.join(OUTPUT_PATH, filename)
        with open(output, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        # Prefer the record carried by this response; fall back to the shared
        # attribute for requests that were created without it.
        print("Parsed {}".format(response.meta.get('queue_item', self.lastUrl)))