Building my first web scraper. I'm simply trying to get a list of names and append them to a csv file. The scraper seems to work but not as intended. Output file only produces one name which is always the last name scraped. Its always a different name when I rerun the scraper. In this case the name written to the csv file was Ola Aina.
#Create the spider class
class premSpider(scrapy.Spider):
name = "premSpider"
def start_requests(self):
# Create a List of Urls with which we wish to scrape
urls = ['https://www.premierleague.com/players']
#Iterate through each url and send it to be parsed
for url in urls:
#yield kind of acts like return
yield scrapy.Request(url = url, callback = self.parse)
def parse(self, response):
#extract links to player pages
plinks = response.xpath('//tr').css('a::attr(href)').extract()
#follow links to specific player pages
for plink in plinks:
yield response.follow(url = plink, callback = self.parse2)
def parse2(self, response):
plinks2 = response.xpath('//a[#href="stats"]').css('a::attr(href)').extract()
for link2 in plinks2:
yield response.follow(url = link2, callback = self.parse3)
def parse3(self, response):
names= response.xpath('//div[#class="name t-colour"]/text()').extract()
filepath = 'playerlinks.csv'
with open(filepath, 'w') as f:
f.writelines([name + '\n' for name in names])
process = CrawlerProcess()
process.crawl(premSpider)
process.start()
You could also use Scrapy's own "FEEDS" export..
add this just below your spider name:
custom_settings = {'FEEDS':{'results1.csv':{'format':'csv'}}}"
And modify parse3 to read as below:
def parse3(self, response):
names=response.xpath('.//div[#class="name t-colour"]/text()').get()
yield {'names':names}
Related
I have written a script in Python using Scrapy. The code runs to fetch all the pages that exist containing the code. It works fine on the first page load when scrapy is started and as per the script logic gets us page no. 2. But after loading page 2 I am unable to get xpath of the new page loaded so I can move ahead this way and get all the web-page numbers.
Sharing the code snippet.
import scrapy
from scrapy import Spider
class PostsSpider(Spider):
name = "posts"
start_urls = [
'https://www.boston.com/category/news/'
]
def parse(self, response):
print("first time")
print(response)
results = response.xpath("//*[contains(#id, 'load-more')]/#data-next-page").extract_first()
print(results)
if results is not None:
for result in results:
page_number = 'page/' + result
new_url = self.start_urls[0] + page_number
print(new_url)
yield scrapy.Request(url=new_url, callback=self.parse)
else:
print("last page")
It is because the page doesn't create new get requests when it loads the next page, it makes an ajax call to an api that returns json.
I made some adjustments to your code so it should work properly now. I am assuming that there is something other than the next page number you are trying to extract from each page, so I wrapped the html string into a scrapy.Slector class so you can use Xpath and such on it. This script will crawl alot of pages really fast, so you might want to adjust your settings to slow it down too.
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
class PostsSpider(Spider):
name = "posts"
ajaxurl = "https://www.boston.com/wp-json/boston/v1/load-more?taxonomy=category&term_id=779&search_query=&author=&orderby=&page=%s&_wpnonce=f43ab1aae4&ad_count=4&redundant_ids=25129871,25130264,25129873,25129799,25128140,25126233,25122755,25121853,25124456,25129584,25128656,25123311,25128423,25128100,25127934,25127250,25126228,25126222"
start_urls = [
'https://www.boston.com/category/news/'
]
def parse(self, response):
new_url = None
try:
json_result = response.json()
html = json_result['data']['html']
selector = Selector(text=html, type="html")
# ... do some xpath stuff with selector.xpath.....
new_url = self.ajaxurl % json_result["data"]["nextPage"]
except:
results = response.xpath("//*[contains(#id, 'load-more')]/#data-next-page").extract_first()
if results is not None:
for result in results:
new_url = self.ajaxurl % result
if new_url:
print(new_url)
yield scrapy.Request(url=new_url, callback=self.parse)
Saving data of one page and not going to second page and not showing any errors.
import scrapy
from ..items import QoutetutorialItem
class QouteSpider(scrapy.Spider):
name = 'qoute'
page_num =2;
allowed_domains = ['http://quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/page/1/']
def parse(self, response):
all_div_quote = response.css("div.quote")
items = QoutetutorialItem()
for x in all_div_quote:
title = x.css("span.text::text").extract();
author = x.css(".author::text").extract();
tag = x.css(".tag::text").extract();
items['title'] = title
items['author'] = author
items['tag'] = tag
yield items
next_page = 'http://quotes.toscrape.com/page/'+str(QouteSpider.page_num)+'/'
# if next_page is not None:
if QouteSpider.page_num <11:
QouteSpider.page_num+=1
yield response.follow(next_page , callback= self.parse)
Simply do that. Firstly, fetch next page URL from page source as it is present there and then make the request to that. This is how it will looks like.
next_page = response.css('.next ::attr(href)')
if next_page:
yield response.follow(next_page, callback=self.parse)
This will solve your issue and now you also don't need calculation for next page URL also.
How to scrap any site and search for the given word and displays how many times it occurred
class LinkedinScraper(scrapy.Spider):
name = "linked"
def start_requests(self):
urls = ['https://www.linkedin.com/']
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'linkedin.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
You can use regex with response.body to find all occurrances in any places
ie.
import re
r = re.findall('\\bcat\\b', "cat catalog cattering")
print(len(r), 'cat(s)')
Gives "1 cat(s)", not "3 cat(s)"
If you need word only in some tags then you use first response.css(), response.xpath(), etc.
EDIT:
Example which shows how to use
re.findall(pattern, response.text)
but it can find text inside tag too.
It also shows how to use
response.css('body').re(pattern)
It counts 'view', '\\bviews\\b' and '\d+ views' on Stackoverflow and display first three elements
You can run it without creating project.
import scrapy
import re
class MySpider(scrapy.Spider):
name = 'myspider'
start_urls = ['https://stackoverflow.com/']
def parse(self, response):
print('url:', response.url)
for pattern in ['view', '\\bviews\\b', '\d+ views']:
print('>>> pattern:', pattern)
result = re.findall(pattern, response.text)
print('>>> re:', len(result), result[0:3])
result = response.css('body').re(pattern)
print('>>> response.re:', len(result), result[0:3])
# --- it runs without project and saves in `output.csv` ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
c.crawl(MySpider)
c.start()
I want to use scrapy spider to get data (question title + content & answer) from all posts of the following website:
https://forums.att.com/t5/custom/page/page-id/latest-activity/category-id/Customer_Care/page/1?page-type=latest-solutions-topics
The problem is I just dont know how to make it first to follow the link of the post and then to crawl the data of all 15 posts/site.
{import scrapy
class ArticleSpider(scrapy.Spider):
name = "post"
start_urls = ['https://forums.att.com/t5/Data-Messaging-Features-Internet/Throttling-for-unlimited-data/m-p/4805201#M73235']
def parse(self, response):
SET_SELECTOR = 'body'
for post in response.css(SET_SELECTOR):
# Selector for title, content and answer
TITLE_SELECTOR = '.lia-message-subject h5 ::text'
CONTENT_SELECTOR = '.lia-message-body-content'
ANSWER_SELECTOR = '.lia-message-body-content'
yield {
# [0].extract() = extract_first()
'Qtitle': post.css(TITLE_SELECTOR)[0].extract(),
'Qcontent': post.css(CONTENT_SELECTOR)[0].extract(),
'Answer': post.css(ANSWER_SELECTOR)[1].extract(),
}
# Running through all 173 pages
NEXT_PAGE_SELECTOR = '.lia-paging-page-next a ::attr(href)'
next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
if next_page:
yield scrapy.Request(
response.urljoin(next_page),
callback=self.parse
)}
I hope u can help me out. Thanks in advance!
You need to add a method for scraping post content. You can rewrite your spider code like this (I use xpath selector):
# -*- coding: utf-8 -*-
import scrapy
class ArticleSpider(scrapy.Spider):
name = "post"
start_urls = ['https://forums.att.com/t5/custom/page/page-id/latest-activity/category-id/Customer_Care/page/1?page-type=latest-solutions-topics']
def parse(self, response):
for post_link in response.xpath('//h2//a/#href').extract():
link = response.urljoin(post_link)
yield scrapy.Request(link, callback=self.parse_post)
# Checks if the main page has a link to next page if True keep parsing.
next_page = response.xpath('(//a[#rel="next"])[1]/#href').extract_first()
if next_page:
yield scrapy.Request(next_page, callback=self.parse)
def parse_post(self, response):
# Scrape title, content from post.
for post in response.xpath('//div[contains(#class, "lia-quilt-forum-message")]'):
item = dict()
item['title'] = post.xpath('.//h5/text()').extract_first()
item['content'] = post.xpath('.//div[#class="lia-message-body-content"]//text()').extract()
yield item
# If the post page has a link to next page keep parsing.
next_page = response.xpath('(//a[#rel="next"])[1]/#href').extract_first()
if next_page:
yield scrapy.Request(next_page, callback=self.parse_post)
This code parses all links from the main page and calls parse _post methods for scraping each post content. Both parse and parse_post methods check if there is next link and if True proceed scraping.
I recently discovered scrapy and I want to wrote a spider to get urls from my database containing pdf files and download them and remove the record. The issue is that my database will get new records irregularly. That's why I want to run my Crawler as as task every 6 hours.
Any ideas how I can accomplish that?
Here's some code
class PDFSpider(scrapy.Spider):
name = "pdf"
def __init__(self):
self.lastUrl = None
def start_requests(self):
# get urls from database using django models
for url in PDFDownloadQueue.objects.all():
self.lastUrl = url
yield scrapy.Request(url=url.url, callback=self.parse)
def parse(self, response):
# write httpresponse as a html file
filename = response.url.split("/")[-1]
output = os.path.join(OUTPUT_PATH,filename)
with open(output, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
print("Parsed {}".format(self.lastUrl))