I have a function for extracting articles from one page, but I am not able to navigate to the next page to scrape all the pages.
Below is what I am trying:
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

class MedicalSpider(scrapy.Spider):
    name = 'medical'
    # allowed_domains = ['https://blogs.webmd.com/diabetes/default.htm']
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for article_sections in article_sections:
            body += article_sections.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }

        # url_apnd = "https://blogs.webmd.com/diabetes"
        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Please help me get the navigation to the next page right.
You need to move the next-page logic into your parse function, since the next-page button is on the listing page defined in your start_urls, not on the individual article pages.
import scrapy

class MedicalSpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

        # The pagination links live on the listing page, so handle them here.
        next_page = response.css('.next a ::attr(href)').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }
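As a side note, response.follow resolves relative URLs for you, so the pagination request at the end of parse can also be written without urljoin. A small equivalent sketch of that last block:

    # Equivalent pagination using response.follow, which joins relative URLs itself.
    next_page = response.css('.next a ::attr(href)').get()
    if next_page:
        yield response.follow(next_page, callback=self.parse)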
Related
I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, and I am getting "AttributeError: Response content isn't text". I am running this code in a Python 3 venv. Could you help me understand and solve the problem?
Here is my code:
import scrapy
import json

class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing informations....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'
SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'
AUTOTHROTTLE_ENABLED = True
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
PROXY_POOL_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
Here are the error messages from the console output:
Thank you...
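For context on the error quoted above: Scrapy's base Response raises exactly this AttributeError when its .text attribute is read but the body is not text, which can happen, for example, when a request comes back through a misbehaving proxy as an empty or binary response. The following is a hedged defensive sketch, not a tested fix for this setup: a variant of get_membership_no for the spider above that skips such responses before decoding JSON.

    import json
    from scrapy.http import TextResponse

    def get_membership_no(self, response):
        # Skip responses whose body is not text (e.g. returned by a failing proxy),
        # so json.loads only runs on real JSON payloads.
        if not isinstance(response, TextResponse):
            self.logger.warning("Non-text response from %s, skipping", response.url)
            return
        payload = json.loads(response.text)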
I'm working on a crawler and I have to save the output to a CSV file.
Here is my code:
import scrapy

class ArticleSpider(scrapy.Spider):
    name = "article"

    def start_requests(self):
        urls = [
            'https://www.topart-online.com/de/Ahorn-japan.%2C-70cm%2C--36-Blaetter----Herbst/c-KAT282/a-150001HE'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-1]
        filename = 'article-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

    def parse(self, response):
        yield {
            'title': response.xpath('//h1[@class="text-center text-md-left mt-0"]/text()').get(),
            'quantity': response.xpath('//div[@class="col-6"]/text()')[0].get().strip(),
            'delivery_status': response.xpath('//div[@class="availabilitydeliverytime"]/text()').get().replace('/', '').strip(),
            'itemattr': response.xpath('//div[@class="productcustomattrdesc word-break col-6"]/text()').getall(),
            'itemvalues': response.xpath('//div[@class="col-6"]/text()').getall()
        }
My question is:
How can I output itemattr and itemvalues in the correct order, so that I can see, for example, Umkarton (itemattr) paired with 20/20/20 (the dimensions of that Umkarton)?
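One common way to keep attribute names and values paired is to zip the two lists, shown here as a hedged sketch rather than a tested answer; the selectors are taken from the post above and may need adjusting if the value column matches extra div.col-6 elements.

    def parse(self, response):
        attrs = response.xpath('//div[@class="productcustomattrdesc word-break col-6"]/text()').getall()
        values = response.xpath('//div[@class="col-6"]/text()').getall()
        # zip pairs each attribute name with the value at the same index,
        # preserving document order; strip() removes surrounding whitespace.
        yield {attr.strip(): value.strip() for attr, value in zip(attrs, values)}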
I have a broad Scrapy crawler which takes a CSV file of about 20,000 rows. The file has name, start_url and allowed_domain columns. See below:
Name     start_url               allowed_domain
place1   https://place1.co.uk    place1.co.uk
place2   https://place2.co.uk    place2.co.uk
place3   https://place3.co.uk    place3.co.uk
A sample of my crawler code is below:
import pandas as pd
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class FinalSpider(CrawlSpider):
    name = "final"
    df = pd.read_csv("places.csv")
    start_urls = df["start_url"].values.tolist()
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True
    }
    rules = (Rule(LinkExtractor(allow_domains=(df["allowed_domain"].values.tolist())), callback='parse_item', follow=True),)

    def __init__(self):
        pass

    def parse_item(self, response):
        # do stuff
        pass
The problem is that my crawler is allowed to follow links that are in any of the allowed domains, not just the one associated with the start_url.
You cannot assign a separate allowed_domain to each URL in start_urls.
You will have to filter the URLs inside the process_request method of a downloader middleware.
Here is your spider code:
import pandas as pd
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class FinalSpider(CrawlSpider):
    name = "final"
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True
    }
    rules = (Rule(LinkExtractor(), follow=True),)

    def start_requests(self):
        df = pd.read_csv("places.csv")
        for key, row in df.iterrows():
            yield Request(url=row['start_url'],
                          callback=self.parse_item,
                          meta={'allowed_domain': row['allowed_domain']})
Here is the middleware code:
import tldextract
from scrapy.exceptions import IgnoreRequest

class MySpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        site = tldextract.extract(request.url)
        site = "{}.{}".format(site.domain, site.suffix)
        if request.meta['allowed_domain'] not in site:
            raise IgnoreRequest("Filtered offsite request")
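Note that the middleware relies on the third-party tldextract package (pip install tldextract). One caveat, offered as an assumption to verify rather than a tested fix: requests that the CrawlSpider rules extract do not inherit the allowed_domain meta key set in start_requests, so the lookup above can raise a KeyError for them. A hedged tweak to the check:

    # Use .get() so rule-extracted requests without the meta key do not raise
    # a KeyError; whether to drop or pass such requests through is a design choice.
    allowed = request.meta.get('allowed_domain')
    if allowed and allowed not in site:
        raise IgnoreRequest("Filtered offsite request")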
I'm trying to scrape the headings on the about page, but I have tried a lot and failed because I don't yet properly understand what to do. I'm a beginner, so I need help.
import scrapy
from ..items import DmoztutorialItem

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = [
        'http://dmoz-odp.org/',
    ]

    def parse(self, response):
        items = DmoztutorialItem()
        Navbar = response.css('#main-nav a::text').extract()
        Category_names = response.css('.top-cat a::text').extract()
        Subcategories = response.css('.sub-cat a::text').extract()
        items['Navbar'] = Navbar
        items['Category_names'] = Category_names
        items['Subcategories'] = Subcategories
        yield items

        # Nav_page = response.css('#main-nav a::attr(href)').extract()
        Nav_page = 'http://dmoz-odp.org/docs/en/about.html'.extract()
        # About_heading = response.css('h1+ p , #mainContent
        # h1::text').extract()
        items['Nav_page'] = Nav_page
        # items['About_heading'] = About_heading
        yield response.follow(Nav_page)
Can you tell what kind of output you need? It is very unclear from your post.
Check this example, where you can:
Get some data;
Call a request to another page with the data saved;
Yield the final data.
Hope it will help you.
import scrapy

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = ['http://dmoz-odp.org/']
    nav_page = 'http://dmoz-odp.org/docs/en/about.html'

    def parse(self, response):
        # collect data on the first page
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
            'Nav_page': self.nav_page,
        }
        # save it and request the second page
        yield response.follow(self.nav_page, self.parse_nav, meta={'items': items})

    def parse_nav(self, response):
        # do your stuff on the second page
        items = response.meta['items']
        items['something'] = 'something'  # add your logic
        yield items
Or use separate logic for separate pages:
import scrapy

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'

    def start_requests(self):
        reqs = (
            ('http://dmoz-odp.org/', self.parse_main),
            ('http://dmoz-odp.org/docs/en/about.html', self.parse_nav),
        )
        for link, callback in reqs:
            yield scrapy.Request(link, callback)

    def parse_main(self, response):
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
        }
        yield items

    def parse_nav(self, response):
        items = {
            'something': 'something',  # add your logic
        }
        yield items
To parse a different HTML page, you need to yield a Request object with the target URL as the first argument for its constructor, and do the parsing in the method of your spider that you pass to the constructor of that Request object as the callback parameter.
I strongly encourage you to complete the Scrapy tutorial. What you are trying to achieve is covered in the Following links section.
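A minimal sketch of that pattern, with an illustrative spider and callback name (parse_about) rather than anything from the original post:

    import scrapy

    class AboutSpider(scrapy.Spider):
        name = 'about'
        start_urls = ['http://dmoz-odp.org/']

        def parse(self, response):
            # yield a Request for the second page and parse it in a separate callback
            yield scrapy.Request('http://dmoz-odp.org/docs/en/about.html',
                                 callback=self.parse_about)

        def parse_about(self, response):
            yield {'heading': response.css('h1::text').get()}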
I'm new to Scrapy and Python, and I would like to know the best way to group prices and sellers.
My code:
import scrapy
import re

class ProductSpider(scrapy.Spider):
    name = 'product'
    start_urls = ['https://www.google.nl/shopping/product/13481271230046931540?client=opera&biw=1880&bih=1008&output=search&q=738678181690&oq=738678181690&prds=hsec:online,paur:ClkAsKraX-dNyyNw1dQNKCoMJnN5PTCcIkKQRK_FTu38WNkh-ASATIAsHY2pgV1a1wnYh_rnqfve8FuuCsHc8boLnMjv9EO2Q4wJS_AvrOL1pcn-GYMYHecz7BIZAFPVH73OGQwGCep7aILTJXavWpXt0Ij80g&sa=X&ved=0ahUKEwjmtKKLp5HaAhWHchQKHbO5Dg8Q2SsIFQ']
    all_seller = []

    def parse(self, response):
        self.log('Bla Bla Bla:' + response.url)
        for product in response.css(".os-main-table"):
            item = {
                "all_sellers": product.css(".os-seller-name-primary > a::text").extract(),
                "all_prices": product.css("td.os-total-col::text").re("\d+\,\d{1,2}"),
            }

        for item in zip(all_prices, all_sellers):
            scrapped_info = {
                'all_sellers': item[0],
                'all_prices': item[1],
            }
            yield scrapped_info

        next_page_url = response.css('.pag-prev-next-links > a:last-child::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Solution:
import scrapy

class ProductSpider(scrapy.Spider):
    name = 'product'
    start_urls = ['https://www.google.nl/shopping/product/13481271230046931540?client=opera&biw=1880&bih=1008&output=search&q=738678181690&oq=738678181690&prds=hsec:online,paur:ClkAsKraX-dNyyNw1dQNKCoMJnN5PTCcIkKQRK_FTu38WNkh-ASATIAsHY2pgV1a1wnYh_rnqfve8FuuCsHc8boLnMjv9EO2Q4wJS_AvrOL1pcn-GYMYHecz7BIZAFPVH73OGQwGCep7aILTJXavWpXt0Ij80g&sa=X&ved=0ahUKEwjmtKKLp5HaAhWHchQKHbO5Dg8Q2SsIFQ']

    def parse(self, response):
        all_sellers = response.css(".os-seller-name-primary > a::text").extract()
        all_prices = response.css("td.os-total-col::text").re(r"\d+,\d{1,2}")
        # zip pairs the price and the seller found at the same index
        for price, seller in zip(all_prices, all_sellers):
            scrapped_info = {
                'all_sellers': seller,
                'all_prices': price,
            }
            yield scrapped_info

        next_page_url = response.css('.pag-prev-next-links > a:last-child::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
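One caveat on the pairing step: zip stops at the shorter of the two lists, so if the seller and price selectors return different numbers of matches, the extra entries are silently dropped. A hedged sketch that keeps such mismatches visible instead (zip_longest is from the standard library):

    from itertools import zip_longest

    # Pair prices and sellers, keeping unmatched entries as None instead of
    # silently dropping them.
    for price, seller in zip_longest(all_prices, all_sellers):
        yield {'all_sellers': seller, 'all_prices': price}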