Scrapy Crawler Python3

I'm working on a crawler and I have to save the output to a CSV file.
Here is my code:
import scrapy


class ArticleSpider(scrapy.Spider):
    name = "article"

    def start_requests(self):
        urls = [
            'https://www.topart-online.com/de/Ahorn-japan.%2C-70cm%2C--36-Blaetter----Herbst/c-KAT282/a-150001HE'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-1]
        filename = 'article-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

    # note: this second parse definition overrides the one above
    def parse(self, response):
        yield {
            'title': response.xpath('//h1[@class="text-center text-md-left mt-0"]/text()').get(),
            'quantity': response.xpath('//div[@class="col-6"]/text()')[0].get().strip(),
            'delivery_status': response.xpath('//div[@class="availabilitydeliverytime"]/text()').get().replace('/', '').strip(),
            'itemattr': response.xpath('//div[@class="productcustomattrdesc word-break col-6"]/text()').getall(),
            'itemvalues': response.xpath('//div[@class="col-6"]/text()').getall()
        }
My question is: how can I output itemattr and itemvalues in the correct order, so that I see, for example, Umkarton (itemattr) next to 20/20/20 (the dimensions of that Umkarton)?
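One possible way to keep each attribute next to its value (a minimal sketch, assuming each productcustomattrdesc div is immediately followed by the div holding its value; adjust the selectors if the markup differs) is to iterate over the attribute divs and read each value from the adjacent sibling instead of collecting two separate flat lists:

    def parse(self, response):
        item = {
            'title': response.xpath('//h1[@class="text-center text-md-left mt-0"]/text()').get(),
        }
        # walk the attribute-name divs and pull each value from the div that directly follows it
        for attr_div in response.xpath('//div[@class="productcustomattrdesc word-break col-6"]'):
            name = attr_div.xpath('normalize-space(text())').get()
            value = attr_div.xpath('normalize-space(following-sibling::div[1]/text())').get()
            if name:
                item[name] = value
        yield item

That way Umkarton and its 20/20/20 value come out of the same loop iteration, so the pairing cannot drift even if the page adds or removes rows.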

Related

How to handle DNSLookupError in Scrapy?

I am checking a bunch of website response statuses and exporting them to a CSV file. A couple of the websites raise DNSLookupError or come back as "NO WEBSITE FOUND", and nothing gets stored in the CSV file for them. How can I also store the DNSLookupError message in the CSV along with the URL?
def parse(self, response):
    yield {
        'URL': response.url,
        'Status': response.status
    }
You can use an errback to catch DNS errors or any other type of error. See the sample usage below.
import scrapy
from twisted.internet.error import DNSLookupError


class TestSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['example.com']

    def start_requests(self):
        yield scrapy.Request(url="http://example.com/error", errback=self.parse_error)

    def parse_error(self, failure):
        if failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            yield {
                'URL': request.url,
                'Status': failure.value
            }

    def parse(self, response):
        yield {
            'URL': response.url,
            'Status': response.status
        }
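When the spider is run with a feed export, e.g. scrapy crawl test -o results.csv, the items yielded from both parse and parse_error land in the same CSV, so failed lookups show up next to the successful responses. Note that failure.value is the exception object itself; wrapping it as str(failure.value) usually gives a cleaner message in the Status column.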

Is it correct in Scrapy to have multiple parse methods in one spider?

Is it correct in Scrapy to have multiple parse methods in one spider?
Something looking like:
import scrapy


class FooSpider(scrapy.Spider):
    name = 'foo'
    start_urls = ['https://example.com']

    def parse(self, response):
        ...
        yield {'foo': foo}

    def parse(self, response):
        ...
        yield {'bar': bar}
No. Defining parse twice just makes the second definition override the first one. You can, however, create differently named methods and point at them from start_requests, for example:
import scrapy


class FooSpider(scrapy.Spider):
    name = 'POC'
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.get_title, dont_filter=True)
            yield scrapy.Request(url=url, callback=self.get_price, dont_filter=True)

    def get_title(self, response):
        yield {'title': response.xpath('//h3/text()').get()}

    def get_price(self, response):
        yield {'price': response.xpath('//div[@class="card-body"]/h4/text()').get()}

AttributeError: Response content isn't text. What is the problem?

I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, in a python3 venv. I am getting "AttributeError: Response content isn't text". Could you help me explain and solve the problem?
Here is my code:
import scrapy
import json


class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing informations....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'

SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'

AUTOTHROTTLE_ENABLED = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

PROXY_POOL_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
And here are the error messages from the console output:
Thank you...
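For what it's worth, the message "Response content isn't text" is raised by Scrapy's base Response class whenever .text (or anything that needs decoded text) is accessed on a response that is not a TextResponse; with a rotating proxy pool this can happen when a proxy hands back an empty or binary body. A minimal defensive sketch (an assumption on my part; it only guards your own callback and will not help if the error is raised inside one of the middlewares) would be to skip non-text responses:

    # requires: from scrapy.http import TextResponse
    def get_membership_no(self, response):
        # proxies sometimes return responses with no decodable text body; skip them
        if not isinstance(response, TextResponse):
            self.logger.warning("Non-text response from %s, skipping", response.url)
            return
        payload = json.loads(response.text)
        data_array = payload['data']
        next_page = payload['links']['next']
        ...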

How to associate allowed_domains with start_urls in scrapy

I have a broad Scrapy crawler which takes a CSV file of about 20,000 rows. The file has name, start_url and allowed_domain columns, see below:
Name     start_url               allowed_domain
place1   https://place1.co.uk    place1.co.uk
place2   https://place2.co.uk    place2.co.uk
place3   https://place3.co.uk    place3.co.uk
A sample of my crawler code is below:
import pandas as pd
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"
    df = pd.read_csv("places.csv")
    start_urls = df["start_url"].values.tolist()
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True
    }
    rules = (
        Rule(LinkExtractor(allow_domains=df["allowed_domain"].values.tolist()), callback='parse_item', follow=True),
    )

    def __init__(self):
        pass

    def parse_item(self, response):
        # do stuff
        pass
The problem is that my crawler is allowed to follow links that are in any of the allowed domains, not just the one associated with the start_url.
You cannot assign a separate allowed_domains value to each URL in start_urls. You will have to filter the URLs inside the process_request method of a downloader middleware.
Here is your spider code:
import pandas as pd
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True
    }
    rules = (Rule(LinkExtractor(), follow=True),)

    def start_requests(self):
        df = pd.read_csv("places.csv")
        for key, row in df.iterrows():
            yield Request(url=row['start_url'],
                          callback=self.parse_item,
                          meta={'allowed_domain': row['allowed_domain']})
Here is the middleware code:
import tldextract
from scrapy.exceptions import IgnoreRequest


class MySpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # reduce the request URL to its registered domain, e.g. "place1.co.uk"
        site = tldextract.extract(request.url)
        site = "{}.{}".format(site.domain, site.suffix)
        if request.meta['allowed_domain'] not in site:
            raise IgnoreRequest("Filtered offsite request")
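One caveat to watch for (an assumption, not something tested against this spider): the requests extracted by the Rule do not automatically inherit the allowed_domain meta key from the response they were found on, so the middleware's request.meta['allowed_domain'] lookup can raise a KeyError for them. A sketch of one way to carry the value forward uses the Rule's process_request hook (in recent Scrapy versions it receives both the request and the response; copy_meta is just an illustrative name):

    rules = (
        Rule(LinkExtractor(), follow=True, process_request='copy_meta'),
    )

    def copy_meta(self, request, response):
        # propagate the allowed_domain of the originating start_url to every extracted request
        request.meta['allowed_domain'] = response.meta.get('allowed_domain')
        return request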

Navigate to next page with response.css

I have a function for extracting articles from one page, but I am not able to navigate to the next page to scrape all the pages. Below is what I am trying:
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    # allowed_domains = ['https://blogs.webmd.com/diabetes/default.htm']
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }
        # url_apnd = "https://blogs.webmd.com/diabetes"
        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Please help me get the navigation to the next page right.
You need to move the next-page logic into your parse method, since the next-page button is on the listing page defined in your start_urls, not on the individual article pages.
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

        # the pagination link lives on the listing page, so follow it from parse
        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }
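Since response.follow resolves relative URLs against the current page, the scrapy.Request(response.urljoin(next_page), callback=self.parse) call in parse can equally be written as yield response.follow(next_page, callback=self.parse), which saves the explicit urljoin.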
