AttributeError: Response content isn't text. What is the problem? - python-3.x

I am using python3.8.5 and scrapy2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, and I am getting "AttributeError: Response content isn't text". I am running this code in a python3 venv. Could you help me explain and solve the problem?
Here is my code:
import scrapy
import json

class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing informations....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'
SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'
AUTOTHROTTLE_ENABLED = True
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
PROXY_POOL_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
And here are the error messages from the console output:
Thank you...
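Since the console output is not reproduced above, it is hard to say exactly where the error is raised, but Scrapy raises "AttributeError: Response content isn't text" whenever response.text is accessed on a plain (non-text) Response, which can happen in your own callback or inside a middleware when a proxy returns something that is not HTML/JSON. Below is a minimal defensive sketch of the spider, assuming you simply want to skip such responses instead of crashing; it is not a confirmed fix for your setup.

import json
import scrapy
from scrapy.http import TextResponse

class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        yield scrapy.Request(
            url='https://basis.org.bd/get-member-list?page=1&team=',
            callback=self.get_membership_no,
        )

    def get_membership_no(self, response):
        # Raw Response objects (e.g. returned via a misbehaving proxy) have no .text
        # and cannot be JSON-decoded, so skip them instead of crashing.
        if not isinstance(response, TextResponse):
            self.logger.warning("Non-text response from %s, skipping", response.url)
            return
        payload = json.loads(response.text)
        for data in payload['data']:
            yield scrapy.Request(
                url='https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no']),
                callback=self.parse,
            )
        next_page = payload['links']['next']
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        self.logger.info("Got company profile page: %s", response.url)

If the error actually comes from one of the third-party middlewares rather than from the spider, the full traceback in the console output would show which one, so it is worth including it.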

Related

Scrapy Crawler Python3

I'm working on a crawler and I have to save the output in a csv file.
Here is my code:
import scrapy

class ArticleSpider(scrapy.Spider):
    name = "article"

    def start_requests(self):
        urls = [
            'https://www.topart-online.com/de/Ahorn-japan.%2C-70cm%2C--36-Blaetter----Herbst/c-KAT282/a-150001HE'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-1]
        filename = 'article-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

    def parse(self, response):
        yield {
            'title': response.xpath('//h1[@class="text-center text-md-left mt-0"]/text()').get(),
            'quantity': response.xpath('//div[@class="col-6"]/text()')[0].get().strip(),
            'delivery_status': response.xpath('//div[@class="availabilitydeliverytime"]/text()').get().replace('/', '').strip(),
            'itemattr': response.xpath('//div[@class="productcustomattrdesc word-break col-6"]/text()').getall(),
            'itemvalues': response.xpath('//div[@class="col-6"]/text()').getall()
        }
My question is:
How can I output itemattr and itemvalues in the correct order, so that I can see, for example: Umkarton (itemattr) 20/20/20 (the dimensions of an Umkarton)?
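Since the col-6 selector also matches cells that are not attribute values (the quantity, for instance), zipping the two lists directly can misalign them. One possible approach, sketched here without access to the live page (the following-sibling relationship between the attribute cell and its value cell is an assumption about the layout), is to read each attribute's neighbouring value cell:

import scrapy

class ArticleAttributesSpider(scrapy.Spider):
    # Hypothetical spider name; you can fold this into your existing ArticleSpider.
    name = "article_attributes"
    start_urls = [
        'https://www.topart-online.com/de/Ahorn-japan.%2C-70cm%2C--36-Blaetter----Herbst/c-KAT282/a-150001HE'
    ]

    def parse(self, response):
        item = {
            'title': response.xpath('//h1[@class="text-center text-md-left mt-0"]/text()').get(),
        }
        # Walk each attribute-name cell and read the value cell next to it, so each
        # name stays paired with its own value instead of relying on list order.
        for attr in response.xpath('//div[@class="productcustomattrdesc word-break col-6"]'):
            name = attr.xpath('normalize-space(./text())').get()
            value = attr.xpath('normalize-space(./following-sibling::div[@class="col-6"][1]/text())').get()
            if name:
                item[name] = value
        yield item

The spider can then be run with scrapy crawl article_attributes -o articles.csv to write the yielded items to the CSV file mentioned in the question.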

How to associate allowed_domains with start_urls in scrapy

I have a broad scrapy crawler which takes a csv file of about 20,000 rows. The file has name, start_url and allowed_domain columns. See below:
Name start_url allowed_domain
place1 https://place1.co.uk place1.co.uk
place2 https://place2.co.uk place2.co.uk
place3 https://place3.co.uk place3.co.uk
A sample of my crawler code is below:
class FinalSpider(CrawlSpider):
    name = "final"
    df = pd.read_csv("places.csv")
    start_urls = df["start_url"].values.tolist()
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True
    }
    rules = (Rule(LinkExtractor(allow_domains=(df["allowed_domain"].values.tolist())), callback='parse_item', follow=True),)

    def __init__(self):
        pass

    def parse_item(self, response):
        # do stuff
The problem is that my crawler is allowed to follow links that are in any of the allowed domains, not just the one associated with the start_url.
You cannot assign an allowed_domain for each link in start_urls.
You will have to filter URLs inside the process_request method of a DownloaderMiddleware.
Here is your spider code:
class FinalSpider(CrawlSpider):
    name = "final"
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True
    }
    rules = (Rule(LinkExtractor(), follow=True),)

    def start_requests(self):
        df = pd.read_csv("places.csv")
        for key, row in df.iterrows():
            yield Request(url=row['start_url'],
                          callback=self.parse_item,
                          meta={'allowed_domain': row['allowed_domain']})
Here is the middleware code:
import tldextract
from scrapy.exceptions import IgnoreRequest

class MySpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        site = tldextract.extract(request.url)
        site = "{}.{}".format(site.domain, site.suffix)
        if request.meta['allowed_domain'] not in site:
            raise IgnoreRequest("Filtered offsite request")
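For clarity, tldextract splits a URL into subdomain, registered domain and public suffix, so the check above compares each request against the registered domain carried in the request meta. A quick illustration (the URL below is just a placeholder, not taken from the real CSV):

import tldextract

# e.g. a link discovered while crawling place1's site
ext = tldextract.extract("https://shop.place1.co.uk/products?page=2")
print(ext.subdomain, ext.domain, ext.suffix)    # shop place1 co.uk
registered = "{}.{}".format(ext.domain, ext.suffix)
print("place1.co.uk" in registered)             # True  -> request is allowed
print("place2.co.uk" in registered)             # False -> IgnoreRequest is raised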

Navigate to next page with response.css

I have a function for extracting articles from one page, but I am not able to navigate to the next page to scrape all the pages.
Below is what I am trying:
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

class MedicalSpider(scrapy.Spider):
    name = 'medical'
    # allowed_domains = ['https://blogs.webmd.com/diabetes/default.htm']
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for article_sections in article_sections:
            body += article_sections.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }
        # url_apnd = "https://blogs.webmd.com/diabetes"
        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Please help me get the navigation to the next page right.
You need to move the next-page logic into your parse function, since the next page button is on the page defined in your start_urls.
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

class MedicalSpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)
        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for article_sections in article_sections:
            body += article_sections.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }

How to scrape headings in about page?

I'm trying to scrape the headings on the about page, but I have tried a lot and failed because I don't properly understand what to do. I'm a beginner, so I need help.
import scrapy
from ..items import DmoztutorialItem

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = [
        'http://dmoz-odp.org/',
    ]

    def parse(self, response):
        items = DmoztutorialItem()
        Navbar = response.css('#main-nav a::text').extract()
        Category_names = response.css('.top-cat a::text').extract()
        Subcategories = response.css('.sub-cat a::text').extract()
        items['Navbar'] = Navbar
        items['Category_names'] = Category_names
        items['Subcategories'] = Subcategories
        yield items
        # Nav_page = response.css('#main-nav a::attr(href)').extract()
        Nav_page = 'http://dmoz-odp.org/docs/en/about.html'.extract()
        # About_heading = response.css('h1+ p , #mainContent
        # h1::text').extract()
        items['Nav_page'] = Nav_page
        # items['About_heading'] = About_heading
        yield response.follow(Nav_page)
Can you tell us what kind of output you need? It is very unclear from your post.
Check this example, where you can:
Get some data;
Call a request to another page with the data saved;
Yield the final data.
Hope it will help you.
import scrapy

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = ['http://dmoz-odp.org/']
    nav_page = 'http://dmoz-odp.org/docs/en/about.html'

    def parse(self, response):
        # collect data on the first page
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
            'Nav_page': self.nav_page,
        }
        # save it and call a request to another page
        yield response.follow(self.nav_page, self.parse_nav, meta={'items': items})

    def parse_nav(self, response):
        # do your stuff on the second page
        items = response.meta['items']
        items['something'] = 'something'  # add your logic
        yield items
Or use separate logic for separate pages:
import scrapy

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'

    def start_requests(self):
        reqs = (
            ('http://dmoz-odp.org/', self.parse_main),
            ('http://dmoz-odp.org/docs/en/about.html', self.parse_nav),
        )
        for link, callback in reqs:
            yield scrapy.Request(link, callback)

    def parse_main(self, response):
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
        }
        yield items

    def parse_nav(self, response):
        items = {
            'something': 'something',  # add your logic
        }
        yield items
To parse a different HTML page, you need to yield a Request object with the target URL as the first argument for its constructor, and do the parsing in the method of your spider that you pass to the constructor of that Request object as the callback parameter.
I strongly encourage you to complete the Scrapy tutorial. What you are trying to achieve is covered in the Following links section.
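As a minimal sketch of that pattern (the about-page selectors here are placeholders, since the exact headings wanted are not specified in the question):

import scrapy

class AboutHeadingsSpider(scrapy.Spider):
    name = 'about_headings'
    start_urls = ['http://dmoz-odp.org/']

    def parse(self, response):
        # Yield a Request for the about page and parse it in a dedicated callback.
        yield response.follow('docs/en/about.html', callback=self.parse_about)

    def parse_about(self, response):
        # Placeholder selectors: adjust to the headings you actually need.
        yield {'headings': response.css('h1::text, h2::text').getall()}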

How to ignore robots.txt errors to show in logs?

I am working on a crawler and want to crawl politely by obeying robots.txt. As it is a broad crawl, the log file becomes large and hard to process, and most of the log entries are there because robots.txt was not found on most of the sites.
So my question is: is there a way I can ignore robots.txt-related errors and not log them, since I don't need to know whether the file was found or not?
I already have an errback handler to handle failed requests for my crawler, but it doesn't apply to robots.txt, as that request is made by the Scrapy middleware.
Below is my code:
Spider:
from urllib.parse import urlparse

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError

class MySpider(scrapy.Spider):
    name = 'mobile'

    def start_requests(self):
        urls = [
            'https://site1.com',
            'http://site2.com'
        ]
        for url in urls:
            safe_no = 'test'
            yield scrapy.Request(url=url, callback=self.parse,
                                 errback=self.handle_error, meta={'safe_no': safe_no})

    def parse(self, response):
        safe_no = response.meta['safe_no']
        html_doc = response.body
        text_data, contacts, keep_no = self.get_contact(html_doc, response.url)
        # print(contacts, keep_no)
        link_found = False
        data = []
        parsed_uri = urlparse(response.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        ### Parse data and get contact....
        if contacts:
            yield {
                'safe_no': safe_no,
                'url': response.url,
                'contacts': contacts,
                # 'text_data': text_data
            }

    def handle_error(self, failure):
        if failure.check(HttpError):
            # these exceptions come from the HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError : "%s"', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError : "%s"', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError : "%s"', request.url)
        else:
            request = failure.request
            self.logger.error('Can not connect : "%s" ', request.url)
Below is the log of the crawler:
2019-01-10 15:33:36 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://www.site1.com/robots.txt>: DNS lookup failed: no results for hostname lookup: www.site1.com.
Traceback (most recent call last):
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\endpoints.py", line 975, in startConnectionAttempts
    "no results for hostname lookup: {}".format(self._hostStr)
As you can see in the log, the handle_error method doesn't apply to the /robots.txt request.
I did some research and found that the middleware can be configured to ignore some of these errors, but so far no luck.
Here is a small refactoring of your handle_error.
def handle_error(self, failure):
    # this is the original request
    request = failure.request
    if failure.check(DNSLookupError):
        self.logger.error('DNSLookupError : "%s"', request.url)
    elif request.url.endswith('/robots.txt'):
        pass
    elif failure.check(HttpError):
        # these exceptions come from the HttpError spider middleware
        # you can get the non-200 response
        response = failure.value.response
        self.logger.error('HttpError : "%s"', response.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('TimeoutError : "%s"', request.url)
    else:
        request = failure.request
        self.logger.error('Can not connect : "%s" ', request.url)
Your log example shows a DNS lookup error, which IMHO should be logged regardless of what the specific URL is (it would fail even if it wasn't for robots.txt, and probably means the entire domain should be skipped there and then).
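If the goal is only to keep these messages out of the log rather than to change the crawl behaviour, another option is to raise the log level of the robots.txt middleware's logger, whose name (scrapy.downloadermiddlewares.robotstxt) is visible in the log above. A minimal sketch:

import logging
import scrapy

class MySpider(scrapy.Spider):
    name = 'mobile'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Suppress "Error downloading <GET .../robots.txt>" entries while leaving
        # the rest of the crawl log untouched.
        logging.getLogger('scrapy.downloadermiddlewares.robotstxt').setLevel(logging.CRITICAL)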
In case anyone else is reading this, a little hacked-together solution I used was to take the base class and comment out the extra detail being logged:
import logging

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class MycrawlerRobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool("CUSTOM_ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))
        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get("dont_obey_robotstxt"):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc
        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")
        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        # if failure.type is not IgnoreRequest:
        #     logger.error(
        #         "Error downloading %(request)s: %(f_exception)s",
        #         {"request": request, "f_exception": failure.value},
        #         exc_info=failure_to_exc_info(failure),
        #         extra={"spider": spider},
        #     )
        if failure.type is not IgnoreRequest:
            logger.error(f"Error downloading robots.txt: {request}")
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)
Then I added this into settings.py:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Custom one written so it doesn't log every 404 response
CUSTOM_ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    ...
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}