How to keep robots.txt errors from showing up in the logs? - python-3.x

I am working on a crawler and want to crawl politely by obeying robots.txt. Because it is a broad crawl, the log file grows large and becomes hard to process, and most of the log entries come from robots.txt not being found on many of the sites.
So my question is: is there a way I can ignore robots.txt-related errors and not log them? I don't need to know whether the file was found or not.
I already have an errback handler for failed requests in my crawler, but it doesn't apply to robots.txt, because that request is made by the Scrapy middleware itself.
Below is my code:
Spider:
import scrapy
from urllib.parse import urlparse
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError


class MySpider(scrapy.Spider):
    name = 'mobile'

    def start_requests(self):
        urls = [
            'https://site1.com',
            'http://site2.com'
        ]
        for url in urls:
            safe_no = 'test'
            yield scrapy.Request(url=url, callback=self.parse,
                                 errback=self.handle_error, meta={'safe_no': safe_no})

    def parse(self, response):
        safe_no = response.meta['safe_no']
        html_doc = response.body
        text_data, contacts, keep_no = self.get_contact(html_doc, response.url)
        # print(contacts, keep_no)
        link_found = False
        data = []
        parsed_uri = urlparse(response.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        ### Parse data and get contact....
        if contacts:
            yield {
                'safe_no': safe_no,
                'url': response.url,
                'contacts': contacts,
                # 'text_data': text_data
            }

    def handle_error(self, failure):
        if failure.check(HttpError):
            # these exceptions come from the HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError : "%s"', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError : "%s"', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError : "%s"', request.url)
        else:
            request = failure.request
            self.logger.error('Can not connect : "%s"', request.url)
Below is the log of the crawler:
2019-01-10 15:33:36 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://www.site1.com/robots.txt>: DNS lookup failed: no results for hostname lookup: www.site1.com.
Traceback (most recent call last):
File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\endpoints.py", line 975, in startConnectionAttempts
"no results for hostname lookup: {}".format(self._hostStr)
As you can see in the log, the handle_error method doesn't apply to the /robots.txt request.
I did some research and found that we can configure middleware to ignore some of the errors, but so far no luck.

Here is a small refactoring of your handle_error.
def handle_error(self, failure):
    # this is the original request
    request = failure.request
    if failure.check(DNSLookupError):
        self.logger.error('DNSLookupError : "%s"', request.url)
    elif request.url.endswith('/robots.txt'):
        pass
    elif failure.check(HttpError):
        # these exceptions come from the HttpError spider middleware
        # you can get the non-200 response
        response = failure.value.response
        self.logger.error('HttpError : "%s"', response.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('TimeoutError : "%s"', request.url)
    else:
        self.logger.error('Can not connect : "%s"', request.url)
Your log example shows a DNS lookup error, which IMHO should be logged regardless of what the specific URL is (it would fail even if it wasn't for robots.txt, and probably means the entire domain should be skipped there and then).
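If the goal is simply to keep these robots.txt messages out of the log, a further option (a minimal sketch, separate from the refactoring above) is to raise the log level of the robots.txt middleware's logger, whose name appears in the log excerpt above, for example from the spider's constructor:

import logging
import scrapy


class MySpider(scrapy.Spider):
    name = 'mobile'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Suppress the ERROR records emitted when a robots.txt download fails;
        # the logger name matches the one shown in the log excerpt above.
        logging.getLogger('scrapy.downloadermiddlewares.robotstxt').setLevel(logging.CRITICAL)

This only silences that particular logger; DNS and timeout errors for normal page requests still reach the log as usual.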

In case anyone else is reading this, a little hacked-together solution I used was to take the base class and comment out the extra detail being printed:
import logging

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy import Request
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class MycrawlerRobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool("CUSTOM_ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))
        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get("dont_obey_robotstxt"):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        # Original logging from Scrapy's RobotsTxtMiddleware, commented out
        # so the full traceback is no longer printed:
        # if failure.type is not IgnoreRequest:
        #     logger.error(
        #         "Error downloading %(request)s: %(f_exception)s",
        #         {"request": request, "f_exception": failure.value},
        #         exc_info=failure_to_exc_info(failure),
        #         extra={"spider": spider},
        #     )
        if failure.type is not IgnoreRequest:
            logger.error(f"Error downloading robots.txt: {request}")
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)
Then I added this into settings.py:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Custom one written so it doesn't log every 404 response
CUSTOM_ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    ...
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}

Related

count successful and unsuccessful post requests for asynchronous post call/request

I need help implementing the logic to count the number of successful POST calls, which are asynchronous in nature (status_code == 200), as well as the failed calls (status_code != 200).
I am new to coroutines. I would appreciate it if someone could suggest a better way of making an asynchronous POST call that can be retried, polled for status, and that can emit metrics for successful POST requests as well.
Following is my code:
asyncio.get_event_loop().run_in_executor(
    None,
    self.publish_actual,
    event_name,
    custom_payload,
    event_message_params,
)
which calls publish_actual:
def publish_actual(
    self,
    event_name: str,
    custom_payload={},
    event_message_params=[],
):
    """Submits a post request using the requests library

    :param event_name: name of the event
    :type event_name: str
    :param key: key for a particular application
    :param custom_payload: custom_payload, defaults to {}
    :type custom_payload: dict, optional
    :param event_message_params: event_message_params, defaults to []
    :type event_message_params: list, optional
    """
    json_data = {}
    path = f"/some/path"
    self.request(path, "POST", json=json_data)
which calls the following request function:
def request(self, api_path, method="GET", **kwargs):
    try:
        self._validate_configuration()
        headers = {}
        api_endpoint = self.service_uri.to_url(api_path)
        logger.debug(api_endpoint)
        if "headers" in kwargs and kwargs["headers"]:
            headers.update(kwargs["headers"])
        headers = {"Content-Type": "application/json"}
        begin = datetime.now()

        def build_success_metrics(response, *args, **kwargs):
            tags = {
                "name": "success_metrics",
                "domain": api_endpoint,
                "status_code": 200,
            }
            build_metrics(tags)

        def check_for_errors(response, *args, **kwargs):
            response.raise_for_status()

        response = self.session.request(
            method=method,
            url=api_endpoint,
            headers=headers,
            timeout=self.timeout,
            hooks={"response": [build_success_metrics, check_for_errors]},
            **kwargs,
        )
        end = datetime.now()
        logger.debug(
            f"'{method}' request against endpoint '{api_endpoint}' took {round((end - begin).total_seconds() * 1000, 3)} ms"
        )
        logger.debug(f"response: {response}")
    except RequestException as e:
        tags = {
            "name": "error_metrics",
            "domain": api_endpoint,
            "exception_class": e.__class__.__name__,
        }
        build_metrics(tags)
        return f"Exception occurred: {e}"
Let me know if anything else is required from my end to explain what exactly I have done and what I am trying to achieve.
There is not much await and async in your example, so I've just addressed the counting part of your question in general asyncio terms. asyncio.Queue is good for this because you can separate the counting from the cause quite simply.
import asyncio
import aiohttp


class Count():
    def __init__(self, queue: asyncio.Queue):
        self.queue = queue
        self.good = 0
        self.bad = 0

    async def count(self):
        while True:
            result = await self.queue.get()
            if result == 'Exit':
                return
            if result == 200:
                self.good += 1
            else:
                self.bad += 1


async def request(q: asyncio.Queue):
    async with aiohttp.ClientSession() as session:
        for _ in range(5):  # just poll 5 times in this instance
            await asyncio.sleep(0.1)
            async with session.get(
                'https://httpbin.org/status/200%2C500', ssl=False
            ) as response:
                q.put_nowait(response.status)
        q.put_nowait('Exit')


async def main():
    q = asyncio.Queue()
    cnt = Count(q)
    tasks = [cnt.count(), request(q)]
    await asyncio.gather(*[asyncio.create_task(t) for t in tasks])
    print(cnt.good, cnt.bad)


if __name__ == "__main__":
    asyncio.run(main())
Output is random given httpbin response. Should add to 5.
4 1
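If a dedicated counter task feels like overkill, the success/failure counts can also be read straight off asyncio.gather with return_exceptions=True. Below is a minimal sketch of that alternative; the httpbin URL is just a stand-in for the real endpoint:

import asyncio
import aiohttp


async def post_once(session, url, payload):
    # Returns the HTTP status; exceptions propagate and are collected by gather.
    async with session.post(url, json=payload) as resp:
        return resp.status


async def main():
    url = "https://httpbin.org/status/200%2C500"  # placeholder endpoint
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[post_once(session, url, {"n": n}) for n in range(5)],
            return_exceptions=True,
        )
    # Anything that is not a 200 status (including raised exceptions) counts as bad.
    good = sum(1 for r in results if r == 200)
    bad = len(results) - good
    print(good, bad)


if __name__ == "__main__":
    asyncio.run(main())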

How do I apply Django middleware everywhere except for a single path?

I'm using Python 3.9 with Django 3. I have defined this middleware ...
MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'corsheaders.middleware.CorsMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
    'directory.middleware.extend_token_response.ExtendTokenResponse',
]
However, I don't want the middleware to apply to a certain URL. I have hard-coded this in the middleware like so
class ExtendTokenResponse:
    def __init__(self, get_response):
        self.get_response = get_response
        # One-time configuration and initialization.

    def __call__(self, request):
        response = self.get_response(request)
        if request.path != '/' + LOGOUT_PATH:
            # Code to be executed for each request before
            # the view (and later middleware) are called.
            is_expired = True
            try:
                token = request.auth
                print("req path: %s" % request.path)
                is_expired = is_token_expired(token) if token else True
            except Exception as err:
                print(err)
            if not is_expired:
but this seems a little sloppy, and I would think the middleware comes with something out of the box so that it wouldn't need to be applied to my "/logout" path. Is there a more elegant way to configure this?
Edit: In response to Bernhard Vallant's answer, I changed my middleware to the below
def token_response_exempt(view_func):
    # Set an attribute on the function to mark it as exempt
    def wrapped_view(*args, **kwargs):
        return view_func(*args, **kwargs)

    wrapped_view.token_response_exempt = True
    return wraps(view_func)(wrapped_view)


class ExtendTokenResponse:
    def __init__(self, get_response):
        self.get_response = get_response
        # One-time configuration and initialization.

    def process_view(self, request, view_func, view_args, view_kwargs):
        print("in process view method ...\n")
        if getattr(view_func, "token_response_exempt", False):
            print("returning none ...\n")
            return None

        # Code to be executed for each request before
        # the view (and later middleware) are called.
        is_expired = True
        try:
            token = request.auth
            print("req path: %s" % request.path)
            is_expired = is_token_expired(token) if token else True
        except Exception as err:
            print(err)
        if not is_expired:
            token.delete()
            new_token = Token.objects.create(user=token.user)
            # Code to be executed for each request/response after
            # the view is called.
            print("setting new token to %s" % new_token)
            request.token = new_token

    def __call__(self, request):
        response = self.get_response(request)
        print("---- in call method ----\n")
        if getattr(request, "token", None) is not None:
            print("setting refresh token header = %s" % request.token)
            response['Refresh-Token'] = request.token
        return response
but any call to an endpoint, e.g.,
curl --header "Content-type: application/json" --data "$req" --request POST "http://localhost:8000/login"
results in no token being retrieved from the request. "request.auth" generates the error
'WSGIRequest' object has no attribute 'auth'
Django itself doesn't provide a solution for this. Hardcoding/defining paths in your settings/middleware is probably fine as long as it is a middleware that primarily exists for one specific project.
However, if you want to mark certain views to exclude them from being processed, you could use decorators in the same way Django does with the csrf_exempt decorator.
from functools import wraps


def token_response_exempt(view_func):
    # Set an attribute on the function to mark it as exempt
    def wrapped_view(*args, **kwargs):
        return view_func(*args, **kwargs)

    wrapped_view.token_response_exempt = True
    return wraps(view_func)(wrapped_view)


# your middleware
class ExtendTokenResponse:
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        response = self.get_response(request)
        if getattr(request, "token", None) is not None:
            response['Refresh-Token'] = request.token
        return response

    def process_view(self, request, view_func, view_args, view_kwargs):
        if getattr(view_func, "token_response_exempt", False):
            return None

        # do your token generation here
        request.token = token
And then you can use the decorator like the following:
# urls.py
urlpatterns = [
    path('logout/', token_response_exempt(LogOutView.as_view())),
]
For your case, I have two recommendations below:
Method 1: use process_view, define in the settings a list of view functions to be excluded (identified by their "app.module.func" path), and check that list in process_view to skip them.
# In settings.py
EXCLUDE_FROM_MY_MIDDLEWARE = {'custom_app.views.About'}

# In middlewares.py
class ExtendTokenResponse:
    def __init__(self, get_response):
        self.get_response = get_response
        # One-time configuration and initialization.

    def __call__(self, request):
        # Code to be executed for each request before
        # the view (and later middleware) are called.
        logger.info(f'request hit request {request}')
        response = self.get_response(request)
        # Code to be executed for each request/response after
        # the view is called.
        return response

    def process_view(self, request, view_func, view_args, view_kwargs):
        view_function = '.'.join((view_func.__module__, view_func.__name__))
        exclusion_set = getattr(settings, 'EXCLUDE_FROM_MY_MIDDLEWARE', set())
        if view_function in exclusion_set:
            return None
Method 2: use decorator_from_middleware and apply the middleware to each view that needs it.
from django.utils.decorators import decorator_from_middleware

# with a function-based view
@decorator_from_middleware(ExtendTokenResponse)
def view_function(request):
    ...

# with a class-based view
class SimpleMiddlewareMixin:
    @decorator_from_middleware(ExtendTokenResponse)
    def dispatch(*args, **kwargs):
        return super().dispatch(*args, **kwargs)


class MyClassBasedView(SimpleMiddlewareMixin, ListView):
    ...
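For completeness, a small sketch of how the class-based view from Method 2 might then be wired up; the module path, view name, and URL are illustrative and not taken from the question:

# urls.py (illustrative)
from django.urls import path

from myapp.views import MyClassBasedView  # hypothetical app/module

urlpatterns = [
    # Requests to this view pass through ExtendTokenResponse via the decorator
    # applied to dispatch() in SimpleMiddlewareMixin, even when the middleware
    # is not listed in settings.MIDDLEWARE.
    path('items/', MyClassBasedView.as_view(), name='item-list'),
]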

AttributeError: Response content isn't text. What is the problem?

I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, and I am getting "AttributeError: Response content isn't text". I am running this code in a python3 venv. Could you help me by explaining and solving the problem?
Here is my code:
import scrapy
import json


class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing informations....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'
SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'
AUTOTHROTTLE_ENABLED = True
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
PROXY_POOL_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
And here are the error messages from the console output:
Thank you...

Speed up using multi threading python3

I am creating a proxy checker, but the problem is that it takes a lot of time because there are a lot of proxies to check.
import requests

def proxy():
    lives = []
    allproxy = []

    def fetch_proxy():
        raw_proxy = []
        res = requests.get(proxy_api)
        raw_proxy = res.text.splitlines()
        return raw_proxy

    allproxy = fetch_proxy()

    for proxy in allproxy:
        try:
            proxyDictChk = {
                "https": "https://" + proxy,
                "http": "http://" + proxy,
            }
            res = requests.get("http://httpbin.org/ip", proxies=proxyDictChk, timeout=3)
            print("Proxy is Working")
            lives.append(proxy)
        except Exception as e:
            print("Proxy Dead")
    return lives

print(proxy())
I am curious how I can use multithreading here to make this faster.
PS. Thanks in advance
The python docs provide a pretty good example, https://docs.python.org/3/library/concurrent.futures.html
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the check operations and mark each future with its proxy
    future_to_proxy = {executor.submit(check_proxy, proxy): proxy for proxy in allproxy}
    for future in concurrent.futures.as_completed(future_to_proxy):
        proxy = future_to_proxy[future]
        try:
            is_valid = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (proxy, exc))
        else:
            print('%s is %s' % (proxy, is_valid))
So you would just need to define the function check_proxy.
def check_proxy(proxy):
    try:
        proxyDictChk = {
            "https": "https://" + proxy,
            "http": "http://" + proxy,
        }
        res = requests.get("http://httpbin.org/ip", proxies=proxyDictChk, timeout=3)
        print("Proxy is Working")
        return True
    except Exception as e:
        print("Proxies Dead!")
        return False
Essentially, use an executor and submit a function that does what you want. Then use the future to get the results of the functions as they're completed.
Also, since this lets the exception bubble up, you don't have to handle it in the function.
def check_proxy(proxy):
    proxyDictChk = {
        "https": "https://" + proxy,
        "http": "http://" + proxy,
    }
    res = requests.get("http://httpbin.org/ip", proxies=proxyDictChk, timeout=3)
    return True
Now the exception can be handled at the future state. You could change the return type to something more meaningful.
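Putting the two pieces together, a minimal end-to-end sketch might look like the following; the proxy_api URL is a placeholder, and the 3-second timeout and max_workers value are assumptions you would tune:

import concurrent.futures
import requests

proxy_api = "http://example.com/proxies"  # placeholder; use your real proxy list URL


def fetch_proxy():
    res = requests.get(proxy_api)
    return res.text.splitlines()


def check_proxy(proxy):
    proxy_dict = {
        "https": "https://" + proxy,
        "http": "http://" + proxy,
    }
    # Raises on connection errors/timeouts; the caller counts that as a dead proxy.
    requests.get("http://httpbin.org/ip", proxies=proxy_dict, timeout=3)
    return True


def main():
    allproxy = fetch_proxy()
    lives = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        future_to_proxy = {executor.submit(check_proxy, p): p for p in allproxy}
        for future in concurrent.futures.as_completed(future_to_proxy):
            proxy = future_to_proxy[future]
            try:
                future.result()
            except Exception:
                print("Proxy dead:", proxy)
            else:
                print("Proxy working:", proxy)
                lives.append(proxy)
    return lives


if __name__ == "__main__":
    print(main())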

How to post group of requests to 2 urls with aiohttp

I have 2 URLs and 60k+ requests. Basically, I need to post every request to both URLs and then compare their responses, but without waiting for a response before posting the next request.
I've tried to do it with aiohttp and asyncio
import asyncio
import time
import aiohttp
import os

from aiofile import AIOFile

testURL = ""
prodURL = ""
directoryWithRequests = ''
directoryToWrite = ''
headers = {'content-type': 'application/soap+xml'}
i = 1


async def fetch(session, url, request):
    global i
    async with session.post(url=url, data=request.encode('utf-8'), headers=headers) as response:
        if response.status != 200:
            async with AIOFile(directoryToWrite + str(i) + '.xml', 'w') as afp:
                await afp.write(request)
                i += 1
        return await response.text()


async def fetch_all(session, urls, request):
    results = await asyncio.gather(*[asyncio.create_task(fetch(session, url, request)) for url in urls])
    return results


async def asynchronousRequests(requestBody):
    urls = [testURL, prodURL]
    global i
    with open(requestBody) as my_file:
        body = my_file.read()
    async with aiohttp.ClientSession() as session:
        htmls = await fetch_all(session, urls, body)
        # some conditions


async def asynchronous():
    try:
        start = time.time()
        futures = [asynchronousRequests(directoryWithRequests + i) for i in os.listdir(directoryWithRequests)]
        for future in asyncio.as_completed(futures):
            result = await future
        print("Process took: {:.2f} seconds".format(time.time() - start))
    except Exception as e:
        print(str(e))


if __name__ == '__main__':
    try:
        # AsyncronTest
        ioloop = asyncio.ProactorEventLoop()
        ioloop.run_until_complete(asynchronous())
        ioloop.close()
        if i == 1:
            print('Regress is OK')
        else:
            print('Number of requests to check = {}'.format(i))
    except Exception as e:
        print(e)
I believe the code above works, but it creates N futures, where N equals the number of request files. This turns into a sort of DDoS, because the server can't respond to that many requests at the same time.
Found a suitable solution. Basically, it's just 2 async tasks:
tasks = [
    postRequest(testURL, client, body),
    postRequest(prodURL, client, body)
]
await asyncio.wait(tasks)
It's not the same performance as the code in the question for a comparable number of requests, but at least it doesn't DDoS the server as much.
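Another option, if you want to keep the original one-task-per-file structure, is to cap concurrency with an asyncio.Semaphore. This is only a sketch, assuming asynchronousRequests and directoryWithRequests from the question are unchanged; the limit of 20 is an arbitrary value to tune:

import asyncio
import os

CONCURRENCY_LIMIT = 20  # assumed limit; tune to what the server can handle


async def limited(semaphore, coro):
    # At most CONCURRENCY_LIMIT coroutines run at once; the rest wait here.
    async with semaphore:
        return await coro


async def asynchronous():
    semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
    tasks = [
        limited(semaphore, asynchronousRequests(directoryWithRequests + name))
        for name in os.listdir(directoryWithRequests)
    ]
    await asyncio.gather(*tasks)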
