Scrapy NotSupported and TimeoutError - python-3.x

My goal is to find every link that contains daraz.com.bd/shop/.
What I have tried so far is below.
import scrapy


class LinksSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['daraz.com.bd']
    extracted_links = []
    shop_list = []

    def start_requests(self):
        start_urls = 'https://www.daraz.com.bd'
        yield scrapy.Request(url=start_urls, callback=self.extract_link)

    def extract_link(self, response):
        str_response_content_type = str(response.headers.get('content-type'))
        if str_response_content_type == "b'text/html; charset=utf-8'":
            links = response.xpath("//a/@href").extract()
            for link in links:
                link = link.lstrip("/")
                if ("https://" or "http://") not in link:
                    link = "https://" + str(link)
                split_link = link.split('.')
                if "daraz.com.bd" in link and link not in self.extracted_links:
                    self.extracted_links.append(link)
                    if len(split_link) > 1:
                        if "www" in link and "daraz" in split_link[1]:
                            yield scrapy.Request(url=link, callback=self.extract_link, dont_filter=True)
                        elif "www" not in link and "daraz" in split_link[0]:
                            yield scrapy.Request(url=link, callback=self.extract_link, dont_filter=True)
                if "daraz.com.bd/shop/" in link and link not in self.shop_list:
                    yield {
                        "links": link
                    }
Here is my settings.py file:
BOT_NAME = 'chotosite'
SPIDER_MODULES = ['chotosite.spiders']
NEWSPIDER_MODULE = 'chotosite.spiders'
ROBOTSTXT_OBEY = False
REDIRECT_ENABLED = False
DOWNLOAD_DELAY = 0.25
USER_AGENT = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'
AUTOTHROTTLE_ENABLED = True
What problem am I facing?
It stops automatically after collecting only 6-7 links that contain daraz.com.bd/shop/.
User timeout caused connection failure: Getting https://www.daraz.com.bd/kettles/ took longer than 180.0 seconds..
INFO: Ignoring response <301 https://www.daraz.com.bd/toner-and-mists/>: HTTP status code is not handled or not allowed
How do I solve those issues? Please help me.
If you have some other idea to reach my goal, I will be more than happy. Thank you.
Here are some console log:
2020-12-04 22:21:23 [scrapy.extensions.logstats] INFO: Crawled 891 pages (at 33 pages/min), scraped 6 items (at 0 items/min)
2020-12-04 22:22:05 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.daraz.com.bd/kettles/> (failed 1 times): User timeout caused connection failure: Getting https://www.daraz.com.bd/kettles/ took longer than 180.0 seconds..
2020-12-04 22:22:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.daraz.com.bd/kettles/> (referer: https://www.daraz.com.bd)
2020-12-04 22:22:11 [scrapy.core.engine] INFO: Closing spider (finished)
2020-12-04 22:22:11 [scrapy.extensions.feedexport] INFO: Stored csv feed (6 items) in: dara.csv
2020-12-04 22:22:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 4,
'downloader/exception_type_count/scrapy.exceptions.NotSupported': 1,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 3,
'downloader/request_bytes': 565004,
'downloader/request_count': 896,
'downloader/request_method_count/GET': 896,
'downloader/response_bytes': 39063472,
'downloader/response_count': 892,
'downloader/response_status_count/200': 838,
'downloader/response_status_count/301': 45,
'downloader/response_status_count/302': 4,
'downloader/response_status_count/404': 5,
'elapsed_time_seconds': 828.333752,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 12, 4, 16, 22, 11, 864492),
'httperror/response_ignored_count': 54,
'httperror/response_ignored_status_count/301': 45,
'httperror/response_ignored_status_count/302': 4,
'httperror/response_ignored_status_count/404': 5,
'item_scraped_count': 6,
'log_count/DEBUG': 901,
'log_count/ERROR': 1,
'log_count/INFO': 78,
'memusage/max': 112971776,
'memusage/startup': 53370880,
'request_depth_max': 5,
'response_received_count': 892,
'retry/count': 3,
'retry/reason_count/twisted.internet.error.TimeoutError': 3,
'scheduler/dequeued': 896,
'scheduler/dequeued/memory': 896,
'scheduler/enqueued': 896,
'scheduler/enqueued/memory': 896,
'start_time': datetime.datetime(2020, 12, 4, 16, 8, 23, 530740)}
2020-12-04 22:22:11 [scrapy.core.engine] INFO: Spider closed (finished)

You can use a LinkExtractor object to extract all the links, and then filter for the ones you want.
In your scrapy shell:
scrapy shell https://www.daraz.com.bd
from scrapy.linkextractors import LinkExtractor
l = LinkExtractor()
links = l.extract_links(response)
for link in links:
    print(link.url)
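Building on that idea, here is a minimal sketch of a spider that crawls with a LinkExtractor and keeps only the /shop/ links (the spider name and the seen-links set are assumptions added for illustration, not part of the original code):

import scrapy
from scrapy.linkextractors import LinkExtractor


class ShopLinksSpider(scrapy.Spider):
    # hypothetical name, for illustration only
    name = 'shop_links'
    allowed_domains = ['daraz.com.bd']
    start_urls = ['https://www.daraz.com.bd']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.link_extractor = LinkExtractor(allow_domains=['daraz.com.bd'])
        self.seen_shop_links = set()

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            url = link.url
            if 'daraz.com.bd/shop/' in url and url not in self.seen_shop_links:
                self.seen_shop_links.add(url)
                yield {'links': url}
            # keep crawling within the allowed domain; Scrapy's dupefilter
            # stops the same page from being fetched twice
            yield scrapy.Request(url, callback=self.parse)

This also sidesteps the manual https:// prefixing, because LinkExtractor already returns absolute URLs.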

Related

selenium script written in _init_ function not executing

I am trying to integrate Selenium with Scrapy to render JavaScript from a website. I have put the Selenium automation code in a constructor; it performs a button click, and then the parse function scrapes the data from the page. But the following errors appear in the terminal window.
code:
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which


class test_2(scrapy.Spider):
    name = 'test_2'
    # allowed_domains=[]
    start_urls = [
        'https://www.jackjones.in/st-search?q=shoes'
    ]

    def _init_(self):
        print("test-1")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome("C:/chromedriver")
        driver.set_window(1920, 1080)
        driver.get("https://www.jackjones.in/st-search?q=shoes")
        tab = driver.find_elements_by_class_name("st-single-product")
        tab[4].click()
        self.html = driver.page_source
        print("test-2")
        driver.close()

    def parse(self, response):
        print("test-3")
        resp = Selector(text=self.html)
        yield {
            'title': resp.xpath("//h1/text()").get()
        }
It appears that the interpreter does not execute the init function before going to the parse function, because neither of its print statements is executed, while the print statement in the parse function does appear in the output.
How do I fix this?
Output:
PS C:\Users\Vasu\summer\scrapy_selenium> scrapy crawl test_2
2022-07-01 13:18:30 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapy_selenium)
2022-07-01 13:18:30 [scrapy.utils.log] INFO: Versions: lxml 4.9.0.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0,
w3lib 1.22.0, Twisted 22.4.0, Python 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL
22.0.0 (OpenSSL 1.1.1p 21 Jun 2022), cryptography 37.0.1, Platform Windows-10-10.0.19044-SP0
2022-07-01 13:18:30 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'scrapy_selenium',
'NEWSPIDER_MODULE': 'scrapy_selenium.spiders',
'SPIDER_MODULES': ['scrapy_selenium.spiders']}
2022-07-01 13:18:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-07-01 13:18:30 [scrapy.extensions.telnet] INFO: Telnet Password: 168b57499cd07735
2022-07-01 13:18:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2022-07-01 13:18:31 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-07-01 13:18:31 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-07-01 13:18:31 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2022-07-01 13:18:31 [scrapy.core.engine] INFO: Spider opened
2022-07-01 13:18:31 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-07-01 13:18:31 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-07-01 13:18:31 [filelock] DEBUG: Attempting to acquire lock 1385261511056 on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:31 [filelock] DEBUG: Lock 1385261511056 acquired on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:32 [filelock] DEBUG: Attempting to release lock 1385261511056 on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:32 [filelock] DEBUG: Lock 1385261511056 released on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.jackjones.in/st-search?q=shoes> (referer: None)
test-3
2022-07-01 13:18:32 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.jackjones.in/st-search?q=shoes> (referer: None)
Traceback (most recent call last):
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\utils\defer.py", line 132, in iter_errback
yield next(it)
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
return next(self.data)
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
return next(self.data)
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\summer\scrapy_selenium\scrapy_selenium\spiders\test_2.py", line 31, in parse
resp=Selector(text=self.html)
AttributeError: 'test_2' object has no attribute 'html'
2022-07-01 13:18:32 [scrapy.core.engine] INFO: Closing spider (finished)
2022-07-01 13:18:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 237,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 20430,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.613799,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 7, 1, 7, 48, 32, 202155),
'httpcompression/response_bytes': 87151,
'httpcompression/response_count': 1,
'log_count/DEBUG': 6,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/AttributeError': 1,
'start_time': datetime.datetime(2022, 7, 1, 7, 48, 31, 588356)}
2022-07-01 13:18:32 [scrapy.core.engine] INFO: Spider closed (finished)
It's __init__, not _init_ (note the double underscores).
Secondly, there is no h1 on the page. Try this instead:
yield {
    'title': resp.xpath("//title/text()").get()
}
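Putting both fixes together, the constructor might look something like this (a sketch only: it keeps the Selenium calls and imports from the question's spider, swaps the non-existent set_window for Selenium's set_window_size, and assumes the chromedriver path from the question):

def __init__(self, *args, **kwargs):
    # double underscores, and pass through to the base Spider
    super().__init__(*args, **kwargs)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # pass the options so --headless actually takes effect; path taken from the question
    driver = webdriver.Chrome("C:/chromedriver", options=chrome_options)
    driver.set_window_size(1920, 1080)
    driver.get("https://www.jackjones.in/st-search?q=shoes")
    # find_elements_by_class_name is as written in the question; newer Selenium
    # versions use driver.find_elements(By.CLASS_NAME, "st-single-product")
    tab = driver.find_elements_by_class_name("st-single-product")
    tab[4].click()
    self.html = driver.page_source
    driver.close()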

Scrapy using files.middleware downloads given file without extension

I want to automate a file exchange. I need to download a .csv file from a website that requires authentication before you can start the download.
First I tried downloading the file with wget, but I did not manage to, so I switched to Scrapy, and everything works fine, my authentication and the download, BUT the file comes without an extension.
Here is a snippet of my spider:
def after_login(self, response):
    accview = response.xpath('//span[@class="user-actions welcome"]')
    if accview:
        print('Logged in')
        file_url = response.xpath('//article[@class="widget-single widget-shape-widget widget"]/p/a/@href').get()
        file_url = response.urljoin(file_url)
        items = StockfwItem()
        items['file_urls'] = [file_url]
        yield items
my settings.py:
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
items.py:
class StockfwItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
The reason why I am sure that there is a problem with my spider is that if I download the file the regular way via the browser, it always comes as a regular CSV file.
When I try to open the downloaded file (the filename is a SHA1 hash), I get the following error message:
File "/usr/lib/python3.6/csv.py", line 111, in __next__
self.fieldnames
File "/usr/lib/python3.6/csv.py", line 98, in fieldnames
self._fieldnames = next(self.reader)
_csv.Error: line contains NULL byte
Also, when I open the downloaded file with Notepad++ and save it with UTF-8 encoding, it works without any problems...
scrapy console output:
{'file_urls': ['https://floraworld.be/Servico.Orchard.FloraWorld/Export/Export'],
'files': [{'checksum': 'f56c6411803ec45863bc9dbea65edcb9',
'path': 'full/cc72731cc79929b50c5afb14e0f7e26dae8f069c',
'status': 'downloaded',
'url': 'https://floraworld.be/Servico.Orchard.FloraWorld/Export/Export'}]}
2021-08-02 10:00:30 [scrapy.core.engine] INFO: Closing spider (finished)
2021-08-02 10:00:30 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2553,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 2,
'downloader/request_method_count/POST': 2,
'downloader/response_bytes': 76289,
'downloader/response_count': 4,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/302': 1,
'elapsed_time_seconds': 20.892172,
'file_count': 1,
'file_status_count/downloaded': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 8, 2, 8, 0, 30, 704638),
'item_scraped_count': 1,
'log_count/DEBUG': 6,
'log_count/INFO': 10,
'log_count/WARNING': 1,
'memusage/max': 55566336,
'memusage/startup': 55566336,
'request_depth_max': 1,
'response_received_count': 3,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'splash/render.html/request_count': 1,
'splash/render.html/response_count/200': 1,
'start_time': datetime.datetime(2021, 8, 2, 8, 0, 9, 812466)}
2021-08-02 10:00:30 [scrapy.core.engine] INFO: Spider closed (finished)
snippet of downloaded file and opened with vim on ubuntu server:
"^#A^#r^#t^#i^#c^#l^#e^#C^#o^#d^#e^#"^#;^#"^#D^#e^#s^#c^#r^#i^#p^#t^#i^#o^#n^#"^#;^#"^#B^#B^#"^#;^#"^#K^#T^#"^#;^#"^#S^#i^#z^#e^#"^#;^#"^#P^#r^#i^#c^#e^#"^#;^#"^#S^#t^#o^#c^#k^#"^#;^#"^#D^#e^#l^#i^#v^#e^#r^#y^#D^#a^#t^#e^#"^#^M^#
^#"^#1^#0^#0^#0^#L^#"^#;^#"^#A^#l^#o^#e^# ^#p^#l^#a^#n^#t^# ^#x^# ^#2^#3^# ^#l^#v^#s^#"^#;^#"^#4^#"^#;^#"^#4^#"^#;^#"^#6^#5^#"^#;^#"^#4^#6^#,^#7^#7^#"^#;^#"^#1^#1^#8^#,^#0^#0^#0^#0^#0^#"^#;^#"^#"^#^M^#
^#"^#1^#0^#0^#0^#M^#"^#;^#"^#A^#l^#o^#e^# ^#p^#l^#a^#n^#t^# ^#x^# ^#1^#7^# ^#l^#v^#s^#"^#;^#"^#4^#"^#;^#"^#1^#2^#"^#;^#"^#5^#0^#"^#;^#"^#3^#2^#,^#6^#1^#"^#;^#"^#2^#0^#6^#,^#0^#0^#0^#0^#0^#"^#;^#"^#"^#^M^#
^#"^#1^#0^#0^#0^#S^#"^#;^#"^#A^#l^#o^#e^# ^#p^#l^#a^#n^#t^# ^#x^# ^#1^#6^# ^#l^#v^#s^#"^#;^#"^#4^#"^#;^#"^#2^#4^#"^#;^#"^#4^#0^#"^#;^#"^#2^#2^#,^#3^#2^#"^#;^#"^#-^#6^#,^#0^#0^#0^#0^#0^#"^#;^#"^#2^#3^#/^#0^#8^#/^#2^#0^#2^#1^#"^#^M^#
^#"^#1^#0^#0^#2^#M^#"^#;^#"^#B^#A^#T^#O^#N^# ^#P^#L^#A^#N^#T^# ^#6^#7^# ^#C^#M^# ^#W^#/^#P^#O^#T^#"^#;^#"^#2^#"^#;^#"^#6^#"^#;^#"^#6^#7^#"^#;^#"^#2^#2^#,^#4^#2^#"^#;^#"^#3^#3^#,^#0^#0^#0^#0^#0^#"^#;^#"^#5^#/^#0^#9^#/^#2^#0^#2^#1^#"^#^M^#
^#"^#1^#0^#0^#2^#S^#"^#;^#"^#B^#A^#T^#O^#N^# ^#P^#L^#A^#N^#T^# ^#4^#2^# ^#C^#M^# ^#W^#/^#P^#O^#T^#"^#;^#"^#4^#"^#;^#"^#1^#2^#"^#;^#"^#4^#2^#"^#;^#"^#1^#0^#,^#5^#4^#"^#;^#"^#-^#9^#5^#,^#0^#0^#0^#0^#0^#"^#;^#"^#5^#/^#0^#9^#/^#2^#0^#2^#1^#"^#^M^#
^#"^#1^#0^#0^#4^#N^#"^#;^#"^#B^#a^#t^#o^#n^# ^#P^#l^#a^#n^#t^#"^#;^#"^#2^#"^#;^#"^#2^#"^#;^#"^#9^#9^#"^#;^#"^#1^#2^#0^#,^#9^#5^#"^#;^#"^#5^#3^#,^#0^#0^#0^#0^#0^#"^#;^#"^#3^#0^#/^#0^#9^#/^#2^#0^#2^#1^#"^#^M^#
^#"^#1^#0^#0^#5^#N^#"^#;^#"^#N^#a^#t^#u^#r^#a^#l^# ^#s^#t^#r^#e^#l^#i^#t^#z^#i^#a^# ^#w^#/^#p^#o^#t^#"^#;^#"^#1^#"^#;^#"^#1^#"^#;^#"^#1^#3^#0^#"^#;^#"^#2^#0^#7^#,^#4^#4^#"^#;^#"^#1^#4^#,^#0^#0^#0^#0^#0^#"^#;^#"^#1^#/^#1^#2^#/^#2^#0^#2^#1^#"^#^M^#
What the heck is this??
When I change the filename to file.csv, download the file to my Windows desktop and open it with Notepad++ again, it looks good:
"ArticleCode";"Description";"BB";"KT";"Size";"Price";"Stock";"DeliveryDate"
"1000L";"Aloe plant x 23 lvs";"4";"4";"65";"46,77";"118,00000";""
"1000M";"Aloe plant x 17 lvs";"4";"12";"50";"32,61";"206,00000";""
"1000S";"Aloe plant x 16 lvs";"4";"24";"40";"22,32";"-6,00000";"23/08/2021"
"1002M";"BATON PLANT 67 CM W/POT";"2";"6";"67";"22,42";"33,00000";"5/09/2021"
"1002S";"BATON PLANT 42 CM W/POT";"4";"12";"42";"10,54";"-95,00000";"5/09/2021"
For all those who run into the same problem:
I just ran this in my terminal:
cat Inputfile | tr -d '\0' > Outputfile.csv
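The null byte after every visible character in the vim dump strongly suggests the export is UTF-16 encoded rather than corrupt. Assuming that is the case, an alternative to stripping the null bytes is to re-encode the file in Python (file names here are just placeholders):

# re-encode a UTF-16 download as plain UTF-8 CSV
# (assumption: the export really is UTF-16; use 'utf-16-le' if there is no BOM)
with open('Inputfile', 'r', encoding='utf-16') as src, \
        open('Outputfile.csv', 'w', encoding='utf-8', newline='') as dst:
    dst.write(src.read())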
First of all try to change the encoding in vim:
set fileencodings=utf-8
or open it in a different text editor on your Ubuntu machine; maybe it's just a problem with vim.
Second thing to do is to download the file with the correct name:
import os
from urllib.parse import unquote

from itemadapter import ItemAdapter
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline


class TempPipeline():
    def process_item(self, item, spider):
        return item


class ProcessPipeline(FilesPipeline):
    # Overridable Interface
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        return [Request(u) for u in urls]

    def file_path(self, request, response=None, info=None, *, item=None):
        # return 'files/' + os.path.basename(urlparse(request.url).path)  # from the scrapy documentation
        return os.path.basename(unquote(request.url))  # this is what worked for my project, but maybe you'll want to add ".csv"
also you need to change settings.py so it points at the pipeline defined above:
ITEM_PIPELINES = {
    'myproject.pipelines.ProcessPipeline': 300
}
FILES_STORE = '/path/to/valid/dir'
Try those two things and if it still doesn't work then update me please.
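Since the URL here ends in /Export/Export with no extension at all, the file_path override above would still produce an extension-less name. A variant that forces a .csv suffix could look like this (a sketch, not tested against that site):

def file_path(self, request, response=None, info=None, *, item=None):
    name = os.path.basename(unquote(request.url)) or 'download'
    # append .csv when the URL does not carry an extension
    if not os.path.splitext(name)[1]:
        name += '.csv'
    return name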
I think your file contains null bytes.
The issue might be:
Your items.py contains two fields, file_urls and files, but your spider yields only one of them, i.e. file_urls. Thus the CSV gets created with two columns (file_urls, files), but the files column does not contain any data (which might be causing this problem). Try commenting that line out and see if it works: #files = scrapy.Field().
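If you want to confirm the null-byte theory before changing anything, a quick look at the raw bytes of the downloaded file is enough (the path below is the hashed name from the log output above):

with open('full/cc72731cc79929b50c5afb14e0f7e26dae8f069c', 'rb') as f:
    print(f.read(32))
# a leading b'\xff\xfe' (UTF-16 BOM) and \x00 between the characters would point
# to a UTF-16 export rather than a broken download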

Scrapy 1.6 : DNS lookup failed

I am new to Scrapy and I'm trying to crawl this website, https://www.timeanddate.com/weather/india, and it's throwing a DNS lookup error. The code I wrote for scraping works perfectly in the shell, so my guess is that the DNS error happens before any scraping takes place.
This is what I get:
2019-05-02 11:59:03 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: IndiaWeather)
2019-05-02 11:59:03 [scrapy.utils.log] INFO: Versions: lxml 4.3.2.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b 26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.17134-SP0
2019-05-02 11:59:03 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'IndiaWeather', 'NEWSPIDER_MODULE': 'IndiaWeather.spiders', 'SPIDER_MODULES': ['IndiaWeather.spiders']}
2019-05-02 11:59:03 [scrapy.extensions.telnet] INFO: Telnet Password: 688b4fe759cb3ed5
2019-05-02 11:59:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2019-05-02 11:59:03 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-05-02 11:59:03 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-05-02 11:59:03 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2019-05-02 11:59:03 [scrapy.core.engine] INFO: Spider opened
2019-05-02 11:59:03 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-05-02 11:59:03 [py.warnings] WARNING: C:\Users\Abrar\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py:61: URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://www.timeanddate.com/weather/india in allowed_domains.
warnings.warn(message, URLWarning)
2019-05-02 11:59:03 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-05-02 11:59:05 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://https//www.timeanddate.com/weather/india/> (failed 1 times): DNS lookup failed: no results for hostname lookup: https.
2019-05-02 11:59:08 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://https//www.timeanddate.com/weather/india/> (failed 2 times): DNS lookup failed: no results for hostname lookup: https.
2019-05-02 11:59:10 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET http://https//www.timeanddate.com/weather/india/> (failed 3 times): DNS lookup failed: no results for hostname lookup: https.
2019-05-02 11:59:10 [scrapy.core.scraper] ERROR: Error downloading <GET http://https//www.timeanddate.com/weather/india/>
Traceback (most recent call last):
File "C:\Users\Abrar\Anaconda3\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "C:\Users\Abrar\Anaconda3\lib\site-packages\twisted\python\failure.py", line 512, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "C:\Users\Abrar\Anaconda3\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\Users\Abrar\Anaconda3\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\Abrar\Anaconda3\lib\site-packages\twisted\internet\endpoints.py", line 975, in startConnectionAttempts
"no results for hostname lookup: {}".format(self._hostStr)
twisted.internet.error.DNSLookupError: DNS lookup failed: no results for hostname lookup: https.
2019-05-02 11:59:10 [scrapy.core.engine] INFO: Closing spider (finished)
2019-05-02 11:59:10 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 3,
'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 3,
'downloader/request_bytes': 717,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 5, 2, 6, 29, 10, 505262),
'log_count/DEBUG': 3,
'log_count/ERROR': 1,
'log_count/INFO': 9,
'log_count/WARNING': 1,
'retry/count': 2,
'retry/max_reached': 1,
'retry/reason_count/twisted.internet.error.DNSLookupError': 2,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2019, 5, 2, 6, 29, 3, 412894)}
2019-05-02 11:59:10 [scrapy.core.engine] INFO: Spider closed (finished)
I have posted everything above.
Look at this error message you get:
2019-05-02 11:59:10 [scrapy.core.scraper] ERROR: Error downloading <GET http://https//www.timeanddate.com/weather/india/>
You have doubled the scheme in the URL (both http and https), and the second one is also invalid (there is no : after https). The first problem usually happens if you use the scrapy genspider command-line command and specify the domain with the scheme already included.
So, remove one of the schemes from the start_urls URLs.
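For example, after running scrapy genspider the relevant spider attributes should look roughly like this (the class and spider name here are assumed):

import scrapy


class WeatherSpider(scrapy.Spider):
    name = 'weather'
    # domains only, no scheme
    allowed_domains = ['timeanddate.com']
    # full URL with a single scheme
    start_urls = ['https://www.timeanddate.com/weather/india']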
Please check the values of
allowed_domains = ['abc.xyz.domain_name/']
start_urls = ['http://abc.xyz.domain_name//']
in your program.
Maybe you have written http twice, or maybe you added http to allowed_domains.

Scrapy spider finishing scraping process without scraping anything

I have this spider that scrapes Amazon for information.
The spider reads a .txt file in which I write which product it must search for, and then it goes to the Amazon page for that product, for example:
https://www.amazon.com/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=laptop
I use the keyword=laptop part to change which product to search for.
The issue I'm having is that the spider just does not work, which is weird because a week ago it did its job just fine.
Also, no errors appear on the console; the spider starts, "crawls" the keyword and then just stops.
Here is the full spider
import scrapy
import re
import string
import random
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from genericScraper.items import GenericItem
from scrapy.exceptions import CloseSpider
from scrapy.http import Request


class GenericScraperSpider(CrawlSpider):
    name = "generic_spider"
    # Allowed domain
    allowed_domain = ['www.amazon.com']
    search_url = 'https://www.amazon.com/s?field-keywords={}'
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'datosGenericos.csv'
    }
    rules = {
        # Gets all the elements on page 1 for the keyword I search
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[contains(@class, "s-access-detail-page")]')),
             callback='parse_item', follow=False)
    }

    def start_requests(self):
        txtfile = open('productosGenericosABuscar.txt', 'r')
        keywords = txtfile.readlines()
        txtfile.close()
        for keyword in keywords:
            yield Request(self.search_url.format(keyword))

    def parse_item(self, response):
        genericAmz_item = GenericItem()
        # product info
        categoria = response.xpath('normalize-space(//span[contains(@class, "a-list-item")]//a/text())').extract_first()
        genericAmz_item['nombreProducto'] = response.xpath('normalize-space(//span[contains(@id, "productTitle")]/text())').extract()
        genericAmz_item['precioProducto'] = response.xpath('//span[contains(@id, "priceblock")]/text()'.strip()).extract()
        genericAmz_item['opinionesProducto'] = response.xpath('//div[contains(@id, "averageCustomerReviews_feature_div")]//i//span[contains(@class, "a-icon-alt")]/text()'.strip()).extract()
        genericAmz_item['urlProducto'] = response.request.url
        genericAmz_item['categoriaProducto'] = re.sub('Back to search results for |"', '', categoria)
        yield genericAmz_item
Other spiders I made with a similar structure also work; any idea what's going on?
Here's what I get in the console:
2019-01-31 22:49:26 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: genericScraper)
2019-01-31 22:49:26 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 18.7.0, Python 3.7.0 (default, Jun 28 2018, 08:04:48) [MSC v.1912 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2p 14 Aug 2018), cryptography 2.3.1, Platform Windows-10-10.0.17134-SP0
2019-01-31 22:49:26 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_ENABLED': True, 'BOT_NAME': 'genericScraper', 'DOWNLOAD_DELAY': 3, 'FEED_FORMAT': 'csv', 'FEED_URI': 'datosGenericos.csv', 'NEWSPIDER_MODULE': 'genericScraper.spiders', 'SPIDER_MODULES': ['genericScraper.spiders'], 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
2019-01-31 22:49:26 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.throttle.AutoThrottle']
2019-01-31 22:49:26 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-01-31 22:49:26 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-01-31 22:49:26 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2019-01-31 22:49:26 [scrapy.core.engine] INFO: Spider opened
2019-01-31 22:49:26 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-01-31 22:49:26 [scrapy.extensions.telnet] DEBUG: Telnet console listening on xxx.x.x.x:xxxx
2019-01-31 22:49:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/s?field-keywords=Laptop> (referer: None)
2019-01-31 22:49:27 [scrapy.core.engine] INFO: Closing spider (finished)
2019-01-31 22:49:27 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 315,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 2525,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 2, 1, 1, 49, 27, 375619),
'log_count/DEBUG': 2,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2019, 2, 1, 1, 49, 26, 478037)}
2019-01-31 22:49:27 [scrapy.core.engine] INFO: Spider closed (finished)
Interesting! It is possibly because the website isn't returning any data. Have you tried debugging with scrapy shell? If not, check whether response.body contains the data you want to crawl.
def parse_item(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
For more details, please read the detailed info on scrapy shell.
After debugging, if you are still not getting the intended data, that means there is more to the site obstructing the crawling process. That could be a dynamic script or a cookie/local-storage/session dependency.
For dynamic/JS scripts, you can use Selenium or Splash.
selenium-with-scrapy-for-dynamic-page
handling-javascript-in-scrapy-with-splash
For cookie/local-storage/session dependencies, you have to look deeper into the inspect window and find out which one is essential for getting the data.
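If the page does turn out to need JavaScript rendering, a Splash-based request could replace the plain Request in start_requests, along the lines of the sketch below (it assumes the scrapy-splash package is installed and configured with a running Splash instance; the file reading is taken from the spider above):

from scrapy_splash import SplashRequest


class GenericScraperSpider(CrawlSpider):
    # ... same attributes as in the spider above ...

    def start_requests(self):
        txtfile = open('productosGenericosABuscar.txt', 'r')
        keywords = txtfile.readlines()
        txtfile.close()
        for keyword in keywords:
            # render the page in Splash before parsing; the wait time is illustrative
            yield SplashRequest(
                self.search_url.format(keyword.strip()),  # strip the trailing newline from readlines()
                callback=self.parse_item,
                args={'wait': 2},
            )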

Scrapy's pagination error

Hi guys, I'm getting the following pagination error while trying to scrape a website:
2017-07-27 18:30:21 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.pedidosja.com.br/restaurantes/sao-paulo?a=rua%20tenente%20negr%C3%A3o%20200&cep=04530030&doorNumber=200&bairro=Itaim%20Bibi&lat=-23.585202&lng=-46.671715199999994> (referer: None)
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/root/Documents/Spiders/pedidosYa/pedidosYa/spiders/pedidosya.py", line 35, in parse
next_page_url = response.urljoin(next_page_url)
File "/usr/local/lib/python3.5/dist-packages/scrapy/http/response/text.py", line 82, in urljoin
return urljoin(get_base_url(self), url)
File "/usr/lib/python3.5/urllib/parse.py", line 416, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "/usr/lib/python3.5/urllib/parse.py", line 112, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2017-07-27 18:30:21 [scrapy.core.engine] INFO: Closing spider (finished)
2017-07-27 18:30:21 [scrapy.extensions.feedexport] INFO: Stored csv feed (13 items) in: test3.csv
2017-07-27 18:30:21 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 653,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 62571,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 7, 27, 23, 30, 21, 221038),
'item_scraped_count': 13,
'log_count/DEBUG': 16,
'log_count/ERROR': 1,
'log_count/INFO': 8,
'memusage/max': 49278976,
'memusage/startup': 49278976,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/TypeError': 1,
'start_time': datetime.datetime(2017, 7, 27, 23, 30, 17, 538310)}
2017-07-27 18:30:21 [scrapy.core.engine] INFO: Spider closed (finished)
The spider is raising a TypeError: "Cannot mix str and non-str arguments". I'm not very experienced in Python, so I would also appreciate some resources where I could learn about this type of error. Below you will find the code of the spider.
# -*- coding: utf-8 -*-
import scrapy
from pedidosYa.items import PedidosyaItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose


class PedidosyaSpider(scrapy.Spider):
    name = 'pedidosya'
    allowed_domains = ['www.pedidosya.com.br']
    start_urls = [
        'https://www.pedidosja.com.br/restaurantes/sao-paulo?a=rua%20tenente%20negr%C3%A3o%20200&cep=04530030&doorNumber=200&bairro=Itaim%20Bibi&lat=-23.585202&lng=-46.671715199999994']

    def parse(self, response):
        # need to define wrapper
        for wrapper in response.css('.restaurant-wrapper.peyaCard.show.with_tags'):
            l = ItemLoader(item=PedidosyaItem(), selector=wrapper)
            l.add_css('Name', 'a.arrivalName::text')
            l.add_css('Menu1', 'span.categories > span::text', MapCompose(str.strip))
            l.add_css('Menu2', 'span.categories > span + span::text', MapCompose(str.strip))
            l.add_css('Menu3', 'span.categories > span + span + span::text', MapCompose(str.strip))
            l.add_css('Address', 'span.address::text', MapCompose(str.strip))
            l.add_css('DeliveryTime', 'i.delTime::text', MapCompose(str.strip))
            l.add_css('CreditCard', 'ul.content_credit_cards > li > img::attr(alt)', MapCompose(str.strip))
            l.add_css('DeliveryCost', 'div.shipping > i::text', MapCompose(str.strip))
            l.add_css('Rankink', 'span.ranking i::text', MapCompose(str.strip))
            l.add_css('No', 'span.ranking a::text', MapCompose(str.strip))
            l.add_css('Sponsor', 'span.grey_small.not-logged::text', MapCompose(str.strip))
            l.add_css('DeliveryMinimun', 'div.minDelivery::text', MapCompose(str.strip))
            l.add_css('Distance', 'div.distance i::text', MapCompose(str.strip))
            yield l.load_item()

        next_page_url = response.css('li.arrow.next > a ::attr(href)').extract()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Thank you in advance and have a wonderful day!!
next_page_url = response.css('li.arrow.next > a ::attr(href)').extract()
                                                               ^^^^^^^^^
if next_page_url:
    next_page_url = response.urljoin(next_page_url)
                             ^^^^^^^
Here you are calling urljoin on a list, since the extract() method used to build next_page_url returns a list of all matching values, even if there is only one.
To remedy this, use extract_first() instead:
next_page_url = response.css('li.arrow.next > a ::attr(href)').extract_first()
                                                               ^^^^^^^^^^^^^^^
The problem is in this line:
next_page_url = response.css('li.arrow.next > a::attr(href)').extract()
because the extract() method always returns a list of results, even if it finds just one. Either use the extract_first() method, which will give you just the first result:
next_page_url = response.css('li.arrow.next > a::attr(href)').extract_first()
or take the first element of the results list yourself:
next_page_url = response.css('li.arrow.next > a::attr(href)').extract()[0]
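Putting it together, the end of parse would then look roughly like this (same selectors as above, just consolidated):

next_page_url = response.css('li.arrow.next > a::attr(href)').extract_first()
if next_page_url:
    yield scrapy.Request(url=response.urljoin(next_page_url), callback=self.parse)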
