Scrapy using files.middleware downloads given file without extension - python-3.x

I want to automate a file exchange: I need to download a .csv file from a website that requires authentication before you can start the download.
First I tried to download the file with wget, but I did not manage to, so I switched to Scrapy, and everything works fine, the authentication and the download, BUT the file comes without an extension.
Here is a snippet of my spider:
def after_login(self, response):
    accview = response.xpath('//span[@class="user-actions welcome"]')
    if accview:
        print('Logged in')
        file_url = response.xpath('//article[@class="widget-single widget-shape-widget widget"]/p/a/@href').get()
        file_url = response.urljoin(file_url)
        items = StockfwItem()
        items['file_urls'] = [file_url]
        yield items
my settings.py:
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
items.py:
file_urls = scrapy.Field()
files = scrapy.Field()
The reason I am sure there is a problem with my spider is that if I download the file manually via the browser, it always comes as a regular CSV file.
When I try to open the downloaded file (the filename is a SHA-1 hash), I get the following error message:
File "/usr/lib/python3.6/csv.py", line 111, in __next__
self.fieldnames
File "/usr/lib/python3.6/csv.py", line 98, in fieldnames
self._fieldnames = next(self.reader)
_csv.Error: line contains NULL byte
Also, when I open the downloaded file with Notepad++ and re-save it with UTF-8 encoding, it works without any problems...
Scrapy console output:
{'file_urls': ['https://floraworld.be/Servico.Orchard.FloraWorld/Export/Export'],
 'files': [{'checksum': 'f56c6411803ec45863bc9dbea65edcb9',
            'path': 'full/cc72731cc79929b50c5afb14e0f7e26dae8f069c',
            'status': 'downloaded',
            'url': 'https://floraworld.be/Servico.Orchard.FloraWorld/Export/Export'}]}
2021-08-02 10:00:30 [scrapy.core.engine] INFO: Closing spider (finished)
2021-08-02 10:00:30 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2553,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 2,
'downloader/request_method_count/POST': 2,
'downloader/response_bytes': 76289,
'downloader/response_count': 4,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/302': 1,
'elapsed_time_seconds': 20.892172,
'file_count': 1,
'file_status_count/downloaded': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 8, 2, 8, 0, 30, 704638),
'item_scraped_count': 1,
'log_count/DEBUG': 6,
'log_count/INFO': 10,
'log_count/WARNING': 1,
'memusage/max': 55566336,
'memusage/startup': 55566336,
'request_depth_max': 1,
'response_received_count': 3,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'splash/render.html/request_count': 1,
'splash/render.html/response_count/200': 1,
'start_time': datetime.datetime(2021, 8, 2, 8, 0, 9, 812466)}
2021-08-02 10:00:30 [scrapy.core.engine] INFO: Spider closed (finished)
Snippet of the downloaded file, opened with vim on my Ubuntu server:
"^@A^@r^@t^@i^@c^@l^@e^@C^@o^@d^@e^@"^@;^@"^@D^@e^@s^@c^@r^@i^@p^@t^@i^@o^@n^@"^@;^@"^@B^@B^@"^@;^@"^@K^@T^@"^@;^@"^@S^@i^@z^@e^@"^@;^@"^@P^@r^@i^@c^@e^@"^@;^@"^@S^@t^@o^@c^@k^@"^@;^@"^@D^@e^@l^@i^@v^@e^@r^@y^@D^@a^@t^@e^@"^@^M^@
^@"^@1^@0^@0^@0^@L^@"^@;^@"^@A^@l^@o^@e^@ ^@p^@l^@a^@n^@t^@ ^@x^@ ^@2^@3^@ ^@l^@v^@s^@"^@;^@"^@4^@"^@;^@"^@4^@"^@;^@"^@6^@5^@"^@;^@"^@4^@6^@,^@7^@7^@"^@;^@"^@1^@1^@8^@,^@0^@0^@0^@0^@0^@"^@;^@"^@"^@^M^@
^@"^@1^@0^@0^@0^@M^@"^@;^@"^@A^@l^@o^@e^@ ^@p^@l^@a^@n^@t^@ ^@x^@ ^@1^@7^@ ^@l^@v^@s^@"^@;^@"^@4^@"^@;^@"^@1^@2^@"^@;^@"^@5^@0^@"^@;^@"^@3^@2^@,^@6^@1^@"^@;^@"^@2^@0^@6^@,^@0^@0^@0^@0^@0^@"^@;^@"^@"^@^M^@
^@"^@1^@0^@0^@0^@S^@"^@;^@"^@A^@l^@o^@e^@ ^@p^@l^@a^@n^@t^@ ^@x^@ ^@1^@6^@ ^@l^@v^@s^@"^@;^@"^@4^@"^@;^@"^@2^@4^@"^@;^@"^@4^@0^@"^@;^@"^@2^@2^@,^@3^@2^@"^@;^@"^@-^@6^@,^@0^@0^@0^@0^@0^@"^@;^@"^@2^@3^@/^@0^@8^@/^@2^@0^@2^@1^@"^@^M^@
^@"^@1^@0^@0^@2^@M^@"^@;^@"^@B^@A^@T^@O^@N^@ ^@P^@L^@A^@N^@T^@ ^@6^@7^@ ^@C^@M^@ ^@W^@/^@P^@O^@T^@"^@;^@"^@2^@"^@;^@"^@6^@"^@;^@"^@6^@7^@"^@;^@"^@2^@2^@,^@4^@2^@"^@;^@"^@3^@3^@,^@0^@0^@0^@0^@0^@"^@;^@"^@5^@/^@0^@9^@/^@2^@0^@2^@1^@"^@^M^@
^@"^@1^@0^@0^@2^@S^@"^@;^@"^@B^@A^@T^@O^@N^@ ^@P^@L^@A^@N^@T^@ ^@4^@2^@ ^@C^@M^@ ^@W^@/^@P^@O^@T^@"^@;^@"^@4^@"^@;^@"^@1^@2^@"^@;^@"^@4^@2^@"^@;^@"^@1^@0^@,^@5^@4^@"^@;^@"^@-^@9^@5^@,^@0^@0^@0^@0^@0^@"^@;^@"^@5^@/^@0^@9^@/^@2^@0^@2^@1^@"^@^M^@
^@"^@1^@0^@0^@4^@N^@"^@;^@"^@B^@a^@t^@o^@n^@ ^@P^@l^@a^@n^@t^@"^@;^@"^@2^@"^@;^@"^@2^@"^@;^@"^@9^@9^@"^@;^@"^@1^@2^@0^@,^@9^@5^@"^@;^@"^@5^@3^@,^@0^@0^@0^@0^@0^@"^@;^@"^@3^@0^@/^@0^@9^@/^@2^@0^@2^@1^@"^@^M^@
^@"^@1^@0^@0^@5^@N^@"^@;^@"^@N^@a^@t^@u^@r^@a^@l^@ ^@s^@t^@r^@e^@l^@i^@t^@z^@i^@a^@ ^@w^@/^@p^@o^@t^@"^@;^@"^@1^@"^@;^@"^@1^@"^@;^@"^@1^@3^@0^@"^@;^@"^@2^@0^@7^@,^@4^@4^@"^@;^@"^@1^@4^@,^@0^@0^@0^@0^@0^@"^@;^@"^@1^@/^@1^@2^@/^@2^@0^@2^@1^@"^@^M^@
What the heck is this??
When I change the filename to file.csv, download the file to my Windows desktop, and open it with Notepad++ again, it looks fine:
"ArticleCode";"Description";"BB";"KT";"Size";"Price";"Stock";"DeliveryDate"
"1000L";"Aloe plant x 23 lvs";"4";"4";"65";"46,77";"118,00000";""
"1000M";"Aloe plant x 17 lvs";"4";"12";"50";"32,61";"206,00000";""
"1000S";"Aloe plant x 16 lvs";"4";"24";"40";"22,32";"-6,00000";"23/08/2021"
"1002M";"BATON PLANT 67 CM W/POT";"2";"6";"67";"22,42";"33,00000";"5/09/2021"
"1002S";"BATON PLANT 42 CM W/POT";"4";"12";"42";"10,54";"-95,00000";"5/09/2021"

For all those who suffer from the same problem, I just ran this in my terminal:
cat Inputfile | tr -d '\0' > Outputfile.csv
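The same cleanup can be done from Python; here is a minimal sketch (the file names are illustrative). The first variant is the direct equivalent of tr -d '\0'; the second assumes the file really is UTF-16 encoded, which the interleaved NUL bytes suggest, and re-saves it as UTF-8:

# Python equivalent of `tr -d '\0'`: strip the NUL bytes (file names are illustrative).
with open('Inputfile', 'rb') as src, open('Outputfile.csv', 'wb') as dst:
    dst.write(src.read().replace(b'\x00', b''))

# Alternative sketch: decode the file as UTF-16 (switch to 'utf-16-le' if there is no BOM)
# and write it back out as UTF-8.
with open('Inputfile', 'rb') as src:
    text = src.read().decode('utf-16')
with open('Outputfile.csv', 'w', encoding='utf-8', newline='') as dst:
    dst.write(text)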

First of all, try to change the encoding in vim:
set fileencodings=utf-8
or open it in a different text editor on your Ubuntu machine; maybe it's just a problem with vim.
The second thing to do is to download the file with the correct name:
import os
from urllib.parse import unquote

from itemadapter import ItemAdapter  # needed for ItemAdapter below
from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request

class TempPipeline():
    def process_item(self, item, spider):
        return item

class ProcessPipeline(FilesPipeline):
    # Overridable Interface
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        return [Request(u) for u in urls]

    def file_path(self, request, response=None, info=None, *, item=None):
        # return 'files/' + os.path.basename(urlparse(request.url).path)  # from the Scrapy documentation
        return os.path.basename(unquote(request.url))  # this is what worked for my project, but maybe you'll want to add ".csv"
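Since the export URL here has no extension at all (it ends in /Export), a hedged variant of file_path is to append ".csv" yourself; the 'export' fallback name below is only an illustration:

def file_path(self, request, response=None, info=None, *, item=None):
    # Variant sketch: keep the last URL segment but force a .csv extension,
    # because the export URL itself carries no extension.
    name = os.path.basename(unquote(request.url)) or 'export'  # 'export' is an illustrative fallback
    return name + '.csv'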
You also need to change settings.py:
ITEM_PIPELINES = {
    'myproject.pipelines.ProcessPipeline': 300,
}
FILES_STORE = '/path/to/valid/dir'
Try those two things and if it still doesn't work then update me please.

I think your file contains null bytes.
The issue might be this: your items.py contains two fields, file_urls and files, but your spider yields only one of them, file_urls. Thus the CSV gets created with two columns (file_urls, files), but the files column does not contain any data (which might be causing this problem). Try commenting out this line and see if it works: #files = scrapy.Field().
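A quick way to check for the null bytes (a sketch; the path is the one from your log output) is to print the first raw bytes of the downloaded file, which will also reveal a UTF-16 byte order mark if there is one:

# Inspect the first bytes of the downloaded file to see its real encoding.
with open('full/cc72731cc79929b50c5afb14e0f7e26dae8f069c', 'rb') as f:
    print(f.read(32))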

Related

Why does PyCharm start a test when I name a function test_

Why does PyCharm suddenly want to start a test?
My script is named 1_selection_sort.py, and I'm trying to call the function test_selection_sort, just running it with <current file> (added in 2022.2.2, I assume).
I'm pretty sure this worked on 24/10/2022 (version 2022.2.2, and maybe 2022.2.3), but in 2022.2.4 it's no longer working.
Could someone please tell me when and why this was changed? Or maybe I did something wrong during installation?
My file is NOT named according to this naming scheme (https://docs.pytest.org/en/7.1.x/explanation/goodpractices.html#conventions-for-python-test-discovery):
In those directories, search for test_*.py
or *_test.py files, imported by their test
package name.
"""
Write a function selection_sort that sorts a list in descending order using selection sort.
"""
def selection_sort(lijst):
    for i in range(len(lijst)):
        for j, number in enumerate(lijst):
            if number < lijst[i]:
                lijst[j] = lijst[i]
                lijst[i] = number
    return lijst

def test_selection_sort(lijst, check):
    print(lijst)
    result = selection_sort(lijst)
    print(result)
    print(check)
    assert result == check

print("Begin controle selection_sort")
test_selection_sort([1, 3, 45, 32, 65, 34], [65, 45, 34, 32, 3, 1])
test_selection_sort([1], [1])
test_selection_sort([54, 29, 12, 92, 2, 100], [100, 92, 54, 29, 12, 2])
test_selection_sort([], [])
print("Controle selection_sort succesvol")
Output:
"C:\Program Files\Anaconda3\python.exe" "C:/Users/r0944584/AppData/Local/JetBrains/PyCharm Community Edition 2022.2.4/plugins/python-ce/helpers/pycharm/_jb_pytest_runner.py" --path "C:\Users\r0944584\Downloads\skeletons(4)\skeletons\1_selection_sort.py"
Testing started at 14:13 ...
Launching pytest with arguments C:\Users\r0944584\Downloads\skeletons(4)\skeletons\1_selection_sort.py --no-header --no-summary -q in C:\Users\r0944584\Downloads\skeletons(4)\skeletons
============================= test session starts =============================
collecting ... collected 1 item
1_selection_sort.py::test_selection_sort ERROR [100%]
test setup failed
file C:\Users\r0944584\Downloads\skeletons(4)\skeletons\1_selection_sort.py, line 15
def test_selection_sort(lijst, check):
E fixture 'lijst' not found
> available fixtures: anyio_backend, anyio_backend_name, anyio_backend_options, cache, capfd, capfdbinary, caplog, capsys, capsysbinary, doctest_namespace, monkeypatch, pytestconfig, record_property, record_testsuite_property, record_xml_attribute, recwarn, tmp_path, tmp_path_factory, tmpdir, tmpdir_factory
> use 'pytest --fixtures [testpath]' for help on them.
C:\Users\r0944584\Downloads\skeletons(4)\skeletons\1_selection_sort.py:15
========================= 1 warning, 1 error in 0.01s =========================
Process finished with exit code 1
The solution I found was to disable pytest, following this answer: https://stackoverflow.com/a/59203776/13454049
Disable pytest for your project:
Open the Settings/Preferences | Tools | Python Integrated Tools settings dialog as described in Choosing Your Testing Framework.
In the Default test runner field select Unittests.
Click OK to save the settings.
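If you prefer to keep pytest as the test runner instead, one possible sketch (not part of the original answer) is to let pytest supply the arguments via parametrization, so that lijst and check are no longer looked up as fixtures:

import pytest

# Sketch: parametrize the test so pytest provides 'lijst' and 'check' itself.
@pytest.mark.parametrize("lijst, check", [
    ([1, 3, 45, 32, 65, 34], [65, 45, 34, 32, 3, 1]),
    ([1], [1]),
    ([54, 29, 12, 92, 2, 100], [100, 92, 54, 29, 12, 2]),
    ([], []),
])
def test_selection_sort(lijst, check):
    assert selection_sort(lijst) == check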

Scrapy NotSupported and TimeoutError

My goal is to find each and every link that contains daraz.com.bd/shop/.
What I have tried so far is below.
import scrapy

class LinksSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['daraz.com.bd']
    extracted_links = []
    shop_list = []

    def start_requests(self):
        start_urls = 'https://www.daraz.com.bd'
        yield scrapy.Request(url=start_urls, callback=self.extract_link)

    def extract_link(self, response):
        str_response_content_type = str(response.headers.get('content-type'))
        if str_response_content_type == "b'text/html; charset=utf-8'":
            links = response.xpath("//a/@href").extract()
            for link in links:
                link = link.lstrip("/")
                if ("https://" or "http://") not in link:
                    link = "https://" + str(link)
                split_link = link.split('.')
                if "daraz.com.bd" in link and link not in self.extracted_links:
                    self.extracted_links.append(link)
                    if len(split_link) > 1:
                        if "www" in link and "daraz" in split_link[1]:
                            yield scrapy.Request(url=link, callback=self.extract_link, dont_filter=True)
                        elif "www" not in link and "daraz" in split_link[0]:
                            yield scrapy.Request(url=link, callback=self.extract_link, dont_filter=True)
                if "daraz.com.bd/shop/" in link and link not in self.shop_list:
                    yield {
                        "links": link
                    }
Here is my settings.py file:
BOT_NAME = 'chotosite'
SPIDER_MODULES = ['chotosite.spiders']
NEWSPIDER_MODULE = 'chotosite.spiders'
ROBOTSTXT_OBEY = False
REDIRECT_ENABLED = False
DOWNLOAD_DELAY = 0.25
USER_AGENT = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'
AUTOTHROTTLE_ENABLED = True
What problem am I facing?
It stops automatically after collecting only 6-7 links that contain daraz.com.bd/shop/, with errors such as:
User timeout caused connection failure: Getting https://www.daraz.com.bd/kettles/ took longer than 180.0 seconds..
INFO: Ignoring response <301 https://www.daraz.com.bd/toner-and-mists/>: HTTP status code is not handled or not allowed
How do I solve these issues? Please help me.
If you have some other idea for reaching my goal I will be more than happy. Thank you...
Here is some console log:
2020-12-04 22:21:23 [scrapy.extensions.logstats] INFO: Crawled 891 pages (at 33 pages/min), scraped 6 items (at 0 items/min)
2020-12-04 22:22:05 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.daraz.com.bd/kettles/> (failed 1 times): User timeout caused connection failure: Getting https://www.daraz.com.bd/kettles/ took longer than 180.0 seconds..
2020-12-04 22:22:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.daraz.com.bd/kettles/> (referer: https://www.daraz.com.bd)
2020-12-04 22:22:11 [scrapy.core.engine] INFO: Closing spider (finished)
2020-12-04 22:22:11 [scrapy.extensions.feedexport] INFO: Stored csv feed (6 items) in: dara.csv
2020-12-04 22:22:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 4,
'downloader/exception_type_count/scrapy.exceptions.NotSupported': 1,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 3,
'downloader/request_bytes': 565004,
'downloader/request_count': 896,
'downloader/request_method_count/GET': 896,
'downloader/response_bytes': 39063472,
'downloader/response_count': 892,
'downloader/response_status_count/200': 838,
'downloader/response_status_count/301': 45,
'downloader/response_status_count/302': 4,
'downloader/response_status_count/404': 5,
'elapsed_time_seconds': 828.333752,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 12, 4, 16, 22, 11, 864492),
'httperror/response_ignored_count': 54,
'httperror/response_ignored_status_count/301': 45,
'httperror/response_ignored_status_count/302': 4,
'httperror/response_ignored_status_count/404': 5,
'item_scraped_count': 6,
'log_count/DEBUG': 901,
'log_count/ERROR': 1,
'log_count/INFO': 78,
'memusage/max': 112971776,
'memusage/startup': 53370880,
'request_depth_max': 5,
'response_received_count': 892,
'retry/count': 3,
'retry/reason_count/twisted.internet.error.TimeoutError': 3,
'scheduler/dequeued': 896,
'scheduler/dequeued/memory': 896,
'scheduler/enqueued': 896,
'scheduler/enqueued/memory': 896,
'start_time': datetime.datetime(2020, 12, 4, 16, 8, 23, 530740)}
2020-12-04 22:22:11 [scrapy.core.engine] INFO: Spider closed (finished)
You can use the LinkExtractor object to extract all the links, then filter for the links you want.
In your Scrapy shell:
scrapy shell https://www.daraz.com.bd

from scrapy.linkextractors import LinkExtractor
l = LinkExtractor()
links = l.extract_links(response)
for link in links:
    print(link.url)
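Building on that, a minimal spider sketch (the class name and filtering logic are illustrative and not tested against the live site) that follows extracted links and yields only the /shop/ ones could look like this:

import scrapy
from scrapy.linkextractors import LinkExtractor

class ShopLinksSpider(scrapy.Spider):
    name = 'shop_links'
    allowed_domains = ['daraz.com.bd']
    start_urls = ['https://www.daraz.com.bd']

    def parse(self, response):
        # Extract every link on the page, yield the ones containing /shop/,
        # and keep crawling the rest within the allowed domain.
        for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(response):
            if 'daraz.com.bd/shop/' in link.url:
                yield {'links': link.url}
            yield scrapy.Request(link.url, callback=self.parse)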

pocketsphinx - Not running

I installed pocketsphinx using the pip command:
pip install pocketsphinx
I referred to the link pocketsphinx installation.
Then I added a test.py with code like this:
from pocketsphinx import LiveSpeech
for phrase in LiveSpeech():
    print(phrase)
Then I ran my file using the python test.py command,
but it shows an error:
for phrase in LiveSpeech():
File "/home/pi/Sphinix/newvenv/lib/python3.7/site-packages/pocketsphinx/__init__.py", line 206, in __init__
self.ad = Ad(self.audio_device, self.sampling_rate)
File "/home/pi/Sphinix/newvenv/lib/python3.7/site-packages/sphinxbase/ad_pulse.py", line 122, in __init__
this = _ad_pulse.new_Ad(audio_device, sampling_rate)
RuntimeError: new_Ad returned -1
I tried:
import pyaudio
p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    print(p.get_device_info_by_index(i))
and I got this result:
{'index': 0, 'structVersion': 2, 'name': 'USB PnP Sound Device: Audio (hw:2,0)', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.008684807256235827, 'defaultLowOutputLatency': -1.0, 'defaultHighInputLatency': 0.034829931972789115, 'defaultHighOutputLatency': -1.0, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'dmix', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': -1.0, 'defaultLowOutputLatency': 0.021333333333333333, 'defaultHighInputLatency': -1.0, 'defaultHighOutputLatency': 0.021333333333333333, 'defaultSampleRate': 48000.0}
Then I tried:
from pocketsphinx import LiveSpeech
for phrase in LiveSpeech(audio_device=1):
    print(phrase)
and again it shows:
for phrase in LiveSpeech(audio_device=1):
File "/home/pi/Sphinix/newvenv/lib/python3.7/site-packages/pocketsphinx/__init__.py", line 206, in __init__
self.ad = Ad(self.audio_device, self.sampling_rate)
File "/home/pi/Sphinix/newvenv/lib/python3.7/site-packages/sphinxbase/ad_pulse.py", line 122, in __init__
this = _ad_pulse.new_Ad(audio_device, sampling_rate)
TypeError: in method 'new_Ad', argument 1 of type 'char const *'
How can I fix this?
Let us look at your output.
If everything worked, the output of the code above should be the list of phrases you speak, as text. Instead it throws an error, which is a clear indication that your LiveSpeech recognizer did not get to recognize anything. For this to work, LiveSpeech requires a microphone, and in your case it looks like your audio device is being used by another process. You can identify which service is using the audio device and terminate its process to release the device so that your code can use it.
You can use this command to find the process:
lsof /dev/snd/*
Once you release the sound card from the process using it, you should be able to run the code.
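As a side note on the second traceback: the TypeError about 'char const *' says that audio_device must be a string, not an integer index. A hedged sketch (the device name below is only an illustration; check your system's actual device names, e.g. with pactl list sources short):

from pocketsphinx import LiveSpeech

# audio_device must be a string device name (per the 'char const *' TypeError),
# not an int; 'plughw:2,0' is only an illustrative ALSA-style name.
for phrase in LiveSpeech(audio_device='plughw:2,0'):
    print(phrase)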

How to save scraped data in a DB?

I'm trying to save scraped data in a DB but got stuck.
First I save the scraped data in a CSV file, then use the glob library to find the newest CSV and upload the data of that CSV into the DB.
I'm not sure what I'm doing wrong here; please find the code and the error below.
I have created a table yahoo_data in the DB with the same column names as the CSV / my code output.
import scrapy
from scrapy.http import Request
import MySQLdb
import os
import csv
import glob

class YahooScrapperSpider(scrapy.Spider):
    name = 'yahoo_scrapper'
    allowed_domains = ['in.news.yahoo.com']
    start_urls = ['http://in.news.yahoo.com/']

    def parse(self, response):
        news_url = response.xpath('//*[@class="Mb(5px)"]/a/@href').extract()
        for url in news_url:
            absolute_url = response.urljoin(url)
            yield Request(absolute_url, callback=self.parse_text)

    def parse_text(self, response):
        Title = response.xpath('//meta[contains(@name,"twitter:title")]/@content').extract_first()
        # response.xpath('//*[@name="twitter:title"]/@content').extract_first() also works
        Article = response.xpath('//*[@class="canvas-atom canvas-text Mb(1.0em) Mb(0)--sm Mt(0.8em)--sm"]/text()').extract()
        yield {'Title': Title,
               'Article': Article}

    def close(self, reason):
        csv_file = max(glob.iglob('*.csv'), key=os.path.getctime)
        mydb = MySQLdb.connect(host='localhost',
                               user='root',
                               passwd='prasun',
                               db='books')
        cursor = mydb.cursor()
        csv_data = csv.reader(csv_file)
        row_count = 0
        for row in csv_data:
            if row_count != 0:
                cursor.execute('INSERT IGNORE INTO yahoo_data (Title,Article) VALUES(%s, %s)', row)
            row_count += 1
        mydb.commit()
        cursor.close()
Getting this error:
ana. It should be directed not to disrespect the Sikh community and hurt its sentiments by passing such arbitrary and uncalled for orders," said Badal.', 'The SAD president also "brought it to the notice of the Haryana chief minister that Article 25 of the constitution safeguarded the rights of all citizens to profess and practices the tenets of their faith."', '"Keeping these facts in view I request you to direct the Haryana Public Service Commission to rescind its notification and allow Sikhs as well as candidates belonging to other religions to sport symbols of their faith during all examinations," said Badal. (ANI)']}
2019-04-01 16:49:41 [scrapy.core.engine] INFO: Closing spider (finished)
2019-04-01 16:49:41 [scrapy.extensions.feedexport] INFO: Stored csv feed (25 items) in: items.csv
2019-04-01 16:49:41 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method YahooScrapperSpider.close of <YahooScrapperSpider 'yahoo_scrapper' at 0x2c60f07bac8>>
Traceback (most recent call last):
File "C:\Users\prasun.j\AppData\Local\Continuum\anaconda3\lib\site-packages\MySQLdb\cursors.py", line 201, in execute
query = query % args
TypeError: not enough arguments for format string
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\prasun.j\AppData\Local\Continuum\anaconda3\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred
result = f(*args, **kw)
File "C:\Users\prasun.j\AppData\Local\Continuum\anaconda3\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "C:\Users\prasun.j\Desktop\scrapping\scrapping\spiders\yahoo_scrapper.py", line 44, in close
cursor.execute('INSERT IGNORE INTO yahoo_data (Title,Article) VALUES(%s, %s)', row)
File "C:\Users\prasun.j\AppData\Local\Continuum\anaconda3\lib\site-packages\MySQLdb\cursors.py", line 203, in execute
raise ProgrammingError(str(m))
MySQLdb._exceptions.ProgrammingError: not enough arguments for format string
2019-04-01 16:49:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 7985,
'downloader/request_count': 27,
'downloader/request_method_count/GET': 27,
'downloader/response_bytes': 2148049,
'downloader/response_count': 27,
'downloader/response_status_count/200': 26,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 4, 1, 11, 19, 41, 350717),
'item_scraped_count': 25,
'log_count/DEBUG': 53,
'log_count/ERROR': 1,
'log_count/INFO': 8,
'request_depth_max': 1,
'response_received_count': 26,
'scheduler/dequeued': 27,
'scheduler/dequeued/memory': 27,
'scheduler/enqueued': 27,
'scheduler/enqueued/memory': 27,
'start_time': datetime.datetime(2019, 4, 1, 11, 19, 36, 743594)}
2019-04-01 16:49:41 [scrapy.core.engine] INFO: Spider closed (finished)
This error:
MySQLdb._exceptions.ProgrammingError: not enough arguments for format string
seems to be caused by the row you passed not containing enough arguments.
You can try printing the row to understand what is going wrong.
Anyway, if you want to save scraped data to a DB, I suggest writing a simple item pipeline that exports the data to the DB directly, without passing through a CSV.
For further information about item pipelines, see http://doc.scrapy.org/en/latest/topics/item-pipeline.html#topics-item-pipeline
You can find a useful example at Writing items to a MySQL database in Scrapy.
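A minimal sketch of such a pipeline (connection settings and table name are taken from the question; adjust them to your setup and register the class in ITEM_PIPELINES):

import MySQLdb

class MySQLStorePipeline:
    def open_spider(self, spider):
        # Open one connection for the whole crawl.
        self.db = MySQLdb.connect(host='localhost', user='root',
                                  passwd='prasun', db='books', charset='utf8')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.commit()
        self.db.close()

    def process_item(self, item, spider):
        # 'Article' is scraped as a list of text fragments, so join it first.
        article = ' '.join(item.get('Article') or [])
        self.cursor.execute(
            'INSERT IGNORE INTO yahoo_data (Title, Article) VALUES (%s, %s)',
            (item.get('Title'), article))
        return item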
It seems like you are passing a list where the parameters need to be passed individually, separated by commas.
Try adding an asterisk to the 'row' variable, i.e. change:
cursor.execute('INSERT IGNORE INTO yahoo_data (Title,Article) VALUES(%s, %s)', row)
to:
cursor.execute('INSERT IGNORE INTO yahoo_data (Title,Article) VALUES(%s, %s)', *row)

Scrapy's pagination error

Hi guys, I'm getting the following pagination error while trying to scrape a website:
2017-07-27 18:30:21 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.pedidosja.com.br/restaurantes/sao-paulo?a=rua%20tenente%20negr%C3%A3o%20200&cep=04530030&doorNumber=200&bairro=Itaim%20Bibi&lat=-23.585202&lng=-46.671715199999994> (referer: None)
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/root/Documents/Spiders/pedidosYa/pedidosYa/spiders/pedidosya.py", line 35, in parse
next_page_url = response.urljoin(next_page_url)
File "/usr/local/lib/python3.5/dist-packages/scrapy/http/response/text.py", line 82, in urljoin
return urljoin(get_base_url(self), url)
File "/usr/lib/python3.5/urllib/parse.py", line 416, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "/usr/lib/python3.5/urllib/parse.py", line 112, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2017-07-27 18:30:21 [scrapy.core.engine] INFO: Closing spider (finished)
2017-07-27 18:30:21 [scrapy.extensions.feedexport] INFO: Stored csv feed (13 items) in: test3.csv
2017-07-27 18:30:21 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 653,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 62571,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 7, 27, 23, 30, 21, 221038),
'item_scraped_count': 13,
'log_count/DEBUG': 16,
'log_count/ERROR': 1,
'log_count/INFO': 8,
'memusage/max': 49278976,
'memusage/startup': 49278976,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/TypeError': 1,
'start_time': datetime.datetime(2017, 7, 27, 23, 30, 17, 538310)}
2017-07-27 18:30:21 [scrapy.core.engine] INFO: Spider closed (finished)
The spider is raising a TypeError: "Cannot mix str and non-str arguments". I'm not very experienced in Python, so I would also appreciate some resources where I could learn about this type of error. Below you will find the code of the spider.
# -*- coding: utf-8 -*-
import scrapy
from pedidosYa.items import PedidosyaItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose

class PedidosyaSpider(scrapy.Spider):
    name = 'pedidosya'
    allowed_domains = ['www.pedidosya.com.br']
    start_urls = [
        'https://www.pedidosja.com.br/restaurantes/sao-paulo?a=rua%20tenente%20negr%C3%A3o%20200&cep=04530030&doorNumber=200&bairro=Itaim%20Bibi&lat=-23.585202&lng=-46.671715199999994']

    def parse(self, response):
        # need to define wrapper
        for wrapper in response.css('.restaurant-wrapper.peyaCard.show.with_tags'):
            l = ItemLoader(item=PedidosyaItem(), selector=wrapper)
            l.add_css('Name', 'a.arrivalName::text')
            l.add_css('Menu1', 'span.categories > span::text', MapCompose(str.strip))
            l.add_css('Menu2', 'span.categories > span + span::text', MapCompose(str.strip))
            l.add_css('Menu3', 'span.categories > span + span + span::text', MapCompose(str.strip))
            l.add_css('Address', 'span.address::text', MapCompose(str.strip))
            l.add_css('DeliveryTime', 'i.delTime::text', MapCompose(str.strip))
            l.add_css('CreditCard', 'ul.content_credit_cards > li > img::attr(alt)', MapCompose(str.strip))
            l.add_css('DeliveryCost', 'div.shipping > i::text', MapCompose(str.strip))
            l.add_css('Rankink', 'span.ranking i::text', MapCompose(str.strip))
            l.add_css('No', 'span.ranking a::text', MapCompose(str.strip))
            l.add_css('Sponsor', 'span.grey_small.not-logged::text', MapCompose(str.strip))
            l.add_css('DeliveryMinimun', 'div.minDelivery::text', MapCompose(str.strip))
            l.add_css('Distance', 'div.distance i::text', MapCompose(str.strip))
            yield l.load_item()

        next_page_url = response.css('li.arrow.next > a ::attr(href)').extract()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Thank you in advance and have a wonderful day!!
next_page_url = response.css('li.arrow.next > a ::attr(href)').extract()
                                                              ^^^^^^^^^^
if next_page_url:
    next_page_url = response.urljoin(next_page_url)
                                     ^^^^^^^^^^^^^
Here you are calling urljoin on a list, since the extract() method used when creating next_page_url returns a list of all matched values, even if there is only one member.
To remedy this, use extract_first() instead:
next_page_url = response.css('li.arrow.next > a ::attr(href)').extract_first()
                                                               ^^^^^^^^^^^^^^^
The problem is in this line:
next_page_url = response.css('li.arrow.next > a::attr(href)').extract()
because the extract() method always returns a list of results, even if it finds just one. Either use the extract_first() method, which will give you just the first result:
next_page_url = response.css('li.arrow.next > a::attr(href)').extract_first()
or take the first element of the results list yourself:
next_page_url = response.css('li.arrow.next > a::attr(href)').extract()[0]
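Putting it together, the end of parse() would then look roughly like this (a sketch combining the two suggestions above):

next_page_url = response.css('li.arrow.next > a::attr(href)').extract_first()
if next_page_url:
    # urljoin now receives a single string, so the TypeError goes away.
    yield scrapy.Request(url=response.urljoin(next_page_url), callback=self.parse)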
