I am trying to run a Scrapy spider through a proxy and am getting errors whenever I run the code.
This is on macOS, Python 3.7, Scrapy 1.5.1.
I have tried playing around with the settings and middlewares, but to no avail.
class superSpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        print('request')
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print('parse')
The errors I get are:
2019-02-15 08:32:27 [scrapy.utils.log] INFO: Scrapy 1.5.1 started
(bot: superScraper)
2019-02-15 08:32:27 [scrapy.utils.log] INFO: Versions: lxml
4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0,
Twisted 18.9.0, Python 3.7.1 (v3.7.1:260ec2c36a, Oct 20 2018,
03:13:28) - [Clang 6.0 (clang-600.0.57)], pyOpenSSL 18.0.0 (OpenSSL
1.1.0j 20 Nov 2018), cryptography 2.4.2, Platform Darwin-17.7.0-
x86_64-i386-64bit
2019-02-15 08:32:27 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'superScraper', 'CONCURRENT_REQUESTS': 25,
'NEWSPIDER_MODULE': 'superScraper.spiders', 'RETRY_HTTP_CODES':
[500, 503, 504, 400, 403, 404, 408], 'RETRY_TIMES': 10,
'SPIDER_MODULES': ['superScraper.spiders'], 'USER_AGENT':
'Mozilla/5.0 (compatible; bingbot/2.0;
+http://www.bing.com/bingbot.htm)'}
2019-02-15 08:32:27 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
Unhandled error in Deferred:
2019-02-15 08:32:27 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File
"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/crawler.py", line 171, in crawl
return self._crawl(crawler, *args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/crawler.py", line 175, in _crawl
d = crawler.crawl(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/twisted/internet/defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/twisted/internet/defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/core/engine.py", line 69, in __init__
self.downloader = downloader_cls(crawler)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/core/downloader/__init__.py", line 88, in __init__
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/middleware.py", line 36, in from_settings
mw = mwcls.from_crawler(crawler)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy_proxies/randomproxy.py", line 99, in from_crawler
return cls(crawler.settings)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy_proxies/randomproxy.py", line 74, in __init__
raise KeyError('PROXIES is empty')
builtins.KeyError: 'PROXIES is empty'
These URLs are from the Scrapy documentation, and the spider works when I don't use a proxy.
For anyone else having a similar problem, this turned out to be an issue with my installed scrapy_proxies RandomProxy code.
Using the code from the project's repository made it work:
https://github.com/aivarsk/scrapy-proxies
Go into the scrapy_proxies folder and replace the randomproxy.py code with the version found on GitHub.
Mine was located here:
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy_proxies/randomproxy.py
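For anyone hitting the same KeyError, the scrapy_proxies middleware also needs a non-empty proxy list configured in the project settings. A minimal settings.py sketch along the lines of the scrapy-proxies README (the proxy list path below is a placeholder):

# settings.py (sketch based on the scrapy-proxies README)
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

# Path to a non-empty list of proxies, one per line (placeholder path)
PROXY_LIST = '/path/to/proxy/list.txt'

# 0 = use a different random proxy for every request
PROXY_MODE = 0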
I am running the following simple Python 3 script on Windows 11:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
options = webdriver.ChromeOptions()
options.binary_location = 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
service = Service(executable_path='C:\\ProgramFiles\\Google\\Chrome\\Application\\chromedriver.exe')
webC = webdriver.Chrome(options=options, service=service)
I am getting the following error:
Traceback (most recent call last):
File "E:/NNs/CrawlerStarter/selenium_test.py", line 8, in <module>
webC = webdriver.Chrome(options=options, service=service)
File "D:\Anaconda\envs\crawler\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 69, in __init__
super().__init__(DesiredCapabilities.CHROME['browserName'], "goog",
File "D:\Anaconda\envs\crawler\lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 92, in __init__
super().__init__(
File "D:\Anaconda\envs\crawler\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 276, in __init__
self.start_session(capabilities, browser_profile)
File "D:\Anaconda\envs\crawler\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 369, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "D:\Anaconda\envs\crawler\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 434, in execute
self.error_handler.check_response(response)
File "D:\Anaconda\envs\crawler\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 207, in check_response
raise exception_class(value)
selenium.common.exceptions.WebDriverException: Message:
Process finished with exit code 1
There is no error message to tell what's going wrong, and I have tried related solutions that I could find on Google, such as setting the Path in the system environment variables, but the same exception still occurs. I also had the same problem when I switched to Firefox.
Here are my versions:
python: 3.8.13
selenium: 4.4.0
google chrome: 104.0.5112.81
chromedriver: 104.0.5112.79
Thank you for answering my question!
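One thing worth checking: the executable_path in the snippet is 'C:\\ProgramFiles\\...' without the space in 'Program Files', and chromedriver.exe is normally downloaded separately rather than living inside Chrome's own folder. A small diagnostic sketch, assuming the driver sits at a hypothetical C:\WebDriver\chromedriver.exe (adjust both paths to your machine):

import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

chrome_path = 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
driver_path = 'C:\\WebDriver\\chromedriver.exe'  # hypothetical location, adjust

# Fail early with a readable message instead of an empty WebDriverException
for p in (chrome_path, driver_path):
    if not os.path.exists(p):
        raise FileNotFoundError('not found: ' + p)

options = webdriver.ChromeOptions()
options.binary_location = chrome_path
webC = webdriver.Chrome(options=options, service=Service(executable_path=driver_path))
webC.quit()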
I am trying to integrate Selenium with Scrapy to render JavaScript from a website. I have put the Selenium automation code in a constructor; it performs a button click, and then the parse function scrapes the data from the page. But the following errors appear in the terminal window.
Code:
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which

class test_2(scrapy.Spider):
    name = 'test_2'
    #allowed_domains=[]
    start_urls = [
        'https://www.jackjones.in/st-search?q=shoes'
    ]

    def _init_(self):
        print("test-1")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome("C:/chromedriver")
        driver.set_window(1920, 1080)
        driver.get("https://www.jackjones.in/st-search?q=shoes")
        tab = driver.find_elements_by_class_name("st-single-product")
        tab[4].click()
        self.html = driver.page_source
        print("test-2")
        driver.close()

    def parse(self, response):
        print("test-3")
        resp = Selector(text=self.html)
        yield {
            'title': resp.xpath("//h1/text()").get()
        }
It appears that the interpreter does not execute the init function before going to the parse function, because neither of its print statements appears in the output, while the print statement in the parse function does.
How to fix this?
Output:
PS C:\Users\Vasu\summer\scrapy_selenium> scrapy crawl test_2
2022-07-01 13:18:30 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapy_selenium)
2022-07-01 13:18:30 [scrapy.utils.log] INFO: Versions: lxml 4.9.0.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0,
w3lib 1.22.0, Twisted 22.4.0, Python 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL
22.0.0 (OpenSSL 1.1.1p 21 Jun 2022), cryptography 37.0.1, Platform Windows-10-10.0.19044-SP0
2022-07-01 13:18:30 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'scrapy_selenium',
'NEWSPIDER_MODULE': 'scrapy_selenium.spiders',
'SPIDER_MODULES': ['scrapy_selenium.spiders']}
2022-07-01 13:18:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-07-01 13:18:30 [scrapy.extensions.telnet] INFO: Telnet Password: 168b57499cd07735
2022-07-01 13:18:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2022-07-01 13:18:31 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-07-01 13:18:31 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-07-01 13:18:31 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2022-07-01 13:18:31 [scrapy.core.engine] INFO: Spider opened
2022-07-01 13:18:31 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-07-01 13:18:31 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-07-01 13:18:31 [filelock] DEBUG: Attempting to acquire lock 1385261511056 on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:31 [filelock] DEBUG: Lock 1385261511056 acquired on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:32 [filelock] DEBUG: Attempting to release lock 1385261511056 on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:32 [filelock] DEBUG: Lock 1385261511056 released on C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-07-01 13:18:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.jackjones.in/st-search?q=shoes> (referer: None)
test-3
2022-07-01 13:18:32 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.jackjones.in/st-search?q=shoes> (referer: None)
Traceback (most recent call last):
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\utils\defer.py", line 132, in iter_errback
yield next(it)
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
return next(self.data)
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
return next(self.data)
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\Vasu\anaconda3\envs\sca_sel\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "C:\Users\Vasu\summer\scrapy_selenium\scrapy_selenium\spiders\test_2.py", line 31, in parse
resp=Selector(text=self.html)
AttributeError: 'test_2' object has no attribute 'html'
2022-07-01 13:18:32 [scrapy.core.engine] INFO: Closing spider (finished)
2022-07-01 13:18:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 237,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 20430,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.613799,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 7, 1, 7, 48, 32, 202155),
'httpcompression/response_bytes': 87151,
'httpcompression/response_count': 1,
'log_count/DEBUG': 6,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/AttributeError': 1,
'start_time': datetime.datetime(2022, 7, 1, 7, 48, 31, 588356)}
2022-07-01 13:18:32 [scrapy.core.engine] INFO: Spider closed (finished)
It's __init__, not _init_ (note the double underscores).
Secondly, there is no h1 on the page. Try this instead:
yield {
    'title': resp.xpath("//title/text()").get()
}
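Putting the two fixes together, a sketch of the corrected spider. It keeps the question's chromedriver path and the Selenium 3 style find_elements_by_class_name call; treat the other tweaks (passing the Options object to the driver, using set_window_size instead of the non-existent set_window) as assumptions about the intended behaviour:

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class test_2(scrapy.Spider):
    name = 'test_2'
    start_urls = ['https://www.jackjones.in/st-search?q=shoes']

    def __init__(self, *args, **kwargs):  # double underscores
        super().__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # actually pass the options so headless mode is applied
        driver = webdriver.Chrome("C:/chromedriver", options=chrome_options)
        driver.set_window_size(1920, 1080)
        driver.get("https://www.jackjones.in/st-search?q=shoes")
        tab = driver.find_elements_by_class_name("st-single-product")
        tab[4].click()
        self.html = driver.page_source
        driver.close()

    def parse(self, response):
        resp = Selector(text=self.html)
        yield {
            'title': resp.xpath("//title/text()").get()  # the page has no <h1>
        }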
C:\Users\pheelz\Desktop>pyinstaller -w test.py
Here is the error I got:
500 INFO: PyInstaller: 3.4
501 INFO: Python: 3.8.0
502 INFO: Platform: Windows-10-10.0.10240-SP0
512 INFO: wrote C:\Users\pheelz\Desktop\test.spec
538 INFO: UPX is not available.
568 INFO: Extending PYTHONPATH with paths
['C:\Users\pheelz\Desktop', 'C:\Users\pheelz\Desktop']
569 INFO: checking Analysis
1167 INFO: checking PYZ
1168 INFO: Building PYZ because PYZ-00.toc is non existent
1169 INFO: Building PYZ (ZlibArchive) C:\Users\pheelz\Desktop\build\test\PYZ-00.pyz
Traceback (most recent call last):
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\runpy.py", line 192, in _run_module_as_main
return _run_code(code, main_globals, None,
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Users\pheelz\AppData\Local\Programs\Python\Python38-32\Scripts\pyinstaller.exe__main__.py", line 7, in
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller__main__.py", line 111, in run
run_build(pyi_config, spec_file, **vars(args))
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller__main__.py", line 63, in run_build
PyInstaller.building.build_main.main(pyi_config, spec_file, **kwargs)
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\build_main.py", line 838, in main
build(specfile, kw.get('distpath'), kw.get('workpath'), kw.get('clean_build'))
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\build_main.py", line 784, in build
exec(text, spec_namespace)
File "", line 18, in
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\api.py", line 98, in init
self.postinit()
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\datastruct.py", line 158, in postinit
self.assemble()
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\api.py", line 128, in assemble
self.code_dict = {
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\api.py", line 129, in
key: strip_paths_in_code(code)
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\utils.py", line 654, in strip_paths_in_code
consts = tuple(
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\utils.py", line 655, in
strip_paths_in_code(const_co, new_filename)
File "c:\users\pheelz\appdata\local\programs\python\python38-32\lib\site-packages\PyInstaller\building\utils.py", line 662, in strip_paths_in_code
return code_func(co.co_argcount, co.co_kwonlyargcount, co.co_nlocals, co.co_stacksize,
TypeError: an integer is required (got type bytes)
As far as I can see, you are using PyInstaller 3.4 on Python 3.8.0.
PyInstaller 3.4 works ONLY with Python 2.7 and 3.4-3.7, as stated here.
Maybe you should downgrade your environment to Python 3.7.5.
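Alternatively, later PyInstaller releases (3.6 and newer) added Python 3.8 support, so upgrading with pip install --upgrade pyinstaller should also clear this error without downgrading Python.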
I was trying to scrape the titles from this website (https://minerals.usgs.gov/science/mineral-deposit-database/#products). I am using a crawl spider because I intend to get more information from every URL on the page later on.
But I got a TypeError: 'Rule' object is not iterable.
This is the code that I used:
import scrapy
import datetime
import socket
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from usgs.items import MineralItem
from scrapy.loader import ItemLoader

class MineralSpider(CrawlSpider):
    name = 'mineral'
    allowed_domains = ['web']
    start_urls = 'https://minerals.usgs.gov/science/mineral-deposit-database/#products'

    rules = (
        Rule(LinkExtractor(
            restrict_xpaths='//*[#id="products"][1]/p/a'),
            callback='parse')
    )

    def parse(self, response):
        it = ItemLoader(item=MineralItem(), response=response)
        it.add_xpath('name', '//*[#class="container"]/header/h1/text()')
        it.add_value('url', response.url)
        it.add_value('project', self.settings.get('BOT_NAME'))
        it.add_value('spider', self.name)
        it.add_value('server', socket.gethostname())
        it.add_value('date', datetime.datetime.now())
        return it.load_item()
LOG MESSAGE:
(base) C:\Users\User\Documents\Python WebCrawling Learing
Projects\usgs\usgs\spiders>scrapy crawl mineral
2018-11-16 17:43:03 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot:
usgs)
2018-11-16 17:43:03 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2
2.9.8, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 18.7.0, Python
3.7.0 (default, Jun 28 2018, 08:04:48) [MSC v.1912 64 bit (AMD64)],
pyOpenSSL 18.0.0 (OpenSSL 1.0.2p 14 Aug 2018), cryptography 2.3.1, Platform
Windows-10-10.0.17134-SP0
2018-11-16 17:43:03 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME':
'usgs', 'NEWSPIDER_MODULE': 'usgs.spiders', 'ROBOTSTXT_OBEY': True,
'SPIDER_MODULES': ['usgs.spiders']}
2018-11-16 17:43:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
Unhandled error in Deferred:
2018-11-16 17:43:03 [twisted] CRITICAL: Unhandled error in Deferred:
2018-11-16 17:43:03 [twisted] CRITICAL:
Traceback (most recent call last):
File "C:\Users\User\Anaconda3\lib\site-
packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "C:\Users\User\Anaconda3\lib\site-packages\scrapy\crawler.py", line
79, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "C:\Users\User\Anaconda3\lib\site-packages\scrapy\crawler.py", line
102, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "C:\Users\User\Anaconda3\lib\site-packages\scrapy\spiders\crawl.py",
line 100, in from_crawler
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
File "C:\Users\User\Anaconda3\lib\site-
packages\scrapy\spiders\__init__.py", line 51, in from_crawler
spider = cls(*args, **kwargs)
File "C:\Users\User\Anaconda3\lib\site-packages\scrapy\spiders\crawl.py",
line 40, in __init__
self._compile_rules()
File "C:\Users\User\Anaconda3\lib\site-packages\scrapy\spiders\crawl.py",
line 92, in _compile_rules
self._rules = [copy.copy(r) for r in self.rules]
TypeError: 'Rule' object is not iterable
Any ideas?
Add a comma after your Rule object so that Python treats it as a valid tuple:
rules = (
    Rule(LinkExtractor(
        restrict_xpaths='//*[#id="products"][1]/p/a'),
        callback='parse'),
)
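A quick illustration of the difference in a Python shell:

>>> type((1))   # parentheses alone only group the expression
<class 'int'>
>>> type((1,))  # the trailing comma is what makes it a tuple
<class 'tuple'>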
You may want to take a look at this answer as well: Why does adding a trailing comma after a variable name make it a tuple?
I am following this tutorial. After writing the first spider, it directs me to use the command scrapy crawl quotes, but I seem to get an error.
Here is my code:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
Here is the error that I encounter:
PS C:\Users\BB\desktop\scrapy\tutorial\spiders> scrapy crawl quotes
2018-09-12 13:55:06 [scrapy.utils.log] INFO: Scrapy 1.5.0 started
(bot: tutorial)
2018-09-12 13:55:06 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0,
libxml2 2.9.8, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted
17.5.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o 27 Mar
2018), cryptography 2.2.2, Platform Windows-10-10.0.17134-SP0
Traceback (most recent call last):
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\spiderloader.py",
line 69, in load
return self._spiders[spider_name]
KeyError: 'quotes'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\BB\Anaconda3\Scripts\scrapy-script.py", line 5, in
<module>
sys.exit(scrapy.cmdline.execute())
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\cmdline.py", line
150, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\cmdline.py", line
90, in _run_print_help
func(*a, **kw)
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\cmdline.py", line
157, in _run_command
cmd.run(args, opts)
File
"C:\Users\BB\Anaconda3\lib\site-packages\scrapy\commands\crawl.py",
line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\crawler.py", line
170, in crawl
crawler = self.create_crawler(crawler_or_spidercls)
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\crawler.py", line
198, in create_crawler
return self._create_crawler(crawler_or_spidercls)
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\crawler.py", line
202, in _create_crawler
spidercls = self.spider_loader.load(spidercls)
File "C:\Users\BB\Anaconda3\lib\site-packages\scrapy\spiderloader.py",
line 71, in load
raise KeyError("Spider not found: {}".format(spider_name))
KeyError: 'Spider not found: quotes'
OK, I had created a folder called spiders, but the tutorial had already created one for me, containing the __pycache__ and __init__.py files that were required for the command 'scrapy crawl quotes' to work. In short, I was running it from the wrong folder.
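For anyone else hitting this, the tutorial's project layout looks roughly like this; the spider file belongs in the inner spiders package that the tutorial generated (the one containing __init__.py), and scrapy crawl quotes is normally run from the project root, where scrapy.cfg lives:

tutorial/
    scrapy.cfg             # run "scrapy crawl quotes" from this directory
    tutorial/
        __init__.py
        items.py
        settings.py
        spiders/
            __init__.py
            quotes_spider.py   # the spider code goes here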