Scrapy Crawler gets terminated at random pages - python-3.x

I'm new to Scrapy. I'm crawling the r/india subreddit using a recursive parser to store the title, upvotes and URL of each thread. It mostly works, but the spider ends unexpectedly with this error:
2018-04-29 00:01:12 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.reddit.com/r/india/?count=50&after=t3_8fh5nv> (referer: https://www.reddit.com/r/india/?count=25&after=t3_8fiqd5)
Traceback (most recent call last):
File "Z:\Anaconda\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "Z:\Anaconda\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output
for x in result:
File "Z:\Anaconda\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "Z:\Anaconda\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "Z:\Anaconda\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\jayes\myredditscraper\myredditscraper\spiders\scrapereddit.py", line 28, in parse
yield Request(url=(next_page),callback=self.parse)
File "Z:\Anaconda\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "Z:\Anaconda\lib\site-packages\scrapy\http\request\__init__.py", line 62, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
2018-04-29 00:01:12 [scrapy.core.engine] INFO: Closing spider (finished)
The error comes at a random page each time the spider is run, which makes it hard for me to pin down what's causing the problem. Here is my redditscraper.py file with the spider code (I also use a pipeline and an items.py, but I don't think the problem is there):
import scrapy
import time
from scrapy.http.request import Request
from myredditscraper.items import MyredditscraperItem

class ScraperedditSpider(scrapy.Spider):
    name = 'scrapereddit'
    allowed_domains = ['www.reddit.com']
    start_urls = ['http://www.reddit.com/r/india/']

    def parse(self, response):
        next_page = ''
        titles = response.css("a.title::text").extract()
        links = response.css("a.title::attr(href)").extract()
        votes = response.css("div.score.unvoted::attr(title)").extract()
        for item in zip(titles, links, votes):
            new_item = MyredditscraperItem()
            new_item['title'] = item[0]
            new_item['link'] = item[1]
            new_item['vote'] = item[2]
            yield new_item

        next_page = response.css("span.next-button").css('a::attr(href)').extract()[0]
        if next_page is not None:
            yield Request(url=(next_page), callback=self.parse)

As your exception says:
ValueError: Missing scheme in request url:
that means you are trying to scrape an invalid URL - one that is missing http:// or https://.
I guess the problem is not in start_urls, because otherwise the parse function would never be called. The problem is inside parse.
When yield Request is called you need to check whether next_page contains a scheme. It seems the URLs you extract are relative links, so you have two options to keep crawling those links without hitting this exception (see the sketch below):
Pass an absolute URL.
Skip relative URLs.
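For example, a minimal sketch of both options inside your parse method (reusing the selector from your question; I'm assuming the old-Reddit next-button markup is present):
next_page = response.css("span.next-button a::attr(href)").extract_first()

# Option 1: pass an absolute URL by joining the (possibly relative) href with the current page URL
if next_page:
    yield Request(url=response.urljoin(next_page), callback=self.parse)

# Option 2: skip anything that is not already an absolute http(s) URL
if next_page and next_page.startswith(("http://", "https://")):
    yield Request(url=next_page, callback=self.parse)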

Related

TypeError: <class 'str'> is not callable-Scrapy Framework

I am trying to run some code from the 'Learning Scrapy' book and ran into an error. The code that I ran:
`
import scrapy
from ..items import PropertiesItem
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, Join
from urllib.parse import urlparse

class BasicSpider(scrapy.Spider):
    name = "basic"
    allowed_domains = ["web"]
    start_urls = (
        'http://localhost:9312/properties/property_000000.html',
    )

    def parse(self, response):
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(str.strip, str.title))
        l.add_xpath('price', './/*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()', MapCompose(str.strip), Join())
        l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(str.strip))
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i)))
        return l.load_item()
`
And the error I got:
`
Traceback (most recent call last):
File "c:\users\sadat\appdata\local\programs\python\python39\lib\site-packages\twisted\internet\defer.py", line 857, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "c:\users\sadat\appdata\local\programs\python\python39\lib\site-packages\scrapy\spiders\__init__.py", line 67, in _parse
return self.parse(response, **kwargs)
File "C:\Users\Sadat\Desktop\scrapybook\properties\properties\spiders\basic.py", line 23, in parse
l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 'image_urls', '//*[@itemprop="image"][1]/@src',MapCompose(lambda i: urlparse.urljoin(response.url, i)))
File "c:\users\sadat\appdata\local\programs\python\python39\lib\site-packages\itemloaders\__init__.py", line 350, in add_xpath
self.add_value(field_name, values, *processors, **kw)
File "c:\users\sadat\appdata\local\programs\python\python39\lib\site-packages\itemloaders\__init__.py", line 183, in add_value
value = self.get_value(value, *processors, **kw)
File "c:\users\sadat\appdata\local\programs\python\python39\lib\site-packages\itemloaders\__init__.py", line 246, in get_value
proc = wrap_loader_context(proc, self.context)
File "c:\users\sadat\appdata\local\programs\python\python39\lib\site-packages\itemloaders\common.py", line 11, in wrap_loader_context
if 'loader_context' in get_func_args(function):
File "c:\users\sadat\appdata\local\programs\python\python39\lib\site-packages\itemloaders\utils.py", line 53, in get_func_args
raise TypeError('%s is not callable' % type(func))
TypeError: <class 'str'> is not callable
2022-11-02 16:04:47 [scrapy.core.engine] INFO: Closing spider (finished)
`
Specifically, the code that was giving an error initially was this snippet: MapCompose(unicode.strip, unicode.title); there are several of them. After some digging, I found out that in Python 3, str is used instead of unicode. But even after switching to str I am getting this error. I need help solving it. Thanks.
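For context, here is a minimal standalone sketch of my understanding of how MapCompose chains callables (this snippet is mine, not from the book): every argument must be a callable, which is why passing a plain string as a processor raises this TypeError.
from itemloaders.processors import MapCompose

# each argument to MapCompose must be a callable; they are applied in order to every value
clean = MapCompose(str.strip, str.title)
print(clean(['  hello world  ']))  # ['Hello World']

# passing a string instead of a callable reproduces the error:
# MapCompose('str.strip')(['  hello world  '])  -> TypeError: <class 'str'> is not callable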
Please note that I am using:
Python 3.9.4
Scrapy 2.6.1
VS Code 1.72
I was expecting Scrapy to provide clean scraped data via the items, not this error.

Django "Internal Server Error" and "MultiValueDictKeyError" when calling external function

I am running a django project on ubuntu that is supposed to build a website where I can
upload an image
run an external script to modify the image on click when uploaded
The uploading process works fine, but when I try to run the external script I get the internal server error shown below.
Is this because of the added b' and \n in the path? If so, how can I solve that, please?
Full Code can be found here https://github.com/hackanons/button-python-click/tree/master/Image%20Edit%20Html%20Button%20Run%20Python%20Script/buttonpython
Thanks a lot for help
image is birdie1.png
file raw url birdie1.png
file full url /home/felix/ucmr/button-python-click/Image_Edit_Html_Button_Run_Python_Script/buttonpython/media/birdie1.png
template url /media/birdie1.png
CompletedProcess(args=['/home/felix/anaconda3/envs/ucmr/bin/python', '//home//felix//ucmr//button-python-click//Image_Edit_Html_Button_Run_Python_Script//test.py', 'upload'], returncode=0, stdout=b'Hi upload welcome to Hackanons & time is 2021-06-01 20:35:53.229957\n')
b'/media/temp.png\n'
[01/Jun/2021 20:35:53] "POST /external/ HTTP/1.1" 200 1074
[01/Jun/2021 20:35:53] "GET /media/birdie1.png HTTP/1.1" 200 103100
Internal Server Error: /external/b'/media/temp.png/n'
Traceback (most recent call last):
File "/home/felix/anaconda3/envs/ucmr/lib/python3.7/site-packages/django/utils/datastructures.py", line 78, in __getitem__
list_ = super().__getitem__(key)
KeyError: 'image'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/felix/anaconda3/envs/ucmr/lib/python3.7/site-packages/django/core/handlers/exception.py", line 34, in inner
response = get_response(request)
File "/home/felix/anaconda3/envs/ucmr/lib/python3.7/site-packages/django/core/handlers/base.py", line 115, in _get_response
response = self.process_exception_by_middleware(e, request)
File "/home/felix/anaconda3/envs/ucmr/lib/python3.7/site-packages/django/core/handlers/base.py", line 113, in _get_response
response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "/home/felix/ucmr/button-python-click/Image_Edit_Html_Button_Run_Python_Script/buttonpython/buttonpython/views.py", line 19, in external
image=request.FILES['image']
File "/home/felix/anaconda3/envs/ucmr/lib/python3.7/site-packages/django/utils/datastructures.py", line 80, in __getitem__
raise MultiValueDictKeyError(key)
django.utils.datastructures.MultiValueDictKeyError: 'image'
[01/Jun/2021 20:35:53] "GET /external/b'/media/temp.png/n' HTTP/1.1" 500 80417
views.py
from django.shortcuts import render
import requests
import sys
from subprocess import run, PIPE
from django.core.files.storage import FileSystemStorage

def button(request):
    return render(request, 'home.html')

def output(request):
    data = requests.get("https://www.google.com/")
    print(data.text)
    data = data.text
    return render(request, 'home.html', {'data': data})

def external(request):
    inp = request.POST.get('param', False)
    image = request.FILES['image']
    print("image is ", image)
    fs = FileSystemStorage()
    filename = fs.save(image.name, image)
    fileurl = fs.open(filename)
    templateurl = fs.url(filename)
    print("file raw url", filename)
    print("file full url", fileurl)
    print("template url", templateurl)
    out = run([sys.executable, '//home//felix//ucmr//button-python-click//Image_Edit_Html_Button_Run_Python_Script//test.py', inp], shell=False, stdout=PIPE)
    image = run([sys.executable, '//home//felix//ucmr//button-python-click//Image_Edit_Html_Button_Run_Python_Script//image.py', str(fileurl), str(filename)], shell=False, stdout=PIPE)
    print(out)
    print(image.stdout)
    return render(request, 'home.html', {'data': out.stdout, 'raw_url': templateurl, 'edit_url': image.stdout})
The problem is that there is no image file in your request. You need to fix that first; make sure that you are getting what you expect.
You should probably start using forms and calling is_valid() to do the check for you. If your request does not have the required fields, it will return an error and a validation message, as in the sketch after this paragraph.
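As an illustration only (the form and field names here are assumptions, not taken from your project), a minimal form-based check could look like this:
# forms.py - hypothetical form for the upload view
from django import forms

class UploadForm(forms.Form):
    param = forms.CharField(required=False)
    image = forms.ImageField()  # needs Pillow; use forms.FileField() if you don't want image validation

# views.py - hypothetical usage inside the view
from django.http import HttpResponseBadRequest

def external(request):
    form = UploadForm(request.POST, request.FILES)
    if not form.is_valid():
        # form.errors holds a validation message per missing/invalid field
        return HttpResponseBadRequest(form.errors.as_json())
    image = form.cleaned_data['image']
    # ... continue with FileSystemStorage and the subprocess calls ...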
Your first error happens here:
image=request.FILES['image'] # the 'image' key does not exist so you get KeyError
You can fix this using the .get notation to have a default if it doesn't exist.
image=request.FILES.get('image') # this will not throw an error but have `None` as default
After that change, you have the following:
filename=fs.save(image.name, image) # this throws an error if image is None because you can't do None.name
So now you should check that image is not None, and then continue with your logic, or return a BadRequest error response.
if image is None:
    return HttpResponseBadRequest()
# <rest of your code>
The error
Internal Server Error: /external/b'/media/temp.png/n'
was caused by a corrupted path coming from the process stdout. I added universal_newlines=True to the subprocess.run calls and everything works fine.
out= run([sys.executable,'//home//felix//ucmr//button-python-click//Image_Edit_Html_Button_Run_Python_Script//test.py',inp],shell=False,stdout=PIPE,universal_newlines=True)
image= run([sys.executable,'//home//felix//ucmr//button-python-click//Image_Edit_Html_Button_Run_Python_Script//image.py',str(fileurl),str(filename)],shell=False,stdout=PIPE,universal_newlines=True)
See also Why does Popen.communicate() return b'hi\n' instead of 'hi'?
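For what it's worth, a minimal standalone sketch of the difference (using python -c as a stand-in for the project's test.py script):
import sys
from subprocess import run, PIPE

# without universal_newlines, stdout is bytes, e.g. b'/media/temp.png\n'
raw = run([sys.executable, '-c', 'print("/media/temp.png")'], stdout=PIPE)
print(raw.stdout)            # b'/media/temp.png\n'

# with universal_newlines=True (or text=True on Python 3.7+), stdout is already str
txt = run([sys.executable, '-c', 'print("/media/temp.png")'], stdout=PIPE, universal_newlines=True)
print(txt.stdout.strip())    # /media/temp.png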

How to get url instead of base64 value from img tags

I'm quite new to the web realm, and in an attempt to make my way into it I started using the Beautiful Soup and requests modules today. The program was going well until execution reached line 78, where an error is raised because the src value of the img tags is retrieved in base64 format. I'm aware that this could be overcome by encoding it to ASCII and then running it through a base64 decoder, but the thing is that I want it as a URL value. How should I go about it?
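To show what I mean by decoding (a minimal sketch of the workaround I'd rather avoid, not part of my actual script):
import base64

src = 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEHAAEALAAAAAABAAEAAAICTAEAOw=='
if src.startswith('data:'):
    # the payload after the comma is the base64-encoded image itself, not a fetchable URL
    header, payload = src.split(',', 1)
    image_bytes = base64.b64decode(payload)
    print(header, len(image_bytes), 'bytes')
else:
    # only plain http(s) URLs can be passed to requests.get
    pass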
NOTES:
(Just in case, one never knows)
python version: 3.8.5
lxml version 4.6.2
beautifulsoup4 4.9.3
ERROR
Traceback (most recent call last):
File "/home/l0new0lf/Documents/Projects/Catalog Scraping/scrape.py", line 72, in <module>
webdata['images'].append(requests.get(game_tables[j].select_one(IMG_TAG_SELECTOR)['src']).content)
File "/usr/lib/python3/dist-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 640, in send
adapter = self.get_adapter(url=request.url)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 731, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEHAAEALAAAAAABAAEAAAICTAEAOw=='
Any help is welcome, many thanks in advance!!!
CODE
import requests
from bs4 import BeautifulSoup

# CSS SELECTORS FOR LOOKUPS
TITLE_TAG_SELECTOR = 'tr:first-child td.ta14b.t11 div a strong'
IMG_TAG_SELECTOR = 'tr:last-child td:first-child a img'
DESCRIPTION_TAG_SELECTOR = 'tr:last-child td:last-child p'
GENRES_TAG_SELECTOR = 'tr:last-child td:last-child div.mt05 p :last-child'
GAME_SEARCH_RESULTS_TABLE_SELECTOR = 'table.mt1.tablestriped4.froboto_real.blanca'

# CSS CLASS ATTRIBUTE
GAME_TABLE_CLASS = 'table transparente tablasinbordes'

rq = requests.get(
    f'https://vandal.elespanol.com/juegos/13/pc/letra/a/inicio/1')
soup = BeautifulSoup(rq.content, 'lxml')

main_table = soup.select_one(GAME_SEARCH_RESULTS_TABLE_SELECTOR)
print('main_table:', main_table)
game_tables = main_table.find_all('table', {'class': GAME_TABLE_CLASS})
print('game_tables', game_tables)
# help(game_tables)

webdata = {
    'titles': [],
    'images': [],
    'descriptions': [],
    'genres': [],
}
for j in range(len(game_tables)):
    webdata['titles'].append(game_tables[j].select_one(TITLE_TAG_SELECTOR).text)
    print(game_tables[j].select_one(IMG_TAG_SELECTOR)['src'])
    webdata['images'].append(requests.get(game_tables[j].select_one(IMG_TAG_SELECTOR)['src']).content)
    webdata['descriptions'].append(game_tables[j].select_one(DESCRIPTION_TAG_SELECTOR))
    webdata['genres'].append(game_tables[j].select_one(GENRES_TAG_SELECTOR))

print(webdata['titles'], webdata['images'], webdata['descriptions'], webdata['genres'], sep='\n')

ValueError: Missing scheme in request url Scrapy

I am trying to scrape https://www.skynewsarabia.com/ using Scrapy and I am getting this error: ValueError: Missing scheme in request url:
I have tried every single solution I found on Stack Overflow and none worked for me.
here is my spider:
name = 'skynews'
allowed_domains = ['www.skynewsarabia.com']
start_urls = ['https://www.skynewsarabia.com/sport/latest-news-%D8%A2%D8%AE%D8%B1-%D8%A7%D9%84%D8%A3%D8%AE%D8%A8%D8%A7%D8%B1']

def parse(self, response):
    link = "https://www.skynewsarabia.com"
    # get the urls of each article
    urls = response.css("a.item-wrapper::attr(href)").extract()
    # for each article make a request to get the text of that article
    for url in urls:
        # get the info of that article using the parse_details function
        yield scrapy.Request(url=link + url, callback=self.parse_details)
    # go and get the link for the next article
    next_article = response.css("a.item-wrapper::attr(href)").extract_first()
    if next_article:
        # keep repeating the process until the bot visits all the links in the website!
        yield scrapy.Request(url=next_article, callback=self.parse)  # keep calling yourself!
here is the whole error:
2019-01-30 11:49:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-01-30 11:49:34 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2019-01-30 11:49:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.skynewsarabia.com/robots.txt> (referer: None)
2019-01-30 11:49:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.skynewsarabia.com/sport/latest-news-%D8%A2%D8%AE%D8%B1-%D8%A7%D9%84%D8%A3%D8%AE%D8%A8%D8%A7%D8%B1> (referer: None)
2019-01-30 11:49:35 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.skynewsarabia.com/sport/latest-news-%D8%A2%D8%AE%D8%B1-%D8%A7%D9%84%D8%A3%D8%AE%D8%A8%D8%A7%D8%B1> (referer: None)
Traceback (most recent call last):
File "c:\users\hozrifai\desktop\scraping\venv\lib\site-
packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "c:\users\hozrifai\desktop\scraping\venv\lib\site-
packages\scrapy\spidermiddlewares\offsite.py", line 30, in
process_spider_output
for x in result:
File "c:\users\hozrifai\desktop\scraping\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\hozrifai\desktop\scraping\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\hozrifai\desktop\scraping\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\HozRifai\Desktop\scraping\articles\articles\spiders\skynews.py", line 28, in parse
yield scrapy.Request(url=next_article, callback=self.parse) # keep calling yourself!
File "c:\users\hozrifai\desktop\scraping\venv\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "c:\users\hozrifai\desktop\scraping\venv\lib\site-packages\scrapy\http\request\__init__.py", line 62, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: /sport/1222754-%D8%A8%D9%8A%D8%B1%D9%86%D9%84%D9%8A-%D9%8A%D8%B6%D8%B9-%D8%AD%D8%AF%D8%A7-%D9%84%D8%B3%D9%84%D8%B3%D9%84%D8%A9-%D8%A7%D9%86%D8%AA%D8%B5%D8%A7%D8%B1%D8%A7%D8%AA-%D8%B3%D9%88%D9%84%D8%B4%D8%A7%D8%B1
2019-01-30 11:49:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.skynewsarabia.com/sport/1222754-%D8%A8%D9%8A%D8%B1%D9%86%D9%84%D9%8A-%D9%8A%D8%B6%D8%B9-%D8%AD%D8%AF%D8%A7-%D9%84%D8%B3%D9%84%D8%B3%D9%84%D8%A9-%D8%A7%D9%86%D8%AA%D8%B5%D8%A7%D8%B1%D8%A7%D8%AA-%D8%B3%D9%88%D9%84%D8%B4%D8%A7%D8%B1> (referer: https://www.skynewsarabia.com/sport/latest-news-%D8%A2%D8%AE%D8%B1-%D8%A7%D9%84%D8%A3%D8%AE%D8%A8%D8%A7%D8%B1)
Thanks in advance.
Your next_article URL has no scheme. Try:
next_article = response.css("a.item-wrapper::attr(href)").get()
if next_article:
    yield scrapy.Request(response.urljoin(next_article))
In your next-article retrieval:
next_article = response.css("a.item-wrapper::attr(href)").extract_first()
are you sure you are getting a full link starting with http/https?
As a better approach, if you are not sure about the URL you are receiving, always use urljoin:
url = response.urljoin(next_article)  # you can also use this in your logic above
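As a rough sketch of why urljoin is safe in both cases (the article path below is a shortened stand-in for the href from the log above; response.urljoin simply calls urllib.parse.urljoin with response.url as the base):
from urllib.parse import urljoin

base = "https://www.skynewsarabia.com/sport/latest-news"

# relative href: joined against the current page's URL
print(urljoin(base, "/sport/1222754-some-article"))
# -> https://www.skynewsarabia.com/sport/1222754-some-article

# already-absolute href: returned unchanged
print(urljoin(base, "https://www.skynewsarabia.com/sport/other-article"))
# -> https://www.skynewsarabia.com/sport/other-article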

Scrapy not calling the assigned pipeline when run from a script

I have a piece of code to test Scrapy. My goal is to use Scrapy without having to call the scrapy command from the terminal, so I can embed this code somewhere else.
The code is the following:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
from scrapy.crawler import CrawlerProcess
import json

class JsonWriterPipeline(object):
    file = None

    def open_spider(self, spider):
        self.file = open('items.json', 'wb')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

class StackItem(Item):
    title = Field()
    url = Field()

class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ["http://stackoverflow.com/questions?pagesize=50&sort=newest"]

    def parse(self, response):
        questions = Selector(response).xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['title'] = question.xpath('a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath('a[@class="question-hyperlink"]/@href').extract()[0]
            yield item

if __name__ == '__main__':
    settings = dict()
    settings['USER_AGENT'] = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    settings['ITEM_PIPELINES'] = {'JsonWriterPipeline': 1}

    process = CrawlerProcess(settings=settings)
    spider = StackSpider()
    process.crawl(spider)
    process.start()
As you can see, the code is self-contained and I override two settings: USER_AGENT and ITEM_PIPELINES. However, when I set breakpoints in the JsonWriterPipeline class and run the code, they are never reached, so the custom pipeline is not being used.
How can this be fixed?
I get 2 errors when running your script with scrapy 1.3.2 and Python 3.5.
First:
Unhandled error in Deferred:
2017-02-21 13:47:23 [twisted] CRITICAL: Unhandled error in Deferred:
2017-02-21 13:47:23 [twisted] CRITICAL:
Traceback (most recent call last):
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/utils/misc.py", line 39, in load_object
dot = path.rindex('.')
ValueError: substring not found
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/twisted/internet/defer.py", line 1301, in _inlineCallbacks
result = g.send(result)
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/crawler.py", line 72, in crawl
self.engine = self._create_engine()
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/crawler.py", line 97, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/core/engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/core/scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/middleware.py", line 34, in from_settings
mwcls = load_object(clspath)
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/scrapy/utils/misc.py", line 41, in load_object
raise ValueError("Error loading object '%s': not a full path" % path)
ValueError: Error loading object 'JsonWriterPipeline': not a full path
You need to give a complete import path for the pipeline class. For example here, the __main__ namespace works:
settings['ITEM_PIPELINES'] = {'__main__.JsonWriterPipeline': 1}
Second (with the pipeline path fix above), you get loads of:
2017-02-21 13:47:52 [scrapy.core.scraper] ERROR: Error processing {'title': 'Apply Remote Commits to a Local Pull Request',
'url': '/questions/42367647/apply-remote-commits-to-a-local-pull-request'}
Traceback (most recent call last):
File "/home/paul/.virtualenvs/scrapy13.py3/lib/python3.5/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "test.py", line 20, in process_item
self.file.write(line)
TypeError: a bytes-like object is required, not 'str'
which you can fix by writing the item JSON as bytes:
def process_item(self, item, spider):
    line = json.dumps(dict(item)) + "\n"
    self.file.write(line.encode('ascii'))
    return item
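Alternatively (my preference, not something the original script did), you can open the file in text mode with an explicit encoding, so process_item can keep writing plain strings:
def open_spider(self, spider):
    # text mode: json.dumps output can be written directly, no manual encoding needed
    self.file = open('items.json', 'w', encoding='utf-8')

def process_item(self, item, spider):
    self.file.write(json.dumps(dict(item)) + "\n")
    return item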
