POST request using scrapy.FormRequest - python-3.x

I need to get the data from the v2?count=3 request on the page https://support.hpe.com/hpesc/public/km/Security-Bulletin-Library#sort=relevancy&layout=table&numberOfResults=25&f:#kmdocsecuritybulletin=[4000003]&f:#kmdoclanguagecode=[cv1871440,cv1871463]&hpe=1
The data I need is shown in the image.
class HPUXSpider(_BaseSpider):
    name = 'hp_ux_spider'

    def start_requests(self):
        return [scrapy.FormRequest(
            url='https://platform.cloud.coveo.com/rest/search/v2?count=3',
            method='POST',
            formdata={
'actionsHistory': r'[{"name":"Query","time":"\"2020-07-13T12:49:51.480Z\""},{"name":"Query","time":"\"2020-07-13T10:44:35.303Z\""},{"name":"Query","time":"\"2020-07-13T07:49:10.078Z\""},{"name":"Query","time":"\"2020-07-13T06:58:59.532Z\""},{"name":"Query","time":"\"2020-07-13T06:57:24.599Z\""},{"name":"Query","time":"\"2020-07-12T21:47:41.323Z\""},{"name":"Query","time":"\"2020-07-12T16:38:19.741Z\""},{"name":"Query","time":"\"2020-07-12T06:04:36.049Z\""},{"name":"Query","time":"\"2020-07-12T05:59:39.814Z\""},{"name":"Query","time":"\"2020-07-11T19:31:55.963Z\""},{"name":"Query","time":"\"2020-07-11T19:29:55.997Z\""},{"name":"Query","time":"\"2020-07-11T19:23:29.999Z\""},{"name":"Query","time":"\"2020-07-11T19:21:09.859Z\""},{"name":"Query","time":"\"2020-07-11T19:19:03.748Z\""},{"name":"Query","time":"\"2020-07-11T19:17:23.735Z\""},{"name":"Query","time":"\"2020-07-11T19:14:51.152Z\""},{"name":"Query","time":"\"2020-07-11T18:54:03.418Z\""},{"name":"Query","time":"\"2020-07-11T12:28:39.484Z\""},{"name":"Query","time":"\"2020-07-10T13:08:42.876Z\""},{"name":"Query","time":"\"2020-07-10T12:57:51.285Z\""}]',
                'referrer': 'https://support.hpe.com/hpesc/public/km/Security-Bulletin-Library',
                'visitorId': '33b0ede7-3274-486f-a31c-23ed3001ad91',
                'isGuestUser': 'false',
                'aq': '(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003) (#kmdoclanguagecode==(cv1871440,cv1871463))',
                'cq': '(#source=="cdp-km-document-pro-h4-v2")',
                'searchHub': 'HPE-SecurityBulletins-Page',
                'locale': 'ru',
                'firstResult': '0',
                'numberOfResults': '25',
                'excerptLength': '500',
                'enableDidYouMean': 'true',
                'sortCriteria': 'relevancy',
                'queryFunctions': '[]',
                'rankingFunctions': '[]',
'groupBy': r'[{"field":"#kmdocsecuritybulletin","maximumNumberOfValues":20,"sortCriteria":"nosort","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":["4000019","4000018","4000005","4000004","4000017","4000003","4000009","4000006","4000007","4000008","4000001","4000002","4000010","4000011","4000012","4000013","4000014","4000015","4000016"],"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdoclanguagecode==(cv1871440,cv1871463))","constantQueryOverride":"(#source==\"cdp-km-document-pro-h4-v2\")"},{"field":"#kmdoclanguagecode","maximumNumberOfValues":6,"sortCriteria":"Score","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":["cv1871440","cv1871463"],"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003)","constantQueryOverride":"(#source==\"cdp-km-document-pro-h4-v2\")"},{"field":"#kmdoctopissue","maximumNumberOfValues":6,"sortCriteria":"Score","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":[],"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003) (#kmdoclanguagecode==(cv1871440,cv1871463))","constantQueryOverride":"(#source==\"cdp-km-document-pro-h4-v2\") #kmdoctopissueexpirationdate>today"},{"field":"#kmdocdisclosurelevel","maximumNumberOfValues":6,"sortCriteria":"Score","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":[]},{"field":"#hpescuniversaldate","completeFacetWithStandardValues":true,"maximumNumberOfValues":1,"sortCriteria":"nosort","generateAutomaticRanges":true,"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003) (#kmdoclanguagecode==(cv1871440,cv1871463)) #uri","constantQueryOverride":"(#source==\"cdp-km-document-pro-h4-v2\") #hpescuniversaldate>1970/01/01#00:00:00"},{"field":"#hpescuniversaldate","completeFacetWithStandardValues":true,"maximumNumberOfValues":1,"sortCriteria":"nosort","generateAutomaticRanges":true,"constantQueryOverride":"(#source==\"cdp-km-document-pro-h4-v2\") #hpescuniversaldate>1970/01/01#00:00:00 #hpescuniversaldate>1970/01/01#00:00:00"},{"field":"#hpescuniversaldate","maximumNumberOfValues":5,"sortCriteria":"nosort","injectionDepth":1000,"completeFacetWithStandardValues":true,"rangeValues":[{"start":"1900-01-31T18:20:09.000Z","end":"2020-07-13T17:00:00.000Z","label":"All dates","endInclusive":false},{"start":"2020-07-05T17:00:00.000Z","end":"2020-07-13T17:00:00.000Z","label":"Last 7 days","endInclusive":false},{"start":"2020-06-12T17:00:00.000Z","end":"2020-07-13T17:00:00.000Z","label":"Last 30 days","endInclusive":false},{"start":"2020-05-13T17:00:00.000Z","end":"2020-07-13T17:00:00.000Z","label":"Last 60 days","endInclusive":false},{"start":"2020-04-13T17:00:00.000Z","end":"2020-07-12T17:00:00.000Z","label":"Last 90 days","endInclusive":false}]}]',
                'facetOptions': '{}',
                'categoryFacets': '[]',
                'retrieveFirstSentences': 'true',
                'timezone': 'Asia/Tomsk',
                'enableQuerySyntax': 'false',
                'enableDuplicateFiltering': 'false',
                'enableCollaborativeRating': 'false',
                'debug': 'false',
                'context': '{"tracking_id":"HPESCXwxYkRD5BgcAAFnGlJ0AAAAY","active_features":"DCS,DHFWS,SA2,patchCoveoSearchToggle,sa2_product_focus_target_levels_toggle,toggleCsr,toggleSecBulletin","user_tracking_id":"XwRimRD5AcgAAFl2OMkAAAAW"}',
                'allowQueriesWithoutKeywords': 'true',
            },
            callback=self.save_response,
            cb_kwargs=dict(path_dir=DATA_DIR, file_name='1.json')
        )]
Log
2020-07-14 07:17:33 [scrapy.core.engine] INFO: Spider opened
2020-07-14 07:17:33 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-07-14 07:17:33 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-07-14 07:17:34 [scrapy_user_agents.middlewares] DEBUG: Assigned User-Agent Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36
2020-07-14 07:17:34 [scrapy.core.engine] DEBUG: Crawled (401) <POST https://platform.cloud.coveo.com/rest/search/v2?count=3> (referer: None)
2020-07-14 07:17:34 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <401 https://platform.cloud.coveo.com/rest/search/v2?count=3>: HTTP status code is not handled or not allowed
2020-07-14 07:17:34 [scrapy.core.engine] INFO: Closing spider (finished)
2020-07-14 07:17:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
What am I doing wrong?
Traceback
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 192, in crawl
return self._crawl(crawler, *args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 196, in _crawl
d = crawler.crawl(*args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/twisted/internet/defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "/usr/local/lib/python3.7/site-packages/twisted/internet/defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "/usr/local/lib/python3.7/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
File "/code/hp_ux/splash/spiders/hp_ux_spider.py", line 50, in start_requests
cb_kwargs=dict(path_dir=DATA_DIR, file_name='1.json')
File "/usr/local/lib/python3.7/site-packages/scrapy/http/request/form.py", line 27, in __init__
super(FormRequest, self).__init__(*args, **kwargs)
builtins.TypeError: __init__() got an unexpected keyword argument 'params'
2020-07-14 11:32:04 [twisted] CRITICAL:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
File "/code/hp_ux/splash/spiders/hp_ux_spider.py", line 50, in start_requests
cb_kwargs=dict(path_dir=DATA_DIR, file_name='1.json')
File "/usr/local/lib/python3.7/site-packages/scrapy/http/request/form.py", line 27, in __init__
super(FormRequest, self).__init__(*args, **kwargs)
TypeError: __init__() got an unexpected keyword argument 'params'

You have to use headers with an Authorization token for this website:
def parse(self, response):
    headers = {
        'Connection': 'keep-alive',
'Authorization': 'Bearer eyJhbGciOiJIUzI1NiJ9.eyJwaXBlbGluZSI6ImNkcC1ocGVzYy1waXBlbGluZS1wcm8taDQtdjEyIiwidXNlckdyb3VwcyI6WyJMT0NBTF9QT1JUQUxfSFBQX1VTRVJTIiwiTE9DQUxfUE9SVEFMX0NPVU5UUllfVVMiLCJMT0NBTF9QT1JUQUxfTEFOR1VBR0VfRU4iLCJMT0NBTF9QT1JUQUxfQ09NUEFOWV9IUEUiLCJMT0NBTF9QT1JUQUxfR1VFU1RfVVNFUlMiXSwidjgiOnRydWUsIm9yZ2FuaXphdGlvbiI6Imhld2xldHRwYWNrYXJkcHJvZHVjdGlvbml3bWc5Yjl3IiwidXNlcklkcyI6W3sicHJvdmlkZXIiOiJFbWFpbCBTZWN1cml0eSBQcm92aWRlciIsIm5hbWUiOiJhbm9ueW1vdXNAY292ZW8uY29tIiwidHlwZSI6IlVzZXIifV0sInJvbGVzIjpbInF1ZXJ5RXhlY3V0b3IiXSwiZXhwIjoxNTk0ODEzODI0LCJpYXQiOjE1OTQ3Mjc0MjR9.O-SGmzsy2QdMClI9CfmN5MY9G1JBQmCe9m379zFpa4Y',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset="UTF-8"',
        'Accept': '*/*',
        'Origin': 'https://support.hpe.com',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://support.hpe.com/hpesc/public/km/Security-Bulletin-Library',
        'Accept-Language': 'en-US,en;q=0.9,ru-RU;q=0.8,ru;q=0.7,uk;q=0.6,en-GB;q=0.5',
    }
    data = {
        'actionsHistory': '[{"name":"Query","time":"\\"2020-07-14T11:50:24.995Z\\""},{"name":"Query","time":"\\"2020-07-14T11:15:14.602Z\\""}]',
        'referrer': '',
        'visitorId': 'deabe929-cc0e-41eb-ab62-f62e40aca82a',
        'isGuestUser': 'false',
        'aq': '(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003) (#kmdoclanguagecode==(cv1871440,cv1871463))',
        'cq': '(#source=="cdp-km-document-pro-h4-v2")',
        'searchHub': 'HPE-SecurityBulletins-Page',
        'locale': 'en',
        'firstResult': '25',
        'numberOfResults': '25',
        'excerptLength': '500',
        'enableDidYouMean': 'true',
        'sortCriteria': 'relevancy',
        'queryFunctions': '[]',
        'rankingFunctions': '[]',
'groupBy': '[{"field":"#kmdocsecuritybulletin","maximumNumberOfValues":20,"sortCriteria":"nosort","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":["4000019","4000018","4000005","4000004","4000017","4000003","4000009","4000006","4000007","4000008","4000001","4000002","4000010","4000011","4000012","4000013","4000014","4000015","4000016"],"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdoclanguagecode==(cv1871440,cv1871463))","constantQueryOverride":"(#source==\\"cdp-km-document-pro-h4-v2\\")"},{"field":"#kmdoclanguagecode","maximumNumberOfValues":6,"sortCriteria":"Score","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":["cv1871440","cv1871463"],"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003)","constantQueryOverride":"(#source==\\"cdp-km-document-pro-h4-v2\\")"},{"field":"#kmdoctopissue","maximumNumberOfValues":6,"sortCriteria":"Score","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":[],"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003) (#kmdoclanguagecode==(cv1871440,cv1871463))","constantQueryOverride":"(#source==\\"cdp-km-document-pro-h4-v2\\") #kmdoctopissueexpirationdate>today"},{"field":"#kmdocdisclosurelevel","maximumNumberOfValues":6,"sortCriteria":"Score","injectionDepth":1000,"completeFacetWithStandardValues":true,"allowedValues":[]},{"field":"#hpescuniversaldate","maximumNumberOfValues":5,"sortCriteria":"nosort","injectionDepth":1000,"completeFacetWithStandardValues":true,"rangeValues":[{"start":"1900-01-31T21:57:56.000Z","end":"2020-07-14T21:00:00.000Z","label":"All dates","endInclusive":false},{"start":"2020-07-06T21:00:00.000Z","end":"2020-07-14T21:00:00.000Z","label":"Last 7 days","endInclusive":false},{"start":"2020-06-13T21:00:00.000Z","end":"2020-07-14T21:00:00.000Z","label":"Last 30 days","endInclusive":false},{"start":"2020-05-14T21:00:00.000Z","end":"2020-07-14T21:00:00.000Z","label":"Last 60 days","endInclusive":false},{"start":"2020-04-14T21:00:00.000Z","end":"2020-07-13T21:00:00.000Z","label":"Last 90 days","endInclusive":false}]},{"field":"#hpescuniversaldate","completeFacetWithStandardValues":true,"maximumNumberOfValues":1,"sortCriteria":"nosort","generateAutomaticRanges":true,"advancedQueryOverride":"(#kmdoctypedetails==cv66000018) ((NOT #kmdoctype=cv60000001)) (#kmdocsecuritybulletin==4000003) (#kmdoclanguagecode==(cv1871440,cv1871463)) #uri","constantQueryOverride":"(#source==\\"cdp-km-document-pro-h4-v2\\") #hpescuniversaldate>1970/01/01#00:00:00"},{"field":"#hpescuniversaldate","completeFacetWithStandardValues":true,"maximumNumberOfValues":1,"sortCriteria":"nosort","generateAutomaticRanges":true,"constantQueryOverride":"(#source==\\"cdp-km-document-pro-h4-v2\\") #hpescuniversaldate>1970/01/01#00:00:00 #hpescuniversaldate>1970/01/01#00:00:00"}]',
        'facetOptions': '{}',
        'categoryFacets': '[]',
        'retrieveFirstSentences': 'true',
        'timezone': 'Europe/Kiev',
        'enableQuerySyntax': 'false',
        'enableDuplicateFiltering': 'false',
        'enableCollaborativeRating': 'false',
        'debug': 'false',
        'context': '{"tracking_id":"HPESCXw2cKBD5AcgAADvUM8IAAAAa","active_features":"DCS,DHFWS,SA2,patchCoveoSearchToggle,sa2_product_focus_target_levels_toggle,toggleCsr,toggleSecBulletin","user_tracking_id":"Xw2TthD5AcgAACecWi0AAAAZ"}',
        'allowQueriesWithoutKeywords': 'true'
    }
    url = 'https://platform.cloud.coveo.com/rest/search/v2?count=3'
    yield scrapy.FormRequest(
        url=url,
        formdata=data,
        headers=headers,
        callback=self.parse_result
    )
def parse_result(self, response):
    j_obj = json.loads(response.body_as_unicode())
    print(j_obj)
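As an aside, the log line "Ignoring response <401 ...>: HTTP status code is not handled or not allowed" means Scrapy's HttpError middleware dropped the response before any callback ran. While debugging, you can let 401 responses reach your callback so you can inspect the error body; a minimal sketch using Scrapy's handle_httpstatus_list attribute:

class HPUXSpider(_BaseSpider):
    # Pass 401 responses to the callback instead of silently dropping them.
    handle_httpstatus_list = [401]

Also note that the Authorization value is a JWT whose payload carries an exp claim, so a hard-coded token stops working once it expires. A quick, standard-library-only way to check when a copied token expires (a sketch; token is whatever Bearer value you copied from DevTools, without the "Bearer " prefix):

import base64
import json

def jwt_claims(token):
    # A JWT is three base64url segments: header.payload.signature.
    payload = token.split('.')[1]
    payload += '=' * (-len(payload) % 4)  # restore the padding that JWTs strip
    return json.loads(base64.urlsafe_b64decode(payload))

# print(jwt_claims(token)['exp'])  # Unix timestamp after which the API answers 401 again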

Related

requests.get return error HTTPSConnectionPool Python

The code below needs to return 200, but an error occurs for some domains.
import requests

url1 = 'https://www.pontofrio.com.br/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.11 (KHTML, like Gecko) '
                         'Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
response = requests.get(url1, headers, timeout=10)
print(response.status_code)
Return:
Traceback (most recent call last):
File "C:\Python34\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Python34\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "C:\Python34\lib\http\client.py", line 1148, in getresponse
response.begin()
File "C:\Python34\lib\http\client.py", line 352, in begin
version, status, reason = self._read_status()
File "C:\Python34\lib\http\client.py", line 314, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python34\lib\socket.py", line 371, in readinto
return self._sock.recv_into(b)
File "C:\Python34\lib\site-packages\urllib3\contrib\pyopenssl.py", line 309, in recv_into
return self.recv_into(*args, **kwargs)
File "C:\Python34\lib\site-packages\urllib3\contrib\pyopenssl.py", line 307, in recv_into
raise timeout('The read operation timed out')
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python34\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Python34\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Python34\lib\site-packages\urllib3\util\retry.py", line 367, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Python34\lib\site-packages\urllib3\packages\six.py", line 686, in reraise
raise value
File "C:\Python34\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "C:\Python34\lib\site-packages\urllib3\connectionpool.py", line 386, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "C:\Python34\lib\site-packages\urllib3\connectionpool.py", line 306, in _raise_timeout
raise ReadTimeoutError(self, url, "Read timed out. (read timeout=%s)" % timeout_value)
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='www.pontofrio.com.br', port=443): Read timed out. (read timeout=10)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:/teste.py", line 219, in <module>
url = montaurl(dominio)
File "c:/teste.py", line 81, in montaurl
response = requests.get(url1, headers, timeout=10)
File "C:\Python34\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Python34\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Python34\lib\site-packages\requests\adapters.py", line 529, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='www.pontofrio.com.br', port=443): Read timed out. (read timeout=10)
Domain that works:
https://www.pichau.com.br/
Domains that don't work:
casasbahia.com.br
extra.com.br
boticario.com.br
I believe it is some block on the pontofrio server; how can I get around this?
There seemed to be a couple of issues, the first being how the headers were being set. The call below doesn't actually pass the custom headers to requests.get: the function's second positional parameter is params, so the dict is sent as query-string parameters rather than headers.
response = requests.get(url1, headers, timeout=10)
This can be tested against httpbin:
import requests

url1 = 'https://httpbin.org/headers'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.11 (KHTML, like Gecko) '
                         'Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'
           }
response = requests.get(url1, headers, timeout=10)
print(response.text)
print(response.status_code)
Which outputs:
{
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.25.1",
    "X-Amzn-Trace-Id": "Root=1-608a0391-3f1cfa79444ac04865ad9111"
  }
}
200
To properly set the headers argument:
response = requests.get(url1, headers=headers, timeout=10)
Let's test:
import requests

url1 = 'https://httpbin.org/headers'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.11 (KHTML, like Gecko) '
                         'Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'
           }
response = requests.get(url1, headers=headers, timeout=10)
print(response.text)
print(response.status_code)
Here's the output:
{
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept-Encoding": "none",
    "Accept-Language": "en-US,en;q=0.8",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "X-Amzn-Trace-Id": "Root=1-608a0533-40c8281f5faa85d1050c6b6a"
  }
}
200
Finally, the order of the headers, and the 'Connection': 'keep-alive' header in particular, were causing problems. Once I reordered the headers and removed the Connection header, it started working on all of the urls.
Here's the code I used to test:
import requests

urls = ['https://www.pontofrio.com.br/',
        'https://www.casasbahia.com.br',
        'https://www.extra.com.br',
        'https://www.boticario.com.br']
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
           'Accept-Encoding': 'gzip, deflate, br',
           'Accept-Language': 'en-US,en;q=0.9',
           'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4491.0 Safari/537.36'}
for url1 in urls:
    print("Trying url: %s" % url1)
    response = requests.get(url1, headers=headers, timeout=10)
    print(response.status_code)
And the output:
Trying url: https://www.pontofrio.com.br/
200
Trying url: https://www.casasbahia.com.br
200
Trying url: https://www.extra.com.br
200
Trying url: https://www.boticario.com.br
200
I tested accessing the page with wget, but without success. The problem seems to be that the server responds only to HTTP/2 requests.
Test with curl:
This times out:
$ curl --http1.1 -A "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/81.0" "https://www.pontofrio.com.br/"
# times out
This succeeds (note the --http2 parameter):
$ curl --http2 -A "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/81.0" "https://www.pontofrio.com.br/"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
...
Unfortunately, the requests module doesn't support HTTP/2. You can, however, use the httpx module, which has experimental HTTP/2 support:
import asyncio

import httpx

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
}

async def get_text(url):
    async with httpx.AsyncClient(http2=True, headers=headers) as client:
        r = await client.get(url)
        return r.text

txt = asyncio.run(get_text("https://www.pontofrio.com.br/"))
print(txt)
Prints:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
...
To install the httpx module with HTTP/2 support, use for example: pip install httpx[http2]
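If you don't need async, the same thing works synchronously; a minimal sketch with httpx.Client, assuming the same headers as above:

import httpx

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0"}
with httpx.Client(http2=True, headers=headers) as client:
    r = client.get("https://www.pontofrio.com.br/")
    print(r.http_version)   # should report "HTTP/2"
    print(r.status_code)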

Instagram login with tor

I am trying to log in to Instagram through Tor.
(I am using Python 3 on a Linux machine, if this helps!)
Here is the code:
import json
import os

import requests
from colorama import Fore
from stem import Signal
from stem.control import Controller

def tor_session():
    session = requests.session()
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'
    return session

def login(username, password):
    # params:
    #   [string] username - the username of the instagram account to log in to
    #   [string] password - the password to use in the log in process
    # description:
    #   logs in to the account with the specified username and with the specified password
    # session setup
    sess = tor_session()
    sess.cookies.update({
        'sessionid': '',
        'mid': '',
        'ig_pr': '1',
        'ig_vw': '1920',
        'csrftoken': '',
        's_network': '',
        'ds_user_id': ''
    })
    sess.headers.update({
        'UserAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'x-instagram-ajax': '1',
        'X-Requested-With': 'XMLHttpRequest',
        'origin': 'https://www.instagram.com',
        'ContentType': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Referer': 'https://www.instagram.com',
        'authority': 'www.instagram.com',
        'Host': 'www.instagram.com',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Encoding': 'gzip, deflate'
    })
    # get csrftoken and the instagram main page
    r = sess.get('https://www.instagram.com/')
    sess.headers.update({'X-CSRFToken': r.cookies.get_dict()['csrftoken']})
    # log in
    data = {'username': username, 'password': password}
    r = sess.post('https://www.instagram.com/accounts/login/ajax/', data=data, allow_redirects=True)
    token = r.cookies.get_dict()['csrftoken']
    sess.headers.update({'X-CSRFToken': token})
    # parse the response from the log in
    data = json.loads(r.text)
    print(data)
    if data['status'] == 'fail':
        return None
    if data['authenticated']:
        return True
    else:
        return False

login("username", "password")
The problem is that almost every time I have tried to run this,
it didn't work and threw an exception:
Traceback (most recent call last):
File "main.py", line 156, in <module>
main()
File "main.py", line 152, in main
brute_force(username, pass_file_path)
File "main.py", line 114, in brute_force
logged_in = login(username, password)
File "main.py", line 81, in login
sess.headers.update({'X-CSRFToken': r.cookies.get_dict()['csrftoken']})
KeyError: 'csrftoken'
and sometimes it threw this exception:
File "main.py", line 94, in login
if data['authenticated']:
KeyError: 'authenticated'
How can I fix this?
I tried restarting Tor and changing its configs,
but nothing works.
Please help if you can!
It appears that Instagram doesn't set cookies for Tor users:
>>> s = your_setup_code_for_session()
>>> r = s.get('https://www.instagram.com')
>>> r.cookies.get_dict()
{}
I also tested this using the Tor Browser and got the same results.
It looks like you'll need to use a VPN or a Tor + VPN combination.
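In the meantime, a defensive variant of the csrftoken lookup avoids the raw KeyError and makes the failure explicit (a sketch; it drops into your existing login function):

r = sess.get('https://www.instagram.com/')
token = r.cookies.get_dict().get('csrftoken')  # None instead of KeyError when the cookie is missing
if token is None:
    raise RuntimeError('No csrftoken cookie returned; Instagram is likely blocking this Tor exit node')
sess.headers.update({'X-CSRFToken': token})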

Using WebSocket for Web Page Data Scraping

I want to scrape some of the data from here, which is implemented based on WebSockets. So, after inspecting Chrome DevTools for the wss address and headers:
and the negotiation message:
I wrote:
from websocket import create_connection

headers = {
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,fa;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'Upgrade',
    'Host': 'stream179.forexpros.com',
    'Origin': 'https://www.investing.com',
    'Pragma': 'no-cache',
    'Sec-WebSocket-Extensions': 'client_max_window_bits',
    'Sec-WebSocket-Key': 'ldcvnZNquzPkSNvpSdI09g==',
    'Sec-WebSocket-Version': '13',
    'Upgrade': 'websocket',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
ws = create_connection('wss://stream179.forexpros.com/echo/894/l27e2ja8/websocket', header=headers)
nego_message = '''["{\"_event\":\"bulk-subscribe\",\"tzID\":8,\"message\":\"pid-1:%%pid-8839:%%pid-166:%%pid-20:%%pid-169:%%pid-170:%%pid-44336:%%pid-27:%%pid-172:%%pid-2:%%pid-3:%%pid-5:%%pid-7:%%pid-9:%%pid-10:%%pid-945629:%%pid-11:%%pid-16:%%pid-68:%%pidTechSumm-1:%%pidTechSumm-2:%%pidTechSumm-3:%%pidTechSumm-5:%%pidTechSumm-7:%%pidTechSumm-9:%%pidTechSumm-10:%%pidExt-1:%%event-393634:%%event-393633:%%event-393636:%%event-393638:%%event-394479:%%event-394518:%%event-394514:%%event-394516:%%event-394515:%%event-394517:%%event-393654:%%event-394467:%%event-393653:%%event-394468:%%event-394545:%%event-394549:%%event-394548:%%event-394547:%%event-394550:%%event-394546:%%event-394551:%%event-394553:%%event-394552:%%event-394743:%%event-394744:%%event-393661:%%event-394469:%%event-394470:%%event-393680:%%event-393682:%%event-393681:%%event-393687:%%event-393694:%%event-393685:%%event-393689:%%event-393688:%%event-393695:%%event-393698:%%event-393704:%%event-393705:%%event-393724:%%event-393723:%%event-393725:%%event-393726:%%event-394591:%%event-393736:%%event-393733:%%event-393734:%%event-393740:%%event-393731:%%event-393732:%%event-393730:%%event-394617:%%event-394616:%%event-393737:%%event-378304:%%event-393645:%%event-394619:%%event-393755:%%event-393757:%%event-393760:%%event-393756:%%event-393758:%%event-393759:%%event-393761:%%event-393762:%%event-394481:%%event-394625:%%event-393754:%%event-394483:%%event-393775:%%event-394621:%%event-394622:%%event-376710:%%event-394623:%%event-394484:%%event-394624:%%isOpenExch-1:%%isOpenExch-2:%%isOpenExch-13:%%isOpenExch-3:%%isOpenExch-4:%%isOpenPair-1:%%isOpenPair-8839:%%isOpenPair-44336:%%cmt-1-5-1:%%domain-1:\"}"]'''
ws.send(nego_message)
while True:
    print(ws.recv())
but I'm getting:
o
Traceback (most recent call last):
File "test.py", line 647, in <module>
print(ws.recv())
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 313, in recv
opcode, data = self.recv_data()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 330, in recv_data
opcode, frame = self.recv_data_frame(control_frame)
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 343, in recv_data_frame
frame = self.recv_frame()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 377, in recv_frame
return self.frame_buffer.recv_frame()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_abnf.py", line 361, in recv_frame
self.recv_header()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_abnf.py", line 309, in recv_header
header = self.recv_strict(2)
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_abnf.py", line 396, in recv_strict
bytes_ = self.recv(min(16384, shortage))
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 452, in _recv
return recv(self.sock, bufsize)
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_socket.py", line 115, in recv
"Connection is already closed.")
websocket._exceptions.WebSocketConnectionClosedException: Connection is already closed.
[Finished in 1.9s]
What am I missing here?
Update 1: updated code using WebSocketApp:
def on_message(ws, message):
    print("message:", message)

def on_error(ws, error):
    print("error:", error)

def on_close(ws):
    print("closed.")

def on_open(ws):
    print("opened")
    time.sleep(1)
    ws.send(nego_message)

ws = websocket.WebSocketApp(
    "wss://stream179.forexpros.com/echo/894/l27e2ja8/websocket",
    on_open=on_open,
    on_message=on_message,
    on_error=on_error,
    on_close=on_close,
    header=headers
)
websocket.enableTrace(True)
ws.run_forever()
but still no success:
--- request header ---
GET /echo/894/l27e2ja8/websocket HTTP/1.1
Upgrade: websocket
Connection: Upgrade
Host: stream179.forexpros.com
Origin: http://stream179.forexpros.com
Accept-Encoding: gzip, deflate, br
Accept-Language: en-US,en;q=0.9,fa;q=0.8
Cache-Control: no-cache
Connection: Upgrade
Host: stream179.forexpros.com
Origin: https://www.investing.com
Pragma: no-cache
Sec-WebSocket-Extensions: client_max_window_bits
Sec-WebSocket-Key: ldcvnZNquzPkSNvpSdI09g==
Sec-WebSocket-Version: 13
Upgrade: websocket
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36
-----------------------
--- response header ---
HTTP/1.1 101 Switching Protocols
Upgrade: websocket
Connection: Upgrade
Sec-WebSocket-Accept: XPKKpUMZLpSYx/1z8Q0499hcobs=
-----------------------
opened
send: b'\x81\xfe\x06{_\xda7\xd2\x04\xf8L\xf0\x00\xbfA\xb71\xae\x15\xe8}\xb8B\xbe4\xf7D\xa7=\xa9T\xa06\xb8R\xf0s\xf8C\xa8\x16\x9e\x15\xe8g\xf6\x15\xbf:\xa9D\xb38\xbf\x15\xe8}\xaa^\xb6r\xeb\r\xf7z\xaa^\xb6r\xe2\x0f\xe1f\xe0\x12\xf7/\xb3S\xffn\xec\x01\xe8z\xffG\xbb;\xf7\x05\xe2e\xff\x12\xa26\xbe\x1a\xe3i\xe3\r\xf7z\xaa^\xb6r\xeb\x00\xe2e\xff\x12\xa26\xbe\x1a\xe6k\xe9\x04\xe4e\xff\x12\xa26\xbe\x1a\xe0h\xe0\x12\xf7/\xb3S\xffn\xed\x05\xe8z\xffG\xbb;\xf7\x05\xe8z\xffG\xbb;\xf7\x04\xe8z\xffG\xbb;\xf7\x02\xe8z\xffG\xbb;\xf7\x00\xe8z\xffG\xbb;\xf7\x0e\xe8z\xffG\xbb;\xf7\x06\xe2e\xff\x12\xa26\xbe\x1a\xebk\xef\x01\xe0f\xe0\x12\xf7/\xb3S\xffn\xeb\r\xf7z\xaa^\xb6r\xeb\x01\xe8z\xffG\xbb;\xf7\x01\xeae\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe3e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe0e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe1e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe7e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe5e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xebe\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe3o\xe0\x12\xf7/\xb3S\x97\'\xae\x1a\xe3e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe9\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4l\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xe1i\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xec\x04\xeae\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6k\xed\x0e\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe7n\xe2\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe3k\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xef\x06\xe4e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xeb\x02\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe7n\xed\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xe7k\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xee\x01\xe5e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xef\x04\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe6i\xe2\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe6j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xef\x03\xebe\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xee\x0f\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe7k\xed\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe7o\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xef\x03\xe4e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xef\x06\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe7j\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe7m\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xed\x03\xe1e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6h\xee\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4i\xeb\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x03\xe4f\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xee\x00\xe2e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe2\x07\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4g\xe8\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xean\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xec\x0f\xe5e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe3\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4g\xef\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xeaf\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xec\x0f\xeae\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe3\x02\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4f\xe2\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe2k\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x07\xe7e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xe8\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5m\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe0j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x05\xe4e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xe3\x06\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5l\xec\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe1l\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x04\xe6e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xee\x07\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5l\xeb\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe1m\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x04\xe2e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6i\xeb\x00\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe4n\xec\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe1h\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xe5g\xe9\x07\xe6e\xff\x12\x
b7)\xbfY\xa6r\xe9\x0e\xe1i\xee\x02\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe4n\xe3\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe7j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x02\xe5e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xec\x07\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5j\xec\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe7g\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x02\xebe\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xec\x06\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5i\xe8\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x03\xean\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xec\x05\xe7e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xef\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe6g\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe5j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xec\x05\xe3e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6i\xe8\x05\xe8z\xffR\xa4:\xb4C\xffl\xed\x01\xe5n\xea\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x01\xe0l\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xee\x0f\xe6e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6i\xe8\x03\xe8z\xff^\xa1\x10\xaaR\xbc\x1a\xa2T\xbar\xeb\r\xf7z\xb3D\x9d/\xbfY\x97\'\xb9_\xffm\xe0\x12\xf76\xa9x\xa2:\xb4r\xaa<\xb2\x1a\xe3l\xe0\x12\xf76\xa9x\xa2:\xb4r\xaa<\xb2\x1a\xe1e\xff\x12\xbb,\x95G\xb71\x9fO\xb17\xf7\x03\xe8z\xff^\xa1\x10\xaaR\xbc\x0f\xbb^\xa0r\xeb\r\xf7z\xb3D\x9d/\xbfY\x82>\xb3E\xffg\xe2\x04\xebe\xff\x12\xbb,\x95G\xb71\x8aV\xbb-\xf7\x03\xe6l\xe9\x01\xe8z\xffT\xbf+\xf7\x06\xffj\xf7\x06\xe8z\xffS\xbd2\xbb^\xbcr\xeb\r\xf0"\xf8j'
message: o
send: b'\x88\x82!\xdd\x07\xcf"5'
closed.
[Finished in 2.3s]
I tried removing all the backslash escapes from the message being sent (sending a plain JSON string rather than the escaped, array-wrapped form), and eventually it worked.
nego_message = '{"_event":"bulk-subscribe","tzID":8,"message":"pid-0:%%isOpenExch-1:%%pid-8849:%%isOpenExch-1004:%%pid-8833:%%pid-8862:%%pid-8830:%%pid-8836:%%pid-8831:%%pid-8916:%%pid-8832:%%pid-169:%%pid-20:%%isOpenExch-2:%%pid-166:%%pid-172:%%isOpenExch-4:%%pid-27:%%isOpenExch-3:%%pid-167:%%isOpenExch-9:%%pid-178:%%isOpenExch-20:%%pid-6408:%%pid-6369:%%pid-13994:%%pid-6435:%%pid-13063:%%pid-26490:%%pid-243:%%pid-1:%%isOpenExch-1002:%%pid-2:%%pid-3:%%pid-5:%%pid-7:%%pid-9:%%pid-10:%%pid-23705:%%pid-23706:%%pid-23703:%%pid-23698:%%pid-8880:%%isOpenExch-118:%%pid-8895:%%pid-1141794:%%pid-1175152:%%isOpenExch-152:%%pid-1175153:%%pid-14958:%%pid-44336:%%isOpenExch-97:%%pid-8827:%%pid-6497:%%pid-941155:%%pid-104395:%%pid-1013048:%%pid-1055979:%%pid-1177973:%%pid-1142416:%%pidExt-1:%%cmt-1-5-1:%%pid-252:%%pid-1031244:%%isOpenExch-125:"}'
ws.send(nego_message)
while True:
    print(ws.recv())
Outputs:
a["{\"message\":\"pid-3::{\\\"pid\\\":\\\"3\\\",\\\"last_dir\\\":\\\"greenBg\\\",\\\"last_numeric\\\":149.19,\\\"last\\\":\\\"149.19\\\",\\\"bid\\\":\\\"149.18\\\",\\\"ask\\\":\\\"149.19\\\",\\\"high\\\":\\\"149.29\\\",\\\"low\\\":\\\"149.12\\\",\\\"last_close\\\":\\\"149.26\\\",\\\"pc\\\":\\\"-0.07\\\",\\\"pcp\\\":\\\"-0.05%\\\",\\\"pc_col\\\":\\\"redFont\\\",\\\"turnover\\\":\\\"18.13K\\\",\\\"turnover_numeric\\\":\\\"18126\\\",\\\"time\\\":\\\"0:39:09\\\",\\\"timestamp\\\":1666139948}\"}"]
The while loop is calling ws.recv() twice: the first call returns the o frame, and the second call runs after the server has already closed the connection. If you simply do:
print(ws.recv())
it will not attempt to call .recv() on a closed connection. That is why your output prints o before the stack trace.
As an aside, it seems like you might want a longer-running connection using websocket.WebSocketApp (example) for a scrape.
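A minimal sketch of that longer-running setup, reusing the cleaned nego_message and headers from above (ping_interval keeps the socket alive between frames):

import websocket

def on_open(ws):
    ws.send(nego_message)  # the cleaned, unescaped message from above

def on_message(ws, message):
    print("message:", message)

ws = websocket.WebSocketApp(
    "wss://stream179.forexpros.com/echo/894/l27e2ja8/websocket",
    on_open=on_open,
    on_message=on_message,
    header=headers,
)
ws.run_forever(ping_interval=30)  # blocks; sends a ping every 30 s to keep the connection open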

Scrapy throws Exception "raise _DefGen_Return(val) twisted.internet.defer._DefGen_Return: "

When I run the code locally (Windows 10), everything works fine.
I have checked other answers here and other resources, but failed to figure out any solution.
After deploying to ScrapingHub, I'm getting this error message:
[scrapy.core.scraper] Spider error processing <POST http://oris.co.palm-beach.fl.us/or_web1/new_sch.asp> (referer: http://oris.co.palm-beach.fl.us/or_web1/)
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1299, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.6/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1276, in returnValue
raise _DefGen_Return(val)
twisted.internet.defer._DefGen_Return: <200 http://oris.co.palm-beach.fl.us/or_web1/new_sch.asp>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 42, in process_spider_input
result = method(response=response, spider=spider)
File "/usr/local/lib/python3.6/site-packages/scrapy_pagestorage.py", line 68, in process_spider_input
self.save_response(response, spider)
File "/usr/local/lib/python3.6/site-packages/scrapy_pagestorage.py", line 102, in save_response
self._writer.write(payload)
File "/usr/local/lib/python3.6/site-packages/scrapinghub/hubstorage/batchuploader.py", line 224, in write
data = jsonencode(item)
File "/usr/local/lib/python3.6/site-packages/scrapinghub/hubstorage/serialization.py", line 38, in jsonencode
return dumps(o, default=jsondefault)
File "/usr/local/lib/python3.6/json/__init__.py", line 238, in dumps
**kw).encode(obj)
File "/usr/local/lib/python3.6/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/local/lib/python3.6/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
TypeError: keys must be a string
Here is a snippet of my Scrapy function that throws this error.
The ToDate and FromDate are passed as arguments to the spider:
start_urls = ['http://oris.co.palm-beach.fl.us/or_web1/']

def parse(self, response):
    # inspect_response(response, self)
    url = 'http://oris.co.palm-beach.fl.us/or_web1/new_sch.asp'
    headers = {
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'origin': "http://oris.co.palm-beach.fl.us",
        'content-type': "application/x-www-form-urlencoded",
        'dnt': "1",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'cache-control': "no-cache",
    }
    # Date range should be within 90 days
    data = {'FromDate': self.FromDate,
            'PageSize': '500',
            'RecSetSize': '500',
            'ToDate': self.ToDate,
            'consideration': '',
            'search_by': 'DocType',
            'search_entry': 'LP'}
    body = urlencode(data)
    yield scrapy.Request(url, method="POST", headers=headers, body=body, callback=self.parsed)

def parsed(self, response):
    # inspect_response(response, self)
    # Getting all View urls.
    urls = response.xpath("//a[@class = 'list_2']/@href").extract()
    for url in urls:
        url = url.replace('\r', '').replace('\t', '').replace('\n', '')
        url = response.urljoin(url)
        url = url.replace('details.asp', 'details_des.asp') + '&linked=&party_seq='
        yield scrapy.Request(url, callback=self.details)
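Not related to the error, but the manual urlencode + body construction in parse can be expressed with FormRequest, which urlencodes the dict and sets the Content-Type header for you; a one-line sketch:

yield scrapy.FormRequest(url, formdata=data, headers=headers, callback=self.parsed)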
OK, the issue was with "Messagepack is not available" (this was in the debug log, not in the errors, though) and with page storage being enabled for this project.
I have disabled page storage and it works fine now.
I wish error messages were more readable in Scrapy and ScrapingHub.

scrapy - [twisted] NameError: name 'connect' is not defined

I am trying to use Scrapy on Ubuntu 16 with Python 3.6. When I run "scrapy crawl mySpiderName", it shows this error:
2018-06-17 11:18:50 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: ToutiaoAppSpider)
2018-06-17 11:18:50 [scrapy.utils.log] INFO: Versions: lxml 3.5.0.0, libxml2 2.9.3, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 18.4.0, Python 3.5.2 (default, Nov 23 2017, 16:37:01) - [GCC 5.4.0 20160609], pyOpenSSL 18.0.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.2.2, Platform Linux-4.13.0-43-generic-x86_64-with-Ubuntu-16.04-xenial
2018-06-17 11:18:50 [scrapy.crawler] INFO: Overridden settings: {'SPIDER_MODULES': ['ToutiaoAppSpider.spiders'], 'CONCURRENT_REQUESTS_PER_IP': 10, 'DOWNLOAD_TIMEOUT': 15, 'CONCURRENT_REQUESTS': 10, 'NEWSPIDER_MODULE': 'ToutiaoAppSpider.spiders', 'BOT_NAME': 'ToutiaoAppSpider', 'CONCURRENT_REQUESTS_PER_DOMAIN': 10}
2018-06-17 11:18:50 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.memusage.MemoryUsage']
2018-06-17 11:18:50 [ToutiaoHao] INFO: Reading start URLs from redis key 'ToutiaoHao:start_urls' (batch size: 10, encoding: utf-8
2018-06-17 11:18:50 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-06-17 11:18:50 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
Unhandled error in Deferred:
2018-06-17 11:18:50 [twisted] CRITICAL: Unhandled error in Deferred:
2018-06-17 11:18:50 [twisted] CRITICAL:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/twisted/internet/defer.py", line 1386, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.5/dist-packages/scrapy/crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "/usr/local/lib/python3.5/dist-packages/scrapy/crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/usr/local/lib/python3.5/dist-packages/scrapy/core/engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "/usr/local/lib/python3.5/dist-packages/scrapy/core/scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/usr/local/lib/python3.5/dist-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/usr/local/lib/python3.5/dist-packages/scrapy/middleware.py", line 36, in from_settings
mw = mwcls.from_crawler(crawler)
File "/home/tuijian/crawler/ToutiaoAppSpider/ToutiaoAppSpider/pipelines.py", line 34, in from_crawler
redis_host=crawler.settings['REDIS_HOST'])
File "/home/tuijian/crawler/ToutiaoAppSpider/ToutiaoAppSpider/pipelines.py", line 17, in __init__
self.db = connect[mongodb_db]
NameError: name 'connect' is not defined
I searched this situation on Stack Overflow, but most questions are not suitable for my problem.
Settings.py content is shown below:
# -*- coding: utf-8 -*-
# Scrapy settings for ToutiaoAppSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'ToutiaoAppSpider'
SPIDER_MODULES = ['ToutiaoAppSpider.spiders']
NEWSPIDER_MODULE = 'ToutiaoAppSpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ToutiaoAppSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 10
CONCURRENT_REQUESTS_PER_IP = 10
DOWNLOAD_TIMEOUT = 15
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'ToutiaoAppSpider.middlewares.ToutiaoappspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None
#'ToutiaoAppSpider.middlewares.ProxyMiddleware': 543,
#'ToutiaoAppSpider.middlewares.Push2RedisMiddleware': 544,
#'ToutiaoAppSpider.middlewares.SkipYg365Middleware': 545,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'ToutiaoAppSpider.pipelines.ToutiaoappspiderPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HEADERS = {
'Host': 'is.snssdk.com',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip',
'request_timestamp_client': '332763604',
'X-SS-REQ-TICKET': '',
'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 7.1.2; Redmi 4X MIUI/8.2.1) NewsArticle/6.5.8 cronet/58.0.2991.0'
}
DETAIL_HEADERS = {
'Accept-Encoding': 'gzip',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'User-Agent': 'Mozilla/5.0 (Linux; Android 7.1.2; Redmi 4X Build/N2G47H; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/62.0.3202.84 Mobile Safari/537.36 JsSdk/2 NewsArticle/6.5.8 NetType/wifi',
'Connection': 'Keep-Alive'
}
COMPLEX_DETAIL_HEADERS = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'cache-control': 'max-age=0',
'dnt': '1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
}
TOUTIAOHAO_HEADERS = {
'accept': 'application/json, text/javascript',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'content-type': 'application/x-www-form-urlencoded',
'dnt': '1',
'referer': 'https://www.toutiao.com/c/user/{user_id}/',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
'x-requested-with': 'XMLHttpRequest'}
FEED_URL_old = 'https://is.snssdk.com/api/news/feed/v77/?list_count={list_count}&category={category}&refer=1&count=20' \
'{max_behot_time}&last_refresh_sub_entrance_interval=1519286082&loc_mode=7&tt_from=pre_load_more' \
'&plugin_enable=3&_d_s=1&iid=26133660951&device_id=37559129771&ac=wifi&channel=tianzhuo_toutiao_sg' \
'&aid=13&app_name=news_article&version_code=658&version_name=6.5.8&device_platform=android' \
'&ab_version=281719%2C278039%2C249665%2C249684%2C249686%2C249642%2C249670%2C249673%2C281732%2C229304%2C249671' \
'%2C282686%2C282218%2C275584%2C277466%2C281426%2C280418%2C232362%2C265707%2C279809%2C239097%2C170988%2C281158' \
'%2C269426%2C273499%2C279386%2C281391%2C281612%2C276203%2C281098%2C257281%2C281472%2C280149%2C277718%2C278670' \
'%2C271717%2C259492%2C280773%2C282147%2C272683%2C251795%2C276758%2C282776%2C251713%2C280097%2C282669%2C31210' \
'%2C279129%2C270335%2C280969%2C227649%2C280220%2C264034%2C258356%2C247850%2C280448%2C282714%2C281293%2C278160' \
'%2C249045%2C244746%2C264615%2C260657%2C241181%2C282157%2C271178%2C252767%2C249828%2C246859' \
'&ab_client=a1%2Cc4%2Ce1%2Cf2%2Cg2%2Cf7&ab_group=100167&ab_feature=94563%2C102749&abflag=3&ssmix=a' \
'&device_type=Redmi+4X&device_brand=Xiaomi&language=zh&os_api=25&os_version=7.1.2&uuid=864698037116551' \
'&openudid=349f495868d2f06d&manifest_version_code=658&resolution=720*1280&dpi=320&update_version_code=65809' \
'&_rticket={_rticket}&plugin=10575&fp=TlTqLYFuLlXrFlwSPrU1FYmeFSwt&rom_version=miui_v9_8.2.1' \
'&ts={ts}&as={_as}&mas={mas}&cp={cp}'
FEED_URL = 'http://ib.snssdk.com/api/news/feed/v78/?fp=RrTqLWwuL2GSFlHSFrU1FYFeL2Fu&version_code=6.6.0' \
'&app_name=news_article&vid=49184A00-1B68-4FE9-9FD5-B24BED70032C&device_id=48155472608&channel=App%20Store' \
'&resolution=1242*2208&aid=13&ab_version=264071,275782,275652,271178,252769,249828,246859,285542,278039,' \
'249664,249685,249687,249675,249668,249669,249673,281730,229305,249672,285965,285209,283849,277467,283757,' \
'286568,285618,284439,280773,286862,286446,285535,251712,283794,285944,31210,285223,283836,286558,258356,' \
'247849,280449,284752,281296,249045,275612,278760,283491,264613,260652,286224,286477,261338,241181,283777,' \
'285703,285404,283370,286426,239096,286955,170988,273497,285788,279386,281389,276203,286300,286056,286878,' \
'257282,281472,277769,280147,284908&ab_feature=201616,z1&ab_group=z1,201616' \
'&openudid=a8b364d577ac6c59e96dbcf3cc57d9c4eaa9420c&idfv=49184A00-1B68-4FE9-9FD5-B24BED70032C&ac=WIFI' \
'&os_version=11.2.5&ssmix=a&device_platform=iphone&iid=25922108613&ab_client=a1,f2,f7,e1' \
'&device_type=iPhone%208%20Plus&idfa=2F0B6A5F-B939-4EB5-9D51-D81698D8729D&refresh_reason=4&category={category}' \
'&last_refresh_sub_entrance_interval=1519794464&detail=1&tt_from=unknown&count=20&list_count={list_count}' \
'&LBS_status=authroize&loc_mode=1&cp={cp}&{max_behot_time}&session_refresh_idx=1&image=1&strict=0&refer=1' \
'&city=%E5%8C%97%E4%BA%AC&concern_id=6215497896830175745&language=zh-Hans-CN&st_time=67&as={_as}&ts={ts}'
DETAIL_URL = 'http://is.snssdk.com/2/article/information/v23/?fp=RrTqLWwuL2GSFlHSFrU1FYFeL2Fu&version_code=6.6.0' \
'&app_name=news_article&vid=49184A00-1B68-4FE9-9FD5-B24BED70032C&device_id=48155472608' \
'&channel=App%20Store&resolution=1242*2208&aid=13&ab_version=264071,275782,275652,271178,252769,249828,' \
'246859,283790,278039,249664,249685,249687,249675,249668,249669,249673,281730,229305,249672,283849,' \
'277467,283757,284439,280773,283091,283787,282775,251712,283794,280097,284801,31210,283489,283836,280966,' \
'280220,258356,247849,280449,284752,281296,281724,249045,281414,275612,278760,283491,264613,282974,260652,' \
'261338,241181,283777,284201,282897,283370,239096,170988,269426,273497,279386,281389,281615,276203,279014,' \
'257282,281472,277769,280147&ab_feature=201616,z1&ab_group=z1,201616' \
'&openudid=a8b364d577ac6c59e96dbcf3cc57d9c4eaa9420c&idfv=49184A00-1B68-4FE9-9FD5-B24BED70032C' \
'&ac=WIFI&os_version=11.2.5&ssmix=a&device_platform=iphone&iid=25922108613&ab_client=a1,f2,f7,e1' \
'&device_type=iPhone%208%20Plus&idfa=2F0B6A5F-B939-4EB5-9D51-D81698D8729D&article_page=0' \
'&group_id={group_id}&device_id=48155472608&aggr_type=1&item_id={item_id}&from_category=__all__' \
'&as={_as}&ts={ts}'
COMMENTS_URL = 'https://is.snssdk.com/article/v2/tab_comments/?group_id={group_id}&item_id={item_id}&aggr_type=1' \
'&count=20&offset={offset}&tab_index=0&fold=1&iid=26133660951&device_id=37559129771&ac=wifi' \
'&channel=tianzhuo_toutiao_sg&aid=13&app_name=news_article&version_code=658&version_name=6.5.8' \
'&device_platform=android&ab_version=281719%2C278039%2C249665%2C249684%2C249686%2C283244%2C249642' \
'%2C249670%2C249673%2C281732%2C229304%2C249671%2C282686%2C282218%2C275584%2C277466%2C281426' \
'%2C280418%2C282898%2C232362%2C265707%2C279809%2C239097%2C170988%2C281158%2C269426%2C273499%2C279386' \
'%2C281391%2C281612%2C276203%2C281098%2C257281%2C281472%2C280149%2C277718%2C283104%2C271717%2C259492' \
'%2C283184%2C280773%2C282147%2C272683%2C251795%2C283177%2C282776%2C251713%2C280097%2C282669%2C31210' \
'%2C283097%2C283138%2C270335%2C280969%2C227649%2C280220%2C264034%2C258356%2C247850%2C280448%2C283165' \
'%2C281293%2C278160%2C249045%2C244746%2C264615%2C282973%2C260657%2C241181%2C282157%2C271178%2C252767' \
'%2C249828%2C246859&ab_client=a1%2Cc4%2Ce1%2Cf2%2Cg2%2Cf7&ab_group=100167&ab_feature=94563%2C102749' \
'&abflag=3&ssmix=a&device_type=Redmi+4X&device_brand=Xiaomi&language=zh&os_api=25&os_version=7.1.2' \
'&uuid=864698037116551&openudid=349f495868d2f06d&manifest_version_code=658&resolution=720*1280' \
'&dpi=320&update_version_code=65809&_rticket={_rticket}&plugin=10575&fp=TlTqLYFuLlXrFlwSPrU1FYmeFSwt' \
'&rom_version=miui_v9_8.2.1&ts={ts}&as={_as}&mas={mas}'
ACCOUNT_HEADERS = {
'accept': 'application/json, text/javascript',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'content-type': 'application/x-www-form-urlencoded',
# 'cookie:WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=160b9d56b07aa1-0322b0f040feaa-32607e02-13c680-160b9d56b08f0f; uuid="w:38808a712dd144679f3b524be9378a9e"; __utmc=24953151; __utmz=24953151.1515146646.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=24953151.940475929.1515146646.1515146646.1515149010.2; _ga=GA1.2.940475929.1515146646; utm_source=toutiao; tt_webid=74484831828; CNZZDATA1259612802=2101983015-1514944629-https%253A%252F%252Fwww.google.com.ph%252F%7C1519894157; __tasessionId=ni5yklwbb1519898143309; tt_webid=6527912825075254788
'dnt': '1',
# 'referer': 'https://www.toutiao.com/search/?keyword={}',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
'x-requested-with': 'XMLHttpRequest'
}
MONGODB_URI = 'mongodb://wifi:zenmen@10.19.83.217'
MONGODB_DB = 'toutiao_app'
MONGODB_COLLECTION_NEWS = 'news'
MONGODB_COLLECTION_DETAIL = 'detail'
MONGODB_COLLECTION_COMMENTS = 'comments'
REDIS_HOST = '54.169.202.250'
Please, is there anyone who can help me fix this problem?
Thank you!
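For reference, the traceback points at pipelines.py line 17, self.db = connect[mongodb_db]: the name connect is never assigned anywhere, hence the NameError. A minimal sketch of what such a pipeline usually looks like, assuming pymongo (the setting names are taken from the Settings.py above; the rest is illustrative, not the asker's actual pipeline):

import pymongo

class ToutiaoappspiderPipeline(object):
    def __init__(self, mongodb_uri, mongodb_db, redis_host):
        # Create the client first, then index into it; the original code
        # indexed into a 'connect' name that was never defined.
        self.client = pymongo.MongoClient(mongodb_uri)
        self.db = self.client[mongodb_db]
        self.redis_host = redis_host  # kept only because the original passed it in

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongodb_uri=crawler.settings['MONGODB_URI'],
            mongodb_db=crawler.settings['MONGODB_DB'],
            redis_host=crawler.settings['REDIS_HOST'],
        )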
