Trouble converting a curl command to a request-promise command in nodejs - node.js

So I know that this curl command does work. When I run it, I get the html document that should appear after this login:
curl 'https://login.url'
-H 'Pragma: no-cache'
-H 'Origin: https://blah'
-H 'Accept-Encoding: gzip, deflate, br'
-H 'Accept-Language: en-US,en;q=0.8'
-H 'Upgrade-Insecure-Requests: 1'
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
-H 'Content-Type: application/x-www-form-urlencoded'
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-H 'Cache-Control: no-cache'
-H 'Referer: https://blash.jsp'
-H 'Connection: keep-alive'
--data 'Username=mary&Password=<PASSWORD>&Go=Login&Action=Srh-1-1' --compressed ;
This is my attempt at converting it to a node request promise. When I run it, I get some weird characters back.
var url = 'https://login.url';
var headers = {
'Pragma': 'no-cache',
'Origin': 'https://blah',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.8',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Cache-Control': 'no-cache',
'Referer': 'https://blash.jsp',
'Connection': 'keep-alive',
}
var data = {
'Username':'mary',
'Password':'<Password>',
'Go':'Login',
'Action':'Srh-1-1'
}
var html = yield request.post({url:url,form:data,headers:headers});
Here's an example of the weird characters:
�X}o�6����4\��b�N��%
What am I doing incorrectly?

You need to tell request that you accept compression by setting the gzip option to true.
Be aware that depending on how/where you get the data, you may get the compressed or uncompressed response. Check the documentation for request for details (search for "compr" on the page).

Related

HTTP request - Request module - 302 redirect location containing html escaped character

I'm using request module to send an http post request to a server which respond http 302 with the location redirect url "strangely" encoded:
'https&#58%3B//xxx.xxx.com&#63%3Bsrcext%3Dvalue&amp%3Berl=rrf
When i do the same request in chrome, chrome show me the location redirect URL as:
"https://xxx.xxx.com?srcext=value&erl=rrf"
Curl gives me the same "response" as the request module:
'https&#58%3B//xxx.xxx.com&#63%3Bsrcext%3Dvalue&amp%3Berl=rrf
The request options "followRedirect/followAllRedirects" fail to follow the link, and the curl option -L also fails to follow it.
No problem in chrome
I know I can apply a method "by hand" to decode the URL "properly", but this is tricky because some of the parameters need to stay encoded.
Is there an option in curl or request that I missed which would directly "decode" the URL and make the redirect handling work?
Thanks in advance
=========== EDIT =======
for request i'm using
// POST with browser-like headers. gzip: true decompresses the response
// body and followRedirect/followAllRedirects make request chase 302s.
request({
  method: "POST",
  url: "https://balbalab.com",
  headers: {
    'Cookie': "XXXXX",
    'Accept-Encoding': 'gzip, deflate, fr',
    'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    // Declared once: the original listed Content-Type twice with the
    // same value, and duplicate object keys are silently collapsed in JS.
    "Pragma": "no-cache"
  },
  gzip: true,
  form: formFields,
  followRedirect: true,
  followAllRedirects: true
}); // the original snippet was missing this closing parenthesis
for curl i'm using:
curl -X POST "https://url1.url1.com" \
-H 'Accept-Encoding: gzip, deflate, fr' \
-H 'Accept-Language: fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36' \
-H 'Content-Type: application/x-www-form-urlencoded' \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H "Pragma: no-cache" \
-H "Content-Type: application/x-www-form-urlencoded" \
-d 'xx=ff' \
-L
How do you parse the URL? It seems like you are missing the url-encoded flag in your parser.

Curl gives a response but Python does not, and the request call does not terminate

I am trying this following curl request
curl 'https://www.nseindia.com/api/historical/cm/equity?symbol=COALINDIA&series=\[%22EQ%22\]&from=03-05-2020&to=03-05-2021&csv=true' \
-H 'authority: www.nseindia.com' \
-H 'accept: */*' \
-H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/88.0.4324.182 Safari/537.36' \
-H 'x-requested-with: XMLHttpRequest' \
-H 'sec-gpc: 1' \
-H 'sec-fetch-site: same-origin' \
-H 'sec-fetch-mode: cors' \
-H 'sec-fetch-dest: empty' \
-H 'referer: https://www.nseindia.com/get-quotes/equity?symbol=COALINDIA' \
-H 'accept-language: en-GB,en-US;q=0.9,en;q=0.8' \
-H 'cookie: ak_bmsc=2D5CCD6F330B77016DD02ADFD8BADB8A58DDD69E733C0000451A9060B2DF0E5C~pllIy1yQvFABwPqSfaqwV4quP8uVOfZBlZe9dhyP7+7vCW/YfXy32hQoUm4wxCSxUjj8K67PiZM+8wE7cp0WV5i3oFyw7HRmcg22nLtNY4Wb4xn0qLv0kcirhiGKsq4IO94j8oYTZIzN227I73UKWQBrCSiGOka/toHASjz/R10sX3nxqvmMSBlWvuuHkgKOzrkdvHP1YoLPMw3Cn6OyE/Z2G3oc+mg+DXe8eX1j8b9Hc=; nseQuoteSymbols=[{"symbol":"COALINDIA","identifier":null,"type":"equity"}]; nsit=X5ZCfROTTuLVwZzLBn7OOtf0; AKA_A2=A; bm_mi=6CE0B82205ACE5A1F72250ACDDFF563E~LZ4/HQ257rSMBPCrxy0uSDvrSxj4hHpLQqc8R5JZOzUZYo1OqZg5Q/GOt88XNtMbsWM8bB22vtCXzvksGwPcC/bH2nPFEZr0ci6spQ4GOpCa/TM7soc02HVf0tyDTkmg/ZdLZlWzond4r0vn+QpSB7f3fiVza1Gdx9OaFL1i3rvqe1OKmFONreHEue20PL0hlREVWeLcFM/5DxKArPwzCSopPp62Eea1510iivl7GmY=; nseappid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJhcGkubnNlIiwiYXVkIjoiYXBpLm5zZSIsImlhdCI6MTYyMDA2MTQ5OSwiZXhwIjoxNjIwMDY1MDk5fQ.YBTQ0MqRayD3QBM3V6zUt5zbRRICkbIhWWNedkDYrdU; bm_sv=C49B743B48F174C77F3DDAD188AA6D87~bm5TD36snlaRLx9M5CS+FOUicUcbVV3OIKjZU2WLwd1PtHYUum7hnBfYeUCDv+5Xdb9ADklnmm1cwZGJJbiBstcA6c5vju53C7aTFBorl8SJZjBN/4ku61oz0ncrQYCaSxkFGkRRY9VMWm6SpQwHXfMsUzc/Qk7301zs7KZuGCY=' \
--compressed
This gives us the required response (example below)
"Date ","series ","OPEN ","HIGH ","LOW ","PREV. CLOSE ","ltp ","close ","vwap ","52W H","52W L ","VOLUME ","VALUE ","No of trades "
"03-May-2021","EQ","133.00","133.45","131.20","133.05","132.20","132.20","132.21","163.00","109.55",10262391,"1,356,811,541.80",59409
But if I use the following python script to get the data
import requests

# Browser-like headers copied from the working curl command; the
# nseindia.com API refuses requests that don't carry them (the call
# otherwise hangs / never terminates, as described above).
headers = {
    'authority': 'www.nseindia.com',
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'sec-gpc': '1',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://www.nseindia.com/get-quotes/equity?symbol=COALINDIA',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cookie': 'ak_bmsc=2D5CCD6F330B77016DD02ADFD8BADB8A58DDD69E733C0000451A9060B2DF0E5C~pllIy1yQvFABwPqSfaqwV4quP8uVOfZBlZe9dhyP7+7vCW/YfXy32hQoUm4wxCSxUjj8K67PiZM+8wE7cp0WV5i3oFyw7HRmcg22nLtNY4Wb4xn0qLv0kcirhiGKsq4IO94j8oYTZIzN227I73UKWQBrCSiGOka/toHASjz/R10sX3nxqvmMSBlWvuuHkgKOzrkdvHP1YoLPMw3Cn6OyE/Z2G3oc+mg+DXe8eX1j8b9Hc=; nseQuoteSymbols=[{"symbol":"COALINDIA","identifier":null,"type":"equity"}]; nsit=X5ZCfROTTuLVwZzLBn7OOtf0; AKA_A2=A; bm_mi=6CE0B82205ACE5A1F72250ACDDFF563E~LZ4/HQ257rSMBPCrxy0uSDvrSxj4hHpLQqc8R5JZOzUZYo1OqZg5Q/GOt88XNtMbsWM8bB22vtCXzvksGwPcC/bH2nPFEZr0ci6spQ4GOpCa/TM7soc02HVf0tyDTkmg/ZdLZlWzond4r0vn+QpSB7f3fiVza1Gdx9OaFL1i3rvqe1OKmFONreHEue20PL0hlREVWeLcFM/5DxKArPwzCSopPp62Eea1510iivl7GmY=; nseappid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJhcGkubnNlIiwiYXVkIjoiYXBpLm5zZSIsImlhdCI6MTYyMDA2MTQ5OSwiZXhwIjoxNjIwMDY1MDk5fQ.YBTQ0MqRayD3QBM3V6zUt5zbRRICkbIhWWNedkDYrdU; bm_sv=C49B743B48F174C77F3DDAD188AA6D87~bm5TD36snlaRLx9M5CS+FOUicUcbVV3OIKjZU2WLwd1PtHYUum7hnBfYeUCDv+5Xdb9ADklnmm1cwZGJJbiBstcA6c5vju53C7aTFBorl8SJZjBN/4ku61oz0ncrQYCaSxkFGkRRY9VMWm6SpQwHXfMsUzc/Qk7301zs7KZuGCY=',
}
params = (
    ('symbol', 'COALINDIA'),
    # The backslashes in the curl URL (\[%22EQ%22\]) are shell escapes,
    # not part of the value: the server receives the literal JSON array
    # '["EQ"]'. The previous value '/["EQ"/]' sent forward slashes the
    # API does not understand.
    ('series', '["EQ"]'),
    ('from', '30-04-2021'),
    ('to', '03-05-2021'),
    ('csv', 'true'),
)
response = requests.get('https://www.nseindia.com/api/historical/cm/equity', headers=headers, params=params)
It gets stuck on the last line.
I am using python3.9 and urllib3.
I am not sure what the problem is.
This url downloads a csv file from the website.
You have to jump through some loops with Python to get the file you're after. Mainly, you need to get the request header cookie part right, otherwise you'll keep getting 401 code.
First, you need to get the regular cookies from the authority www.nseindia.com. Then, you need to get the bm_sv cookie from the https://www.nseindia.com/json/quotes/equity-historical.json. Finally, add something that's called nseQuoteSymbols.
Glue all that together and make the request to get the file.
Here's how:
from urllib.parse import urlencode
import requests

# Headers that make the requests look like a browser XHR; without them
# the site keeps answering 401.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/88.0.4324.182 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'referer': 'https://www.nseindia.com/get-quotes/equity?symbol=COALINDIA',
}
# Query-string parameters for the historical-data endpoint.
payload = {
    "symbol": "COALINDIA",
    "series": '["EQ"]',
    "from": "04-04-2021",
    "to": "04-05-2021",
    "csv": "true",
}
api_endpoint = "https://www.nseindia.com/api/historical/cm/equity?"
# Extra cookie the site expects; note it carries its own trailing '; '
# separator for the cookie that follows it.
nseQuoteSymbols = 'nseQuoteSymbols=[{"symbol":"COALINDIA","identifier":null,"type":"equity"}]; '

def make_cookies(cookie_dict: dict) -> str:
    """Serialize a cookie-jar dict into a 'k=v; k2=v2' header value
    (no trailing separator)."""
    return "; ".join(f"{k}={v}" for k, v in cookie_dict.items())

with requests.Session() as connection:
    # Step 1: regular cookies from the authority www.nseindia.com.
    authority = connection.get("https://www.nseindia.com", headers=headers)
    # Step 2: the bm_sv cookie from the equity-historical.json endpoint.
    historical_json = connection.get(
        "https://www.nseindia.com/json/quotes/equity-historical.json",
        headers=headers,
    )
    bm_sv_string = make_cookies(historical_json.cookies.get_dict())
    # A '; ' separator is required between the authority cookies and
    # nseQuoteSymbols: make_cookies() returns no trailing separator, so
    # plain concatenation glued the last authority cookie straight onto
    # 'nseQuoteSymbols=...' and produced a malformed cookie header.
    cookies = make_cookies(authority.cookies.get_dict()) + "; " + nseQuoteSymbols + bm_sv_string
    connection.headers.update({**headers, **{"cookie": cookies}})
    # Step 3: fetch the CSV with the assembled cookie header.
    the_real_slim_shady = connection.get(f"{api_endpoint}{urlencode(payload)}")
    # Filename comes from 'Content-disposition: attachment; filename=...'.
    csv_file = the_real_slim_shady.headers["Content-disposition"].split("=")[-1]
    with open(csv_file, "wb") as f:
        f.write(the_real_slim_shady.content)
Output -> a .csv file that looks like this:

Why do a Postman request and a Scrapy request send me to different web pages?

I wrote this spider, and when I make a request with it, it sends me to a different web page,
even though I used the same parameters and almost the same headers.
def start_requests(self):
    """Yield one search POST per parcel row read from the CSV."""
    from urllib.parse import urlencode  # stdlib; used to form-encode the body
    url = "https://assr.parcelquest.com/Statewide"
    rows = self.getPin('parcels/Parcel.csv')
    for row in rows:
        # Form fields for the statewide search.
        params = {
            'co3': 'MRN',
            # APN must be a string: the bare literal 022-631-01 is a
            # syntax error in Python 3 (and would be subtraction anyway).
            'apn': '022-631-01',
            'recaptchaSuccess': '0',
            'IndexViewModel': 'PQGov.Models.IndexViewModel'
        }
        header = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': '*/*',
            'host': 'assr.parcelquest.com',
            'Referer': 'https://assr.parcelquest.com/Statewide/Index',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive'
        }
        # The Content-Type header declares x-www-form-urlencoded, so the
        # body must be url-encoded as well — json.dumps(params) sent JSON
        # the server could not parse, which is the likely reason the
        # response differed from Postman's.
        yield scrapy.Request(
            url=url,  # use the local url built above; self.url was
                      # presumably unset (the local variable was unused)
            headers=header,
            body=urlencode(params),
            method='POST',
            callback=self.property,
            meta={'parcel': row},
            dont_filter=True,
        )
this is postman:
Could anybody explain to me why?

'requests.get' Why can't I get the correct response?

I tested the API with the curl tool and got the correct response, but the 'requests.get' call does not return it.
This is my curl
curl 'https://xyq.cbg.163.com/equip?s=352&eid=201809300000113-352-DCEBW3UFAURC&view_loc=equip_list' -H 'Connection: keep-alive' -H 'Cache-Control: max-age=0' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'Referer: https://xyq.cbg.163.com/cgi-bin/login.py?next_url=%2Fequip%3Fs%3D352%26eid%3D201809300000113-352-DCEBW3UFAURC%26view_loc%3Dequip_list&server_id=352&act=do_anon_auth' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7' -H 'Cookie: vjuids=5568da96a.1645fa80ea3.0.9bce666ccd158; vjlast=1530613207.1530613207.30; _ntes_nnid=6225af57f6bccb13029779bc5f612dc1,1530613206698; _ntes_nuid=6225af57f6bccb13029779bc5f612dc1; P_INFO=jiangwei1995910#163.com|1538213634|0|mail163|11&10|bej&1538141485&xyq#bej&null#10#0#0|&0|xyq&cbg|jiangwei1995910#163.com; nts_mail_user=jiangwei1995910#163.com:-1:1; mail_psc_fingerprint=bfb14236794fea1c71128deb46355f02; usertrack=CrH3/luvRwWoRWDVAx0mAg==; area_id=52; __session__=1; fingerprint=1264686547; cur_servername=%25E5%25BE%25B7%25E9%2598%25B3%25E6%2596%2587%25E5%25BA%2599; sid=smTmSs6c2gPxqlDfsmXuUsNNJPcba_CNNrv7uvbJ; last_login_serverid=352; wallet_data=%7B%22is_locked%22%3A%20false%2C%20%22checking_balance%22%3A%200%2C%20%22balance%22%3A%200%2C%20%22free_balance%22%3A%200%7D; latest_views=159_2928104-275_1957182-167_1244727-123_2368954-167_1178024-167_1232781-167_1244661-167_1243011-167_1245563-123_2327135-221_1481157-829_181405-173_3263018-352_2811234' --compressed
And this is my code:
import requests

# The headers dict must be defined BEFORE the request and passed via the
# `headers=` parameter. The original snippet called requests.get() first
# and built the dict afterwards without ever using it, so requests sent
# its default headers and the server answered differently than for curl.
headers = {
    'Host': 'xyq.cbg.163.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'https://xyq.cbg.163.com/cgi-bin/login.py?next_url=%2Fequip%3Fs%3D352%26eid%3D201809300000113-352-DCEBW3UFAURC%26view_loc%3Dequip_list&server_id=352&act=do_anon_auth',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
    'Cookie': 'vjuids=5568da96a.1645fa80ea3.0.9bce666ccd158; vjlast=1530613207.1530613207.30; _ntes_nnid=6225af57f6bccb13029779bc5f612dc1,1530613206698; _ntes_nuid=6225af57f6bccb13029779bc5f612dc1; P_INFO=jiangwei1995910#163.com|1538213634|0|mail163|11&10|bej&1538141485&xyq#bej&null#10#0#0|&0|xyq&cbg|jiangwei1995910#163.com; nts_mail_user=jiangwei1995910#163.com:-1:1; mail_psc_fingerprint=bfb14236794fea1c71128deb46355f02; usertrack=CrH3/luvRwWoRWDVAx0mAg==; area_id=52; __session__=1; fingerprint=1264686547; cur_servername=%25E5%25BE%25B7%25E9%2598%25B3%25E6%2596%2587%25E5%25BA%2599; sid=smTmSs6c2gPxqlDfsmXuUsNNJPcba_CNNrv7uvbJ; last_login_serverid=352; wallet_data=%7B%22is_locked%22%3A%20false%2C%20%22checking_balance%22%3A%200%2C%20%22balance%22%3A%200%2C%20%22free_balance%22%3A%200%7D; latest_views=159_2928104-275_1957182-167_1244727-123_2368954-167_1178024-167_1232781-167_1244661-167_1243011-167_1245563-123_2327135-221_1481157-829_181405-173_3263018-352_2811234'
}
res = requests.get(
    "https://xyq.cbg.163.com/equip?s=352&eid=201809300000113-352-DCEBW3UFAURC&view_loc=equip_list",
    headers=headers,
)
res.encoding = 'gb2312'
print(res.text)
Why do I get a different response?
You are defining the headers dictionary but not using it for the request. You should pass it to the get function through the headers named parameter
res=requests.get(your_url, headers=headers)

GET request returns different JSON contents

I am crawling some data using Scrapy. Every time I open the product detail page in a browser and check this request, the request made by the browser always returns the same correct content, without the '?????' characters.
But if I open the request above directly in the browser, it returns the correct content only about 10 times; after that, it returns wrong content with the '?????' characters inserted.
Can you explain why this problem happens, and how to make Scrapy act like a real browser?
This is correct content
{"itemid": 43369300, "liked": false, "offer_count": 6, "videos": [], "image": "41dabd8fe9b7cbc2ab30501592f65a80", "image_list": ["41dabd8fe9b7cbc2ab30501592f65a80", "91bf75885fffd2b1fbcc55099457bc22", "f4516bb9667f8329f031ff75896a71fd", "d2639a1ffe75912873de6d8e011dc0dd", "38d00637b021e1701542a6afa7ae58f3", "10ab99e3bd211bd4dd63993555d6454b"].....
And this is wrong content
{"itemid": 43369300, "liked": false, "offer_count": 10, "videos": [], "rating_star": 4.069458216402549, "image": "41dabd8fe9?????????????????????", "image_list": ["41dabd8fe9?????????????????????", "91bf75885f?????????????????????", "f4516bb966?????????????????????", "d2639a1ffe?????????????????????", "38d00637b0?????????????????????", "10ab99e3bd?????????????????????"].....
You can test with other requests request1, request2,...
The issue may be because you are hitting the API directly and they are preventing scraping. If I hit the below URL using curl and extra headers 10-15 times, it works fine
curl 'https://xxxx.vn/api/v0/shop/6088300/item/43369300/shipping_info_to_address/?state=H%C3%A0%20N%E1%BB%99i&city=Huy%E1%BB%87n%20Ba%20V%C3%AC&district=' \
-H 'Pragma: no-cache' \
-H 'DNT: 1' \
-H 'Accept-Encoding: gzip, deflate, br' \
-H 'Accept-Language: en-US,en;q=0.8' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36' \
-H 'X-API-SOURCE: pc' \
-H 'Accept: */*' \
-H 'Cache-Control: no-cache' \
-H 'X-Requested-With: XMLHttpRequest' \
-H 'Referer: https://xxx.vn/H%E1%BB%99p-%C4%91%E1%BB%B1ng-gi%C3%A0y-trong-su%E1%BB%91t-theo-d%C3%B5i-c%C3%B3-gi%C3%A1-t%E1%BB%91t-i.6088300.43369300' \
--compressed
So I think 4 important headers that you should send are below
'X-Requested-With: XMLHttpRequest'
'X-API-SOURCE: pc'
'Referer: https://xxx.vn/H%E1%BB%99p-%C4%91%E1%BB%B1ng-gi%C3%A0y-trong-su%E1%BB%91t-theo-d%C3%B5i-c%C3%B3-gi%C3%A1-t%E1%BB%91t-i.6088300.43369300'
'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
Send these headers while creating the Request in Scrapy

Resources