Python 3 requests POSTs correctly but gets nothing back (works fine in the browser) - python-3.x

When I visit 'https://baike.baidu.com/wikitag/taglist?tagId=75953' in Chrome, Fiddler shows that the browser sends a POST request to 'https://baike.baidu.com//wikitag/api/getlemmas'.
So I'm trying to send a POST request with form data to the URL 'https://baike.baidu.com//wikitag/api/getlemmas' and get the JSON data from its response.
I captured all the headers and form data with Fiddler and tried to send the same POST request from Python 3 using the requests package.
But even though I send the POST request with the same headers and form data, I get a response (status 200) with an empty body.
The same request sent through Postman works fine; it is only from Python 3 that it fails.
# -*- coding:UTF-8 -*-
import requests

def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    cookies = {
        'BAIDUID': 'EEE35ACB030447144E615B191397065B:FG=1;PSTM=1523192637;BIDUPSID=B34DD366905D15BB907C1667346970AE;Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1522304864,1522305101,1523192946,1523253565;PSINO=2;H_PS_PSSID=1990_1438_26082_21 125_22074;BDORZ=B490B5EBF6F3CD402E515D22BCDA1598'
    }
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Length': '91',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
    }
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagID': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    req = requests.post(url=target, data=forms, verify=False, headers=headers)
    print(req.text)
    """
    html = json.loads(req.text)
    for each in html['lemmaList']:
        print('lemmaCroppedTitle:', each['lemmaCroppedTitle'])
    print(req.text)
    """

def main():
    disease_json()

if __name__ == '__main__':
    main()
The following is the correct request sent by the browser:

I modified the Content-Type header and your request payload, and added an encode_multipart_data method that transforms the payload so it is consistent with multipart/form-data:
import sys
import requests

def encode_multipart_data(fields):
    boundary = '------WebKitFormBoundary7MA4YWxkTrZu0gW'
    CRLF = '\r\n'
    L = []
    for key, value in fields.items():
        L.append(boundary)
        L.append('Content-Disposition: form-data; name="%s"\r\n' % key)
        L.append(value)
    L.append(boundary + "--")
    body = CRLF.join(L)
    return body

def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # changed content-type
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com'
    }
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagId': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    payload = encode_multipart_data(forms)
    resp = requests.post(url=target, data=payload, headers=headers)
    print(resp.text)

if __name__ == '__main__':
    disease_json()
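For what it's worth, requests can also build a multipart/form-data body on its own if the fields are passed through the files argument, which avoids hand-rolling the boundary. A minimal sketch of the same request under that assumption (reusing the form fields above, without the extra browser headers):

import requests

forms = {
    'limit': '24',
    'timeout': '3000',
    'filterTags': '[]',
    'tagId': '75953',
    'fromLemma': 'false',
    'contentLength': '40',
    'page': '0',
}

# Passing each field as a (filename, value) tuple with filename=None makes requests
# encode the body as multipart/form-data and set the Content-Type boundary itself.
files = {key: (None, value) for key, value in forms.items()}
resp = requests.post('https://baike.baidu.com/wikitag/api/getlemmas', files=files)
print(resp.status_code, len(resp.text))

Whether the Referer, User-Agent, and other browser headers are still required depends on the server; they can be supplied through the headers argument exactly as in the code above.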

The following approach also solves the problem.
import requests
import http.cookiejar
import json

url = "https://baike.baidu.com/wikitag/api/getlemmas"
payload = "limit=24&timeout=3000&filterTags=%5B%5D&tagId=75953&fromLemma=false&contentLength=40&page=0"
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

def get_cookies():
    session = requests.Session()
    session.cookies = http.cookiejar.LWPCookieJar("cookie")
    response = session.post(url, headers=headers, data=payload, allow_redirects=False, verify=False)
    session.cookies.save(ignore_discard=True, ignore_expires=True)
    return response

def disease_json(times=-1):
    times += 1
    response = get_cookies()
    if response.status_code == 302:
        session = requests.session()
        session.cookies = http.cookiejar.LWPCookieJar(filename='cookie')
        session.cookies.load(ignore_discard=True)
        url = response.headers['Location']
        response = session.post(url, headers=headers, data=payload, allow_redirects=False)
    json_data = response.text
    print(json.loads(json_data))
    print(times)

if __name__ == '__main__':
    disease_json()
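As a design note, the manual 302 handling above exists because the cookies set on the first response have to accompany the redirected request. A requests.Session keeps cookies across requests and follows redirects automatically, so the same idea can be sketched more compactly (an untested simplification, not part of the original answer):

import requests

url = "https://baike.baidu.com/wikitag/api/getlemmas"
payload = "limit=24&timeout=3000&filterTags=%5B%5D&tagId=75953&fromLemma=false&contentLength=40&page=0"
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

session = requests.Session()
# The session stores any cookies set by the redirecting response and resends them
# when it follows the redirect, which is what the manual 302 branch does by hand.
response = session.post(url, headers=headers, data=payload)
print(response.json())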

Related

Response provided for one page (200) but not for the other (401)

On the same website, a valid response is consistently provided for one page (200, url/nifty) but not for another page (401, url/oispurtscontracts).
The failing page does provide a valid response sometimes and returns a 401 error at other times.
The browser cache has been cleared and the page reloaded.
Please provide a solution.
Error:
response.status_code = 401 for https://www.nseindia.com/api/live-analysis-oi-spurts-contracts
Code:
import requests
import time

def connRequest(url, headers):
    session = requests.Session()
    request = session.get(url, headers=headers)
    cookies = dict(request.cookies)
    # print(cookies)
    print(f"response.status_code = {request.status_code} for {url}")
    response = session.get(url, headers=headers, cookies=cookies).json()
    print(f"response = {response}")
    return response

# Working - Response Provided
def nifty_Working():
    url = 'https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY'
    # data = requests.get(url)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.5',
        'Accept': 'application/json'
    }
    response = connRequest(url, headers)

# 401 Error
def oiSpurtsContracts_NotWorking():
    url = 'https://www.nseindia.com/api/live-analysis-oi-spurts-contracts'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en',
        'Accept': 'application/json'
    }
    response = connRequest(url, headers)

def main():
    # Working - Response Provided
    nifty_Working()
    print()
    print()
    print()
    print()
    # 401 Error
    time.sleep(1)
    oiSpurtsContracts_NotWorking()

main()

Scraping values from View Source using Requests Python 3

The code below works fine, but when I change the URL to another site it doesn't work.
import requests
import re
url = "https://www.autotrader.ca/a/ram/1500/hamilton/ontario/19_12052335_/?showcpo=ShowCpo&ncse=no&ursrc=pl&urp=2&urm=8&sprx=-2"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get(url, headers=headers)
phone_number = re.findall('"phoneNumber":"([\d-]+)"', response.text)
print(phone_number)
['905-870-7127']
The code below doesn't work; it gives the output [] shown after it. Please tell me what I am doing wrong.
import requests
import re

urls = "https://www.kijijiautos.ca/vip/22686710/", "https://www.kijijiautos.ca/vip/22686710/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
for url in urls:
    response = requests.get(url, headers=headers)
    number = re.findall('"number":"([\d-]+)"', response.text)
    print(number)
[]
I think you are not getting an HTTP 200 OK success status in the response, which is why you are unable to get the expected output. To get the HTTP 200 OK success status, I changed the headers after inspecting the site's HTTP requests.
Please try this:
import requests
import re

headers = {
    'authority': 'www.kijijiautos.ca',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'pragma': 'no-cache',
    'accept-language': 'en-CA',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'content-type': 'application/json',
    'accept': 'application/json',
    'cache-control': 'no-cache',
    'x-client-id': 'c89e7ff8-1d5a-4c2b-a095-c08dc08ccd3b',
    'x-client': 'ca.move.web.app',
    'sec-ch-ua-platform': '"Linux"',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://www.kijijiautos.ca/cars/hyundai/sonata/used/',
    'cookie': 'mvcid=c89e7ff8-1d5a-4c2b-a095-c08dc08ccd3b; locale=en-CA; trty=e; _gcl_au=1.1.1363596757.1633936124; _ga=GA1.2.1193080228.1633936126; _gid=GA1.2.71842091.1633936126; AAMC_kijiji_0=REGION%7C3; aam_uuid=43389576784435124231935699643302941454; _fbp=fb.1.1633936286669.1508597061; __gads=ID=bb71a6fc168c1c33:T=1633936286:S=ALNI_MZk3lgy-9xgSGLPnfrkBET60uS6fA; GCLB=COyIgrWs-PWPsQE; lux_uid=163402080128473094; cto_bundle=zxCnjF95NFglMkZrTG5EZ2dzNHFSdjJ6QSUyQkJvM1BUbk5WTkpjTms0aWdZb3RxZUR3Qk1nd1BmcSUyQjZaZVFUdFdSelpua3pKQjFhTFk0U2ViTHVZbVg5ODVBNGJkZ2NqUGg1cHZJN3V0MWlwRkQwb1htcm5nNDlqJTJGUUN3bmt6ZFkzU1J0bjMzMyUyRkt5aGVqWTJ5RVJCa2ZJQUwxcFJnJTNEJTNE; _td=7f855061-c320-4570-b2d2-73c94bd22b13; rbzid=54THgSkyCRKwhVBqy+iHmjb1RG+uE6uH1XjpsXIazO5nO45GtpIXHGYii/PbJcdG3ahjIgKaBrjh0Yx2J6YCOLHEv3QYL559oz3jQaVrssH2/1Ui9buvIpuCwBOGG2xXGWW2qvcU5807PGsdubQDUvLkxmy4sor+4EzCI1OoUHMOG2asQwsgChqwzJixVvrE21E/NJdRfDLlejb5WeGEgU4B3dOYH95yYf5h+7fxV6H/XLhqbNa8e41DM3scfyeYWeqWCWmOH2VWZ7i3oQ0OXW1SkobLy0D6G+V9J5QMxb0=; rbzsessionid=ca53a07d3404ca93b3f8bc879291dc83; _uetsid=131a47702a6211ecba407d9ff6588dde; _uetvid=131b13602a6211ecacd0b56b0815e9b2',
}
response = requests.get('https://www.kijijiautos.ca/consumer/svc/a/22686710', headers=headers)
if response.status_code == 200:
    # print(response.text)
    numbers = re.findall(r'"number":"\+\d+"', response.text)  # number one or more
    print(numbers[0])
else:
    print('status code is ', response.status_code)
output
# "number":"+17169905088"

Response with Scrapy shows wrong or unexpected data

I am scraping a page, but when I request the link that should return all the information, the response says the data does not exist. When I check the JSON in the Firefox inspector, however, the response contains all the information. I have manipulated the headers, but I still have not managed to get the data to show.
my code:
settings.py:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False
mi_spider.py:
from scrapy import Spider
from scrapy.http import Request
from json import loads, dump

N_categoria = 0
API_key = 'P1MfFHfQMOtL16Zpg36NcntJYCLFm8FqFfudnavl'

class MetrocScrapingSpider(Spider):
    name = 'metroc_scraping'
    allowed_domains = ['metrocuadrado.com']
    start_urls = ['https://www.metrocuadrado.com/']

    def parse(self, response):
        print()
        print('Entra aca 1')
        print()
        aptos_links = response.xpath('//*[@class= "box-list"]')[N_categoria].xpath('.//li//a/@href').extract()
        data_links = []
        for url in aptos_links:
            items = {}
            url = url.split('.com')[-1].split('/')
            for ind, info in enumerate(url):
                if info == '':
                    url.pop(ind)
            items['inmu_'] = url[0]
            items['type_'] = url[1]
            items['loc_'] = url[-1]
            data_links.append(items)
        n_cat = 1
        yield Request(url=response.url,
                      callback=self.first_parse,
                      meta={'data_links': data_links,
                            'n_cat': n_cat,
                            'aptos_links': aptos_links},
                      dont_filter=True)

    def first_parse(self, response):
        data_links = response.meta['data_links']
        n_cat = response.meta['n_cat']
        aptos_links = response.meta['aptos_links']
        n_from = 0
        cat_linl = aptos_links[n_cat]
        data_link = data_links[n_cat]
        print(data_link)
        inmu_ = data_link['inmu_']
        type_ = data_link['type_']
        loc_ = data_link['loc_']
        api_link = 'https://www.metrocuadrado.com/rest-search/search?realEstateTypeList='+inmu_+'&realEstateBusinessList='+type_+'&city='+loc_+'&from='
        yield Request(url=api_link + str(n_from) + '&size=50',
                      callback=self.main_parse,
                      meta={'data_links': data_links,
                            'n_cat': n_cat,
                            'n_from': n_from,
                            'api_link': api_link},
                      dont_filter=True,
                      headers={'Accept': 'application/json, text/plain, */*',
                               'Accept-Encoding': 'gzip, deflate, br',
                               'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
                               'Connection': 'keep-alive',
                               'DNT': '1',
                               'Host': 'www.metrocuadrado.com',
                               'Upgrade-Insecure-Requests': '1',
                               'Referer': cat_linl,
                               'Pragma': 'no-cache',
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
                               'X-Api-Key': API_key,
                               'X-Requested-With': 'XMLHttpRequest'})

    def main_parse(self, response):
        print()
        print(response.url)
        print()
        print(response.status)
        print()
        jsonresponse = loads(response.text)
        print(jsonresponse)
Below the link and the response status is the JSON response. As you can see, "totalHits" is 0, "totalEntries" is 0 too, and results is empty. But in the Firefox inspector (screenshots of the request headers and of part of the response not reproduced here), "totalHits" is 3135 and "totalEntries" is 3135.
I don't know why this happens. Any help, please?

Understanding Bearer Authorization for web scraping using python 3.8 and requests

So I am looking to scrape the following site:
https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
What I am running into with the Python requests library is that the request requires an Authorization header carrying a token of some kind. While I can get this to work if I manually go to the page, copy the token, paste it in, and then run my program, I am wondering how I can avoid that step (after all, what is the point of running a scraper if I still have to visit the actual site manually to retrieve the authorization token?).
I am new to authorization and bearer headers and am hoping someone can clarify how the browser generates this token and how I can simulate it. Here is my code:
import requests
import json
import datetime

today = datetime.datetime.today()

url = "https://hyland.csod.com/services/x/career-site/v1/search"
# actual site: https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
headers = {
    'authority': 'hyland.csod.com',
    'origin': 'https://hyland.csod.com',
    'authorization': 'Bearer eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCIsImNsaWQiOiI0bDhnbnFhbGk3NjgifQ.eyJzdWIiOi0xMDMsImF1ZCI6IjRxNTFzeG5oY25yazRhNXB1eXZ1eGh6eCIsImNvcnAiOiJoeWxhbmQiLCJjdWlkIjoxLCJ0emlkIjoxNCwibmJkIjoiMjAxOTEyMzEyMTE0MTU5MzQiLCJleHAiOiIyMDE5MTIzMTIyMTUxNTkzNCIsImlhdCI6IjIwMTkxMjMxMjExNDE1OTM0In0.PlNdWXtb1uNoMuGIhI093ZbheRN_DwENTlkNoVr0j7Zah6JHd5cukudVFnZEiQmgBZ_nlDU4C-9JO_2We380Vg',
    'content-type': 'application/json',
    'accept': 'application/json; q=1.0, text/*; q=0.8, */*; q=0.1',
    'x-requested-with': 'XMLHttpRequest',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'csod-accept-language': 'en-US',
    'referer': 'https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland',
    'accept-encoding': 'gzip, deflate, br',
    'cookie': 'CYBERU_lastculture=en-US; ASP.NET_SessionId=4q51sxnhcnrk4a5puyvuxhzx; cscx=hyland^|-103^|1^|14^|KumB4VhzYXML22MnMxjtTB9SKgHiWW0tFg0HbHnOek4=; c-s=expires=1577909201~access=/clientimg/hyland/*^!/content/hyland/*~md5=78cd5252d2efff6eb77d2e6bf0ce3127',
}
data = ['{"careerSiteId":4,"pageNumber":1,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}',
        '{"careerSiteId":4,"pageNumber":2,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}']

def hyland(url, data):
    # for openings in data:
    dirty = requests.post(url, headers=headers, data=data).text
    if 'Unauthorized' in dirty:
        print(dirty)
        print("There was an error connecting. Check Info")
    # print(dirty)
    clean = json.loads(dirty)
    cleaner = json.dumps(clean, indent=4)
    print("Openings at Hyland Software in Westlake as of {}".format(today.strftime('%m-%d-%Y')))
    for i in range(0, 60):
        try:
            print(clean["data"]["requisitions"][i]["displayJobTitle"])
            print("")
            print("")
        except:
            print("{} Openings at Hyland".format(i))
            break

for datum in data:
    hyland(url, data=datum)
So basically my code sends a POST request to the URL above along with the headers and the data needed to retrieve what I want. The scraper works for a short period of time, but if I leave and come back after a few hours it no longer works, which I have concluded is due to authorization.
Any help or clarification on how all this works would be greatly appreciated.
Your code has a few problems:
As you noted, you have to get the bearer token first.
You have to send your requests using requests.session() (this webpage seems to pay attention to the cookies you send).
Optional: your headers included a lot of unnecessary entries that could be removed.
All in all, below is the working code:
import requests
import json
import datetime

today = datetime.datetime.today()
session = requests.session()

url = "https://hyland.csod.com:443/ux/ats/careersite/4/home?c=hyland"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:71.0) Gecko/20100101 Firefox/71.0", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}
raw = session.get(url, headers=headers).text
token = raw[raw.index("token")+8:]
token = token[:token.index("\"")]
bearer_token = f"Bearer {token}"

url = "https://hyland.csod.com/services/x/career-site/v1/search"
# actual site: https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:71.0) Gecko/20100101 Firefox/71.0", "Authorization": bearer_token}
data = ['{"careerSiteId":4,"pageNumber":1,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}',
        '{"careerSiteId":4,"pageNumber":2,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}']

def hyland(url, data, session=session):
    # for openings in data:
    dirty = session.post(url, headers=headers, data=data).text
    if 'Unauthorized' in dirty:
        print(dirty)
        print("There was an error connecting. Check Info")
    # print(dirty)
    clean = json.loads(dirty)
    cleaner = json.dumps(clean, indent=4)
    print("Openings at Hyland Software in Westlake as of {}".format(today.strftime('%m-%d-%Y')))
    for i in range(0, 60):
        try:
            print(clean["data"]["requisitions"][i]["displayJobTitle"])
            print("")
            print("")
        except:
            print("{} Openings at Hyland".format(i))
            break

for datum in data:
    hyland(url, data=datum, session=session)
Hope this helps.
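If the scraper has to run longer than one token's lifetime, the same extraction can be wrapped in a helper and re-run whenever a request comes back unauthorized. A sketch along those lines (the helper name is illustrative; it simply repeats the token-scraping logic above and relies on the same assumption about where "token" appears in the page HTML):

def refresh_bearer_token(session, headers):
    # Re-fetch the career-site page and pull a fresh token out of its HTML,
    # exactly as done above; returns the value for the Authorization header.
    page = session.get("https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland",
                       headers=headers).text
    token = page[page.index("token") + 8:]
    return "Bearer " + token[:token.index('"')]

# Possible use: when 'Unauthorized' shows up in a response, refresh and retry once, e.g.
# headers["Authorization"] = refresh_bearer_token(session, headers)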

Why can't I get the result from the Lagou website using web scraping?

I'm using Python 3.6.5 and my OS is macOS 10.13.6.
I'm learning web scraping and I want to fetch data from this website (https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=).
Here is my code:
# encoding: utf-8
import requests
from lxml import etree

def parse_list_page():
    url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'first': 'false',
        'pn': 1,
        'kd': 'python',
    }
    response = requests.post(url, headers=headers, data=data)
    print(response.json())

def main():
    parse_list_page()

if __name__ == '__main__':
    main()
I appreciate you spending the time to answer my question.
I found the answer; here is the code:
# encoding: utf-8
import requests
from lxml import etree
import time

def parse_list_page():
    url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
    }
    response = requests.get(url, headers=headers)
    # print(response.text)
    r = requests.utils.dict_from_cookiejar(response.cookies)
    print(r)
    print('='*30)
    # r['LGUID'] = r['LGRID']
    # r['user_trace_token'] = r['LGRID']
    # r['LGSID'] = r['LGRID']
    cookies = {
        # 'X_MIDDLE_TOKEN': 'df7c1d3cfdf279f0caf13df990723620',
        # 'JSESSIONID': 'ABAAABAAAIAACBI29FE9BDFB6838D8DD69C580E517292C9',
        # '_ga': 'GA1.2.820168368.1551196380',
        # '_gat': '1',
        # 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1551196381',
        # 'user_trace_token': '20190226235303-99bc357a-39de-11e9-921f-525400f775ce',
        # 'LGSID': '20190311094827-c3bc2393-439f-11e9-a15a-525400f775ce',
        # 'PRE_UTM': '',
        # 'PRE_HOST': '',
        # 'PRE_SITE': '',
        # 'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
        # 'LGUID': '20190226235303-99bc3944-39de-11e9-921f-525400f775ce',
        # '_gid': 'GA1.2.1391680888.1552248111',
        # 'index_location_city': '%E6%B7%B1%E5%9C%B3',
        # 'TG-TRACK-CODE': 'index_search',
        # 'LGRID': '20190311100452-0ed0525c-43a2-11e9-9113-5254005c3644',
        # 'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1552269893',
        # 'SEARCH_ID': 'aae3c38ec76545fc86cd4e23153afe44',
    }
    cookies.update(r)
    print(r)
    print('=' * 30)
    print(cookies)
    print('=' * 30)
    headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3',
        'Connection': 'keep-alive',
    }
    params = {
        'px': 'default',
        'city': '深圳',
        'needAddtionalResult': 'false'
    }
    data = {
        'first': 'true',
        'pn': 1,
        'kd': 'python',
    }
    url_json = 'https://www.lagou.com/jobs/positionAjax.json'
    response = requests.post(url=url_json, headers=headers, params=params, cookies=cookies, data=data)
    print(response.json())

def main():
    parse_list_page()

if __name__ == '__main__':
    main()
The reason I couldn't get the JSON response is the site's anti-scraping rule: you have to use the cookies from the first request when you send the second one.
So when you send the first request you need to save its cookies, then reuse (update) them for the second, page-data request. I hope this helps you when you run into this problem while web scraping.
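If it helps, the cookie hand-off described above can also be expressed with a single requests.Session, which stores the cookies from the first GET and sends them with the follow-up POST automatically. A rough, untested sketch (the function name and parameters are illustrative; the URLs, headers, params and data are the ones defined in parse_list_page above):

import requests

def fetch_lagou_json(list_url, json_url, html_headers, json_headers, params, data):
    session = requests.Session()
    # First request: load the listing page so the session picks up the anti-scraping cookies.
    session.get(list_url, headers=html_headers)
    # Second request: the session resends those cookies with the JSON API call.
    response = session.post(json_url, headers=json_headers, params=params, data=data)
    return response.json()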
