Understanding Bearer Authorization for web scraping using python 3.8 and requests - python-3.x

So I am looking to scrape the following site:
https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
What I am running into using the Python Requests library is that the header requires I pass along an Authorization header that bears a token of some kind. While I can get this to work if I manually go to the page, copy and paste it, and then run my program, I am wondering how I could bypass this issue (After all, what is the point in running a scraper if I still have to visit the actual site manually and retrieve the authorization token).
I am newer to authorization/ bearer headers and am hoping someone might be able to clarify how the browser generates a token to retrieve this information/ how I can simulate this. Here is my code:
import requests
import json
import datetime
# Timestamp used to date the printed report.
today = datetime.datetime.today()
# JSON search endpoint that the career site's front end POSTs to.
url = "https://hyland.csod.com/services/x/career-site/v1/search"
# actual site: https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
headers = {
'authority': 'hyland.csod.com',
'origin': 'https://hyland.csod.com',
# NOTE(review): this bearer token was copied from a live browser session; it is
# short-lived, which is why requests start failing with "Unauthorized" after a
# few hours.
'authorization': 'Bearer eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCIsImNsaWQiOiI0bDhnbnFhbGk3NjgifQ.eyJzdWIiOi0xMDMsImF1ZCI6IjRxNTFzeG5oY25yazRhNXB1eXZ1eGh6eCIsImNvcnAiOiJoeWxhbmQiLCJjdWlkIjoxLCJ0emlkIjoxNCwibmJkIjoiMjAxOTEyMzEyMTE0MTU5MzQiLCJleHAiOiIyMDE5MTIzMTIyMTUxNTkzNCIsImlhdCI6IjIwMTkxMjMxMjExNDE1OTM0In0.PlNdWXtb1uNoMuGIhI093ZbheRN_DwENTlkNoVr0j7Zah6JHd5cukudVFnZEiQmgBZ_nlDU4C-9JO_2We380Vg',
'content-type': 'application/json',
'accept': 'application/json; q=1.0, text/*; q=0.8, */*; q=0.1',
'x-requested-with': 'XMLHttpRequest',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
'csod-accept-language': 'en-US',
'referer': 'https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland',
'accept-encoding': 'gzip, deflate, br',
# NOTE(review): the cookie below is likewise copied from one browser session
# and expires together with the token.
'cookie': 'CYBERU_lastculture=en-US; ASP.NET_SessionId=4q51sxnhcnrk4a5puyvuxhzx; cscx=hyland^|-103^|1^|14^|KumB4VhzYXML22MnMxjtTB9SKgHiWW0tFg0HbHnOek4=; c-s=expires=1577909201~access=/clientimg/hyland/*^!/content/hyland/*~md5=78cd5252d2efff6eb77d2e6bf0ce3127',
}
# Two pre-serialized JSON request bodies: page 1 and page 2 of the same search
# (Ohio postings, 25 results per page).
data = ['{"careerSiteId":4,"pageNumber":1,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}',
'{"careerSiteId":4,"pageNumber":2,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}']
def hyland(url, data):
    """POST one search payload to the career-site endpoint and print job titles.

    Parameters
    ----------
    url : str
        The search API endpoint.
    data : str
        A pre-serialized JSON request body (one page of the search).

    Uses the module-level ``headers`` and ``today`` globals.
    """
    dirty = requests.post(url, headers=headers, data=data).text
    if 'Unauthorized' in dirty:
        print(dirty)
        print("There was an error connecting. Check Info")
        # NOTE(review): return early -- the body is not JSON on an auth
        # failure, so json.loads() below would raise.
        return
    clean = json.loads(dirty)
    print("Openings at Hyland Software in Westlake as of {}".format(today.strftime('%m-%d-%Y')))
    # One response holds at most one page of requisitions; 60 is a generous
    # upper bound and IndexError marks the end of the list.
    for i in range(0, 60):
        try:
            print(clean["data"]["requisitions"][i]["displayJobTitle"])
            print("")
            print("")
        # NOTE(review): was a bare `except`, which also swallowed unrelated
        # errors (e.g. a malformed response); narrowed to the expected ones.
        except (IndexError, KeyError):
            print("{} Openings at Hyland".format(i))
            break


for datum in data:
    hyland(url, data=datum)
So basically what my code is doing is sending a post request to the url above along with the headers and necessary data to retrieve what I want. This scraper works for a short period of time, but if I leave and come back after a few hours it no longer works due to authorization (at least that is what I have concluded).
Any help/ clarification on how all this works would be greatly appreciated.

Your code has a few problems:
As you noted you have to get the bearer token
You have to send your requests using requests.session() (as this webpage seems to pay attention to the cookies you send)
Optional: your headers had a lot of unnecessary headers that could be removed
All in all, here below is the working code:
import requests
import json
import datetime
# Timestamp used to date the printed report.
today = datetime.datetime.today()
# A single Session so the cookies set while fetching the landing page are
# automatically re-sent with the API request (the site checks them).
session = requests.session()
url = "https://hyland.csod.com:443/ux/ats/careersite/4/home?c=hyland"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:71.0) Gecko/20100101 Firefox/71.0", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}
# Fetch the landing page HTML, which embeds a fresh bearer token.
raw = session.get(url, headers=headers).text
# Slice out the token value.  NOTE(review): assumes the page embeds exactly
# `"token":"<value>"` with no spaces, so index("token")+8 skips past the 8
# characters of `token":"` -- TODO confirm against the live HTML; a regex
# would be less brittle.
token = raw[raw.index("token")+8:]
token = token[:token.index("\"")]
bearer_token = f"Bearer {token}"
# Switch to the JSON search endpoint for the actual data request.
url = "https://hyland.csod.com/services/x/career-site/v1/search"
# actual site: https://hyland.csod.com/ux/ats/careersite/4/home?c=hyland
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:71.0) Gecko/20100101 Firefox/71.0", "Authorization": bearer_token}
# Two pre-serialized JSON bodies: page 1 and page 2 of the Ohio search.
data = ['{"careerSiteId":4,"pageNumber":1,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}',
'{"careerSiteId":4,"pageNumber":2,"pageSize":25,"cultureId":1,"searchText":"","cultureName":"en-US","states":["oh"],"countryCodes":[],"cities":[],"placeID":"","radius":null,"postingsWithinDays":null,"customFieldCheckboxKeys":[],"customFieldDropdowns":[],"customFieldRadios":[]}']
def hyland(url, data, session=session):
    """POST one search payload through the authenticated session and print titles.

    Parameters
    ----------
    url : str
        The search API endpoint.
    data : str
        A pre-serialized JSON request body (one page of the search).
    session : requests.Session
        Defaults to the module-level session that already holds the site's
        cookies and was used to fetch the bearer token.

    Uses the module-level ``headers`` and ``today`` globals.
    """
    dirty = session.post(url, headers=headers, data=data).text
    if 'Unauthorized' in dirty:
        print(dirty)
        print("There was an error connecting. Check Info")
        # NOTE(review): return early -- the body is not JSON on an auth
        # failure, so json.loads() below would raise.
        return
    clean = json.loads(dirty)
    print("Openings at Hyland Software in Westlake as of {}".format(today.strftime('%m-%d-%Y')))
    # One response holds at most one page of requisitions; 60 is a generous
    # upper bound and IndexError marks the end of the list.
    for i in range(0, 60):
        try:
            print(clean["data"]["requisitions"][i]["displayJobTitle"])
            print("")
            print("")
        # NOTE(review): was a bare `except`; narrowed to the expected errors.
        except (IndexError, KeyError):
            print("{} Openings at Hyland".format(i))
            break


for datum in data:
    hyland(url, data=datum, session=session)
hope this helps

Related

Scraping kenpom data in 2023

I have been scouring across the web to figure out how to use beautifulsoup and pandas to scrape kenpom.com college basketball data. I do not have an account to his website, hence why I am not using the kenpompy library. I have seen some past examples to scrape it from years past, including using the pracpred library (though I have zero experience on it, I'll admit) or using the curlconverter to grab the headers, cookies, and parameters during the requests.get, but now the website seems stingier in terms of grabbing the main table these days. I have used the following code so far:
import requests
from bs4 import BeautifulSoup

# NOTE(review): the original imported `requests` three times; once suffices.

# Page to scrape: the 2023 ratings table.
url = 'https://kenpom.com/index.php?y=2023'

# Browser cookies captured via curlconverter.  They are session-specific and
# expire, so refresh them from a live browser session when requests fail.
cookies = {
    'PHPSESSID': 'f04463ec42584dbd1bf7a480098947d1',
    '_ga': 'GA1.2.120164513.1673124870',
    '_gid': 'GA1.2.622021765.1673496414',
    '__stripe_mid': '71a4117b-cbef-4d3c-b31b-9d18e5c99a33183485',
    '__stripe_sid': '99b77b80-1222-4f7a-b2a8-acf5cf19c7d18a637f',
    'kenpomtry': 'https%3A%2F%2Fkenpom.com%2Fsummary.php%3Fs%3DRankAPL_Off%26y%3D2021',
    '_gat_gtag_UA_11558853_1': '1',
}

# Browser-identical request headers (also from curlconverter).
headers = {
    'authority': 'kenpom.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9',
    # 'cookie': 'PHPSESSID=f04463ec42584dbd1bf7a480098947d1; _ga=GA1.2.120164513.1673124870; _gid=GA1.2.622021765.1673496414; __stripe_mid=71a4117b-cbef-4d3c-b31b-9d18e5c99a33183485; __stripe_sid=99b77b80-1222-4f7a-b2a8-acf5cf19c7d18a637f; kenpomtry=https%3A%2F%2Fkenpom.com%2Fsummary.php%3Fs%3DRankAPL_Off%26y%3D2021; _gat_gtag_UA_11558853_1=1',
    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}

params = {
    'y': '2023',
}

response = requests.get('https://kenpom.com/index.php', params=params, cookies=cookies, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# NOTE(review): dropped the bare `soup` expression from the original -- it
# only echoes the value in a REPL/notebook and is a no-op in a script.
# table = soup.find('table',{'id':'ratings-table'}).tbody
Any suggestions beyond this would be truly appreciated.
Using requests and adding UserAgent as headers (It's pretty messy with multiple hierarchal indexes so will need further parsing to clean completely):
import pandas as pd
import requests
from bs4 import BeautifulSoup

# A browser-like User-Agent is enough for kenpom to serve the page.
headers = {
    "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"
}
url = "https://kenpom.com/index.php?y=2023"

with requests.Session() as request:
    response = request.get(url, timeout=30, headers=headers)
    # NOTE(review): the original did `print(response.raise_for_status())`
    # behind a status check -- raise_for_status() returns None on success and
    # raises HTTPError on 4xx/5xx, so the status check and the print were
    # both redundant; just let it raise.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # read_html returns every table on the page; concat them into one frame
    # (multi-level headers still need further cleaning downstream).
    df = pd.concat(pd.read_html(str(soup)))

why i can't get the result from lagou this web site by using web scraping

I am using Python 3.6.5 and my OS is macOS 10.13.6.
I m learning Web Scraping and I want to catch data from this web site(https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=)
Here is my code:
# encoding: utf-8
import requests
from lxml import etree
def parse_list_page():
    """POST the job-search form to Lagou's AJAX endpoint and print the JSON reply."""
    ajax_url = ('https://www.lagou.com/jobs/positionAjax.json'
                '?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false')
    # Headers mirror what the browser sends from the listing page.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
    }
    # Search form: page 1 of the "python" keyword query.
    form = {
        'first': 'false',
        'pn': 1,
        'kd': 'python',
    }
    reply = requests.post(ajax_url, headers=request_headers, data=form)
    print(reply.json())


def main():
    parse_list_page()


if __name__ == '__main__':
    main()
I appreciate you spending the time to answer my question.
I got the answer, here is the code below:
# encoding: utf-8
import requests
from lxml import etree
import time
def parse_list_page():
    """Scrape Lagou job listings in two steps.

    Lagou's anti-scraping check rejects direct AJAX calls: the JSON endpoint
    only answers when the request carries the cookies issued alongside the
    HTML listing page.  So this GETs the HTML page first, captures its
    cookies, then POSTs the search with those cookies.

    NOTE(review): fixed the Accept-Language typo in the AJAX headers
    ('h-CN' -> 'zh-CN') and removed the commented-out hard-coded cookie dump
    and the intermediate debug prints from the original.
    """
    page_url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3'
    page_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
    }
    # Step 1: fetch the HTML page to be issued fresh anti-scraping cookies.
    response = requests.get(page_url, headers=page_headers)
    cookies = requests.utils.dict_from_cookiejar(response.cookies)

    # Step 2: replay the browser's AJAX request with those cookies.
    ajax_headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
        'Accept-Encoding': 'gzip, deflate, br',
        # NOTE(review): was 'h-CN,zh;q=0.9,en;q=0.8' -- a typo for 'zh-CN'.
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3',
        'Connection': 'keep-alive',
    }
    params = {
        'px': 'default',
        'city': '深圳',
        'needAddtionalResult': 'false'
    }
    data = {
        'first': 'true',
        'pn': 1,
        'kd': 'python',
    }
    url_json = 'https://www.lagou.com/jobs/positionAjax.json'
    response = requests.post(url=url_json, headers=ajax_headers, params=params, cookies=cookies, data=data)
    print(response.json())


def main():
    parse_list_page()


if __name__ == '__main__':
    main()
The reason I couldn't get the JSON response is this site's anti-scraping rule: you need to send the cookies issued with the first request when you make the second one.
So when you send the first request, save its cookies, then reuse them for the second request to the listing API. I hope this helps when you run into the same problem while web scraping.

i want to crawl a website with python,but i meet a trouble . requests library is ok but 400 with Scrapy,the code below

i want to crawl a website with python,but i meet a trouble . requests library is ok but 400 with Scrapy,the code below
import requests

# Target share page on Baidu Pan.
urls = "https://pan.baidu.com/s/1sj1JLJv"
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    'Content-Length': '0',
    # NOTE(review): removed a stray "<br>" HTML artifact that followed this
    # entry in the original paste -- it made the dict literal a syntax error.
    "Connection": "keep-alive"
}
# NOTE(review): the original passed `headers=header` (undefined name); the
# dict above is called `headers`.
print(str((requests.get(urls, headers=headers)).content, 'utf-8'))
from scrapy_redis.spiders import RedisCrawlSpider
# Fragment of a scrapy-redis spider; the "..." placeholders stand for the
# spider's name/redis_key configuration and other elided members.
class baiduuSpider(RedisCrawlSpider):
...
...
...
# NOTE(review): the two statements below are presumably inside a parse/start
# method in the real spider -- `yield` is only valid in a function body, so
# this fragment is not runnable as shown.
urls = "https://pan.baidu.com/s/1sj1JLJv"
yield scrapy.Request(url = urls,headers = headers,callback = self.first_parse)
def first_parse(self, response):
# Dump the raw HTML of the response for inspection.
print(response.body.decode('utf-8'))
How do I fix this?
I'm sorry, but you won't succeed, because the page loads dynamically.
It is necessary to compile javascript on the fly - Selenium, Splash

Python3 requests post correctly but get nothing(but by browser is ok)

When I visit 'https://baike.baidu.com/wikitag/taglist?tagId=75953' on chrome,through fiddler I find the browser sends a post request to 'https://baike.baidu.com//wikitag/api/getlemmas'.
So I'm trying to send a 'POST' request with form data to the url:'https://baike.baidu.com//wikitag/api/getlemmas' and get the JSON data from its 'response' request.
I get all the headers and form data through the Fiddler and try to send the same 'POST' request by python3 using requests package.
But even I send the 'POST' request with the same headers and form data, I get the request(status:200) with an empty body.
the same request I send by 'postman' is also all right, but by python3 I failed anyway.
# -*- coding:UTF-8 -*-
import requests
def disease_json():
    """POST the tag-list query to baike.baidu.com's getlemmas API and print the raw reply.

    Fixes over the original snippet:
    * the ``cookies`` dict was built but never sent with the request -- it is
      now passed to ``requests.post``;
    * the hard-coded ``'Content-Length': '91'`` header was removed: requests
      computes the correct length from the body, and a stale hand-copied
      value corrupts the request.
    """
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    # Session cookies captured from the browser via Fiddler; they expire.
    cookies={
    'BAIDUID':'EEE35ACB030447144E615B191397065B:FG=1;PSTM=1523192637;BIDUPSID=B34DD366905D15BB907C1667346970AE;Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1522304864,1522305101,1523192946,1523253565;PSINO=2;H_PS_PSSID=1990_1438_26082_21 125_22074;BDORZ=B490B5EBF6F3CD402E515D22BCDA1598'
    }
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
    }
    # Form fields copied from the browser's request, as captured in Fiddler.
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagID': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    # NOTE(review): verify=False disables TLS certificate verification --
    # acceptable for local debugging only.
    req = requests.post(url=target, data=forms, verify=False, headers=headers, cookies=cookies)
    print(req.text)


def main():
    disease_json()


if __name__ == '__main__':
    main()
Following is the correct request sent by browser:
Modified content-type and your request payload. Also added method encode_multipart_data for payload transformation to be consistent with multipart-form-data
import sys
import requests
def encode_multipart_data(fields):
    """Serialize *fields* (a dict of form-field name -> string value) into a
    multipart/form-data request body.

    Every field becomes one part delimited by a fixed WebKit-style boundary;
    the body ends with the boundary followed by "--", and all segments are
    joined with CRLF.
    """
    delimiter = '------WebKitFormBoundary7MA4YWxkTrZu0gW'
    segments = [
        piece
        for name, value in fields.items()
        for piece in (
            delimiter,
            'Content-Disposition: form-data; name="%s"\r\n' % name,
            value,
        )
    ]
    segments.append(delimiter + '--')
    return '\r\n'.join(segments)
def disease_json():
    """Send the tag query as a hand-assembled multipart/form-data POST and dump the reply."""
    site = 'https://baike.baidu.com'
    endpoint = site + '/wikitag/api/getlemmas'
    # Browser-identical headers; the content-type boundary must match the one
    # that encode_multipart_data() builds the body with.
    request_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
    }
    # Form fields for the query (page 0, 24 results).
    form_fields = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagId': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    body = encode_multipart_data(form_fields)
    reply = requests.post(url=endpoint, data=body, headers=request_headers)
    print(reply.text)


if __name__ == '__main__':
    disease_json()
This way can also solve the problem.
import requests
import http.cookiejar
import json
# Endpoint and request constants for baike.baidu.com's getlemmas API.
url = "https://baike.baidu.com/wikitag/api/getlemmas"
# NOTE(review): fixed two misspelled field names from the original
# ("filtetTags" -> "filterTags", "contentLegth" -> "contentLength") so the
# server actually receives those parameters.
payload = "limit=24&timeout=3000&filterTags=%5B%5D&tagId=75953&fromLemma=false&contentLength=40&page=0"
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    # NOTE(review): written as one literal -- the original continued the
    # string with a backslash across a line break, which embedded stray
    # whitespace inside the User-Agent value.
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}
def get_cookies():
    """POST the query once, persist the issued cookies to the file "cookie",
    and return the response without following any redirect.

    Uses the module-level ``url``, ``headers`` and ``payload`` globals.
    """
    sess = requests.Session()
    sess.cookies = http.cookiejar.LWPCookieJar("cookie")
    resp = sess.post(
        url,
        headers=headers,
        data=payload,
        allow_redirects=False,
        verify=False,  # TLS verification disabled, as in the original
    )
    sess.cookies.save(ignore_discard=True, ignore_expires=True)
    return resp
def disease_json(times=-1):
# Attempt counter: starts at -1 so the first call prints 0.
times += 1
# First request; get_cookies() also saves the issued cookies to the
# "cookie" file so they can be replayed below.
response = get_cookies()
# A 302 means the server redirects us; reload the saved cookies and re-POST
# the same payload to the redirect target, again without auto-following.
if response.status_code == 302:
session = requests.session()
session.cookies = http.cookiejar.LWPCookieJar(filename='cookie')
session.cookies.load(ignore_discard=True)
url = response.headers['Location']
response = session.post(url, headers=headers, data=payload, allow_redirects=False)
# Parse and print whichever response we ended up with.
# NOTE(review): indentation was lost in this paste -- whether the three lines
# below sit inside the `if` or after it cannot be determined from the source.
json_data = response.text
print(json.loads(json_data))
print(times)
if __name__ == '__main__':
disease_json()

LinkedIn HTTP Error 999 - Request denied

I am writing a simple script to get public profile visible without login on LinkedIn.
Below is my code to get the page for beautifulsoup. I am using public proxies as well.
import urllib.request, urllib.error
from bs4 import BeautifulSoup
# Public company page to fetch without logging in.
url = "https://www.linkedin.com/company/amazon"
# NOTE(review): the original passed an undefined name `proxy` straight into
# ProxyHandler (and reused the same name for the handler).  Define the proxy
# address first -- substitute a live public proxy here.
proxy = 'http://<proxy-host>:<port>'  # TODO: set to a working proxy
proxy_handler = urllib.request.ProxyHandler({'https': proxy})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)
# Browser-like headers so the request does not look like a bare bot.
hdr = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3218.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,hi;q=0.8',
    'Connection': 'keep-alive'}
req = urllib.request.Request(url, headers=hdr)
page = urllib.request.urlopen(req, timeout=20)
# NOTE(review): the original assigned to `self.soup`, which only works inside
# a class method; use a plain local in this standalone snippet.
soup = BeautifulSoup(page.read(), "lxml")
But it is raising "HTTPError 999 - request Denied" error. This is only for testing purpose till I am getting access via partnership program.
What am I doing wrong? Please help.
You did not do anything wrong, LinkedIn blacklist cloud servers ip addresses to prevent "stealing" their data. Questionable practice but this is how it is.

Resources