I am sending a POST request to filter by the IDs and then parse the output. I need to get all of the content below the colspan selector (the DESIRED OUTPUT is shown at the end). Under colspan="4" there are many b and table tags, and it contains tbody > tr > td, but my script returns only the b tags' content.
The URL: https://e-mehkeme.gov.az/Public/Cases
import requests
from bs4 import BeautifulSoup as bs

request_headers = {
    'authority': 'e-mehkeme.gov.az',
    'method': 'POST',
    'path': '/Public/Cases',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en,en-GB;q=0.9',
    'cache-control': 'max-age=0',
    'content-length': '66',
    'content-type': 'application/x-www-form-urlencoded',
    'origin': 'https://e-mehkeme.gov.az',
    'referer': 'https://e-mehkeme.gov.az/Public/Cases',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/75.0.3770.142 Safari/537.36',
}

voens = {'1303450301',
         '1700393071',
         '2002283071',
         }

form_data = {
    'CourtId': '',
    'CaseNo': '',
    'DocFin': '',
    'DocSeries': '',
    'DocNumber': '',
    'VOEN': voens,
    'button': 'Search',
}

url = 'https://e-mehkeme.gov.az/Public/Cases?courtid='
response = requests.post(url, data=form_data, headers=request_headers)
s = bs(response.content, 'lxml')

# PRINT THE HEADERS!
sHeader = s.findAll('tr', {'class': 'centeredheader'})[0]
headers = [sHeader.get_text().strip()]
print(headers)

# PRINT THE CONTENTS OF EACH SEARCH!
for voen in voens:
    form_data['VOEN'] = voen
    idData = [string for string in s.select("td", colspan_="4")]
    print(idData)
DESIRED OUTPUT:
Ətraflı məlumat:
İşə baxan hakim və ya tərkib
Abiddin Hüseynov - sədrlik edən hakim
Azad İmanov - tərkib üzvü
Vahid Sadıqov - tərkib üzvü
Tərəflər
Cavabdeh: ƏLİYEV HAFİZ RAMİZ
İddiaçı: "OPTİMAL ELEKTRONİKA" MƏHDUD MƏSULİYYƏTLİ CƏMİYYƏTİ
İşin mahiyyəti
Müqavilələrdən əmələ gələn öhdəliklər üzrə mübahisələr
You need to make a POST request with each voen value in a loop, extract the case ids from each result, and then make a new request for each id:
import requests
import re
from bs4 import BeautifulSoup as bs

data = {'VOEN': '', 'button': 'Search'}
voens = ['1303450301', '1700393071', '2002283071']

for voen in voens:
    data['VOEN'] = voen
    r = requests.post('https://e-mehkeme.gov.az/Public/Cases', data=data)
    soup = bs(r.text, 'lxml')
    ids = [i['value'] for i in soup.select('.casedetail')]
    for i in ids:
        r = requests.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i}')
        soup = bs(r.content, 'lxml')
        print([re.sub(r'\r|\n', '', i.text.strip()) for i in soup.select('[colspan="4"]')])
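If you want to keep each case's fields together instead of printing flat lists, you can collect one row per case and write them out. A minimal sketch building on the loop above (the CSV filename and column layout are illustrative, not part of the original answer):

import csv
import re
import requests
from bs4 import BeautifulSoup as bs

rows = []
for voen in ['1303450301', '1700393071', '2002283071']:
    r = requests.post('https://e-mehkeme.gov.az/Public/Cases',
                      data={'VOEN': voen, 'button': 'Search'})
    soup = bs(r.text, 'lxml')
    for case_id in [i['value'] for i in soup.select('.casedetail')]:
        detail = requests.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={case_id}')
        cells = [re.sub(r'[\r\n]+', ' ', td.get_text(strip=True))
                 for td in bs(detail.content, 'lxml').select('[colspan="4"]')]
        rows.append([voen, case_id] + cells)

# one row per case; the extra columns hold whatever the detail page returned
with open('cases.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(rows)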
Related
I need to get an element from this website: https://channelstore.roku.com/en-gb/details/38e7b84fe064cf927ad471ed632cc3d8/vlrpdd2
I tried this code:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://channelstore.roku.com/en-gb/details/38e7b84fe064cf927ad471ed632cc3d8/vlrpdd2')
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
I got a document back, but I only see metadata, without the result I expected.
If you inspect your browser's network calls (press F12 and open the Network tab), you'll see that the data is loaded dynamically from:
https://channelstore.roku.com/api/v6/channels/detailsunion/38e7b84fe064cf927ad471ed632cc3d8
So, to get the same data, you can send a GET request directly to that URL. Note that there's no need for BeautifulSoup:
import requests
headers = {
    'authority': 'channelstore.roku.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': '_csrf=ZaFYG2W7HQA4xqKW3SUfuta0; ks.locale=j%3A%7B%22language%22%3A%22en%22%2C%22country%22%3A%22GB%22%7D; _usn=c2062c71-f89e-456f-9374-7c7767afc665; _uc=54c11aa8-597e-4155-bfa1-379add00fc85%3Aa044adeb2f02798a3c8335d874d49562; _ga=GA1.3.760826055.1671811832; _gid=GA1.3.2471563.1671811832; _ga=GA1.1.760826055.1671811832; _cs_c=0; roku_test; AWSELB=0DC9CDB91658555B919B869A2ED9157DFA13B446022D0100EBAAD7261A39D5A536AC0223E5570FAECF0099832FA9F5DB8028018FCCD9D0A49D8F2BDA087916BC1E51F73D1E; AWSELBCORS=0DC9CDB91658555B919B869A2ED9157DFA13B446022D0100EBAAD7261A39D5A536AC0223E5570FAECF0099832FA9F5DB8028018FCCD9D0A49D8F2BDA087916BC1E51F73D1E; _gat_UA-678051-1=1; _ga_ZZXW5ZLMQ5=GS1.1.1671811832.1.1.1671812598.0.0.0; _cs_id=8a1f1aec-e083-a585-e054-158b6714ab4a.1671811832.1.1671812598.1671811832.1.1705975832137; _cs_s=10.5.0.1671814398585',
    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}
params = {
    'country': 'GB',
    'language': 'en',
}
response = requests.get(
    'https://channelstore.roku.com/api/v6/channels/detailsunion/38e7b84fe064cf927ad471ed632cc3d8',
    params=params,
    headers=headers,
).json()
# Uncomment to print all the data
# from pprint import pprint
# pprint(response)
print(response.get("feedChannel").get("name"))
print("rating: ", response.get("feedChannel").get("starRatingCount"))
Prints:
The Silver Collection Comedies
rating: 3
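In practice, the full browser header set above is probably more than this endpoint needs; a trimmed-down sketch that may be enough (whether the API accepts requests without the session cookie is an assumption worth verifying):

import requests

url = ('https://channelstore.roku.com/api/v6/channels/detailsunion/'
       '38e7b84fe064cf927ad471ed632cc3d8')
params = {'country': 'GB', 'language': 'en'}
# a user-agent alone is often sufficient for public JSON endpoints
headers = {'user-agent': 'Mozilla/5.0'}

data = requests.get(url, params=params, headers=headers).json()
print(data.get('feedChannel', {}).get('name'))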
You will need to use Selenium with Python, as the page's JavaScript has to be executed to load the content.
BeautifulSoup can only really handle static HTML.
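For completeness, a minimal Selenium sketch of that idea (the CSS selector is a placeholder; pick it from the element you actually need on the rendered page):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://channelstore.roku.com/en-gb/details/'
           '38e7b84fe064cf927ad471ed632cc3d8/vlrpdd2')
# wait until the JavaScript-rendered element is present, then read it
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'h1'))  # placeholder selector
)
print(element.text)
driver.quit()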
For those reading my thread, I'd like to thank you in advance for your assistance, and I'd also ask for a bit of leniency with any incorrect terminology, as I am still a newbie.
I've been trying to retrieve stock codes from the KRX website, as I could not find any other resource for the information I need. I tried to use the requests library in Python, but the data I need is loaded asynchronously, which made it inaccessible from the initial page.
The problem is that retrieving the information requires two requests to an endpoint: the first retrieves a code that is used in the body of the second request. But when I make the second request, it returns an empty list.
I managed to locate the API calls which retrieve the stock codes (screenshot of the two network requests omitted).
To my knowledge, it requires two API calls: the first retrieves a code, which acts as an access token for the second request that returns the stock codes I am trying to retrieve.
I've managed to retrieve the code from the first request with the following code:
import requests

url = 'https://global.krx.co.kr/contents/COM/GenerateOTP.jspx'

headers = {
    'Cookie': 'SCOUTER=x22rkf7ltsmr7l; __utma=88009422.986813715.1652669493.1652669493.1652669493.1; SCOUTER=z6pj0p85muce99; JSESSIONID=bOnAJtLWSpK1BiCuhWD0ldj1TqW5z6wEcn65oVgtyie841OlbdJs3fEHpUs1QtAV.bWRjX2RvbWFpbi9tZGNvd2FwMS1tZGNhcHAwMQ==; JSESSIONID=C2794518AD56B7119F0DA630B73B05AA.58tomcat2',
    'Connection': 'keep-alive',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
    'host': 'global.krx.co.kr',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
}

params = {
    'bld': 'COM/stock_isu_info',
    'name': 'finderBld',
    '_': '1668677450106',
}

# make get request to the url and keep the connection open
response = requests.get(url, headers=headers, params=params, stream=True)
# response = requests.get(url, params=params, headers=headers)
relay_data = response.text
But upon sending a request to the second endpoint with the code in the payload, it returns an empty list, whereas I was expecting a populated response like the one I see in the browser (screenshot of the expected response omitted).
The code I used to make the second request is the following (I added lots of header and body values in the hope of retrieving the data by simulating the request the web page makes):
url = 'https://global.krx.co.kr/contents/GLB/99/GLB99000001.jspx'

headers = {
    # ':authority': 'global.krx.co.kr',
    # ':method': 'POST',
    # ':path': '/contents/GLB/99/GLB99000001.jspx',
    # ':scheme': 'https',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
    'content-length': '0',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'SCOUTER=x22rkf7ltsmr7l; __utma=88009422.986813715.1652669493.1652669493.1652669493.1; SCOUTER=z6pj0p85muce99; JSESSIONID=bOnAJtLWSpK1BiCuhWD0ldj1TqW5z6wEcn65oVgtyie841OlbdJs3fEHpUs1QtAV.bWRjX2RvbWFpbi9tZGNvd2FwMS1tZGNhcHAwMQ==; JSESSIONID=C2794518AD56B7119F0DA630B73B05AA.58tomcat2',
    'origin': 'https://global.krx.co.kr',
    'referer': 'https://global.krx.co.kr/contents/GLB/99/GLB99000001.jsp',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'sec-gpc': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

payload = {
    'market_gubun': '0',
    'isu_cdnm': 'All',
    'isu_cd': '',
    'isu_nm': '',
    'isu_srt_cd': '',
    'sort': '',
    'ck_std_ind_cd': '20',
    'par_pr': '',
    'cpta_scl': '',
    'sttl_trm': '',
    'lst_stk_vl': '1',
    'in_lst_stk_vl': '',
    'in_lst_stk_vl2': '',
    'cpt': '1',
    'in_cpt': '',
    'in_cpt2': '',
    'nat_tot_amt': '1',
    'in_nat_tot_amt': '',
    'in_nat_tot_amt2': '',
    'pagePath': '/contents/GLB/03/0308/0308010000/GLB0308010000.jsp',
    'code': relay_data,
    'pageFirstCall': 'Y',
}

# make request with url, headers, body
response = requests.post(url, headers=headers, data=payload)
print(response.text)
And here is the output for the code above:
{"DS1":[]}
Any help would be very much appreciated.
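As a general note for two-step flows like this one: if the OTP request and the data request do not share cookies, the server may treat the code as belonging to a different session. A minimal sketch of sharing a single requests.Session across both calls, reusing the endpoints and form fields from the question (whether this is the actual cause of the empty list here is an assumption):

import requests

session = requests.Session()

# step 1: fetch the one-time code with the same session that will use it
otp = session.get(
    'https://global.krx.co.kr/contents/COM/GenerateOTP.jspx',
    params={'bld': 'COM/stock_isu_info', 'name': 'finderBld'},
    headers={'user-agent': 'Mozilla/5.0'},
)

# step 2: post the code back on the same session, so the cookies match
payload = {  # same form fields the question sends
    'market_gubun': '0', 'isu_cdnm': 'All', 'isu_cd': '', 'isu_nm': '',
    'isu_srt_cd': '', 'sort': '', 'ck_std_ind_cd': '20', 'par_pr': '',
    'cpta_scl': '', 'sttl_trm': '', 'lst_stk_vl': '1', 'in_lst_stk_vl': '',
    'in_lst_stk_vl2': '', 'cpt': '1', 'in_cpt': '', 'in_cpt2': '',
    'nat_tot_amt': '1', 'in_nat_tot_amt': '', 'in_nat_tot_amt2': '',
    'pagePath': '/contents/GLB/03/0308/0308010000/GLB0308010000.jsp',
    'code': otp.text.strip(),
    'pageFirstCall': 'Y',
}
resp = session.post(
    'https://global.krx.co.kr/contents/GLB/99/GLB99000001.jspx',
    headers={'user-agent': 'Mozilla/5.0', 'x-requested-with': 'XMLHttpRequest'},
    data=payload,
)
print(resp.text)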
I am scraping a page, but when I request the link that should return all the information, it tells me that the data does not exist. When I check the JSON with the Firefox inspector, however, the response has all the information. I have manipulated the headers, but I have not succeeded in getting the data to show.
My code:
settings.py:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False
mi_spider.py:
from scrapy import Spider
from scrapy.http import Request
from json import loads, dump

N_categoria = 0
API_key = 'P1MfFHfQMOtL16Zpg36NcntJYCLFm8FqFfudnavl'


class MetrocScrapingSpider(Spider):
    name = 'metroc_scraping'
    allowed_domains = ['metrocuadrado.com']
    start_urls = ['https://www.metrocuadrado.com/']

    def parse(self, response):
        print()
        print('Entra aca 1')
        print()
        aptos_links = response.xpath('//*[@class="box-list"]')[N_categoria].xpath('.//li//a/@href').extract()
        data_links = []
        for url in aptos_links:
            items = {}
            url = url.split('.com')[-1].split('/')
            for ind, info in enumerate(url):
                if info == '':
                    url.pop(ind)
            items['inmu_'] = url[0]
            items['type_'] = url[1]
            items['loc_'] = url[-1]
            data_links.append(items)
        n_cat = 1
        yield Request(url=response.url,
                      callback=self.first_parse,
                      meta={'data_links': data_links,
                            'n_cat': n_cat,
                            'aptos_links': aptos_links},
                      dont_filter=True)

    def first_parse(self, response):
        data_links = response.meta['data_links']
        n_cat = response.meta['n_cat']
        aptos_links = response.meta['aptos_links']
        n_from = 0
        cat_linl = aptos_links[n_cat]
        data_link = data_links[n_cat]
        print(data_link)
        inmu_ = data_link['inmu_']
        type_ = data_link['type_']
        loc_ = data_link['loc_']
        api_link = 'https://www.metrocuadrado.com/rest-search/search?realEstateTypeList=' + inmu_ + '&realEstateBusinessList=' + type_ + '&city=' + loc_ + '&from='
        yield Request(url=api_link + str(n_from) + '&size=50',
                      callback=self.main_parse,
                      meta={'data_links': data_links,
                            'n_cat': n_cat,
                            'n_from': n_from,
                            'api_link': api_link},
                      dont_filter=True,
                      headers={'Accept': 'application/json, text/plain, */*',
                               'Accept-Encoding': 'gzip, deflate, br',
                               'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
                               'Connection': 'keep-alive',
                               'DNT': '1',
                               'Host': 'www.metrocuadrado.com',
                               'Upgrade-Insecure-Requests': '1',
                               'Referer': cat_linl,
                               'Pragma': 'no-cache',
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
                               'X-Api-Key': API_key,
                               'X-Requested-With': 'XMLHttpRequest'})

    def main_parse(self, response):
        print()
        print(response.url)
        print()
        print(response.status)
        print()
        jsonresponse = loads(response.text)
        print(jsonresponse)
Below the link and the response status, the script prints the JSON response. As you can see, "totalHits" is 0, "totalEntries" is 0 too, and the results list is empty. But if you look at the Firefox inspector (screenshot of the request headers omitted), part of the response shows "totalHits" as 3135 and "totalEntries" as 3135 (screenshot omitted).
I don't know why this happens. Any help, please?
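One way to narrow this down is to reproduce the call outside Scrapy with plain requests, so you can tell whether the empty result comes from the headers and API key or from how the spider builds the URL. A sketch using the same endpoint and X-Api-Key from the question (the filter values are placeholders; substitute the ones your spider extracts):

import requests

API_KEY = 'P1MfFHfQMOtL16Zpg36NcntJYCLFm8FqFfudnavl'  # key from the spider above

params = {
    # placeholder filter values
    'realEstateTypeList': 'apartamento',
    'realEstateBusinessList': 'venta',
    'city': 'bogotá',
    'from': 0,
    'size': 50,
}
headers = {
    'X-Api-Key': API_KEY,
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
}

r = requests.get('https://www.metrocuadrado.com/rest-search/search',
                 params=params, headers=headers)
print(r.status_code)
print(r.text[:300])  # inspect totalHits here before blaming the spider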
I have the following code in my web scraper:
postbody = {'Submit': {}, 'czas_kon2': '', 'czas_pocz2': '', 'num_pacz': '', 'typ': 'wsz'}
post = requests.post(spolka, data=postbody)
data = post.text
I am executing it over 400 webpages in a loop, to scrape data using multiprocessing (8 processes).
data is supposed to contain the whole HTML page for further XML processing.
But out of 400 pages, I get 2 that do not return meaningful content. I suspect it is because of the heavy load I create. I tried time.sleep(1) and time.sleep(10), but no luck.
How could I ensure that the data or post variable always contains the whole page, as it does for the 398 working ones?
I tried a simple while loop for retrying, but it is far from perfect (I was able to get 1 of the remaining 2 pages after one extra attempt).
while len(data) < 1024:
    postbody = {'Submit': {}, 'czas_kon2': '', 'czas_pocz2': '', 'num_pacz': '', 'typ': 'wsz'}
    post = requests.post(spolka, data=postbody)
    data = post.text
I think you should add request headers.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'}
postbody = {'Submit': {}, 'czas_kon2': '', 'czas_pocz2': '', 'num_pacz': '', 'typ': 'wsz'}
post = requests.post(spolka, data=postbody, headers=headers)
Here is an example with more headers:
headers = {
    'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Host': 'www.google.com',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}
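Independent of the headers, you can also make the retrying more robust than a hand-written while loop by letting requests retry at the transport level. A sketch using urllib3's Retry through an HTTPAdapter (the retry count and status codes are example values; allowed_methods needs urllib3 1.26 or newer, where older versions call it method_whitelist):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def fetch_page(url, postbody, headers):
    # retry transient failures (connection errors, 5xx) automatically;
    # POST must be listed explicitly because it is not retried by default
    retries = Retry(total=5, backoff_factor=1,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=['POST'])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.mount('http://', HTTPAdapter(max_retries=retries))
    response = session.post(url, data=postbody, headers=headers, timeout=30)
    # a 200 response with a truncated body still needs an application-level
    # check, e.g. the len(data) < 1024 test from the question
    return response.text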
When I visit 'https://baike.baidu.com/wikitag/taglist?tagId=75953' in Chrome, I can see through Fiddler that the browser sends a POST request to 'https://baike.baidu.com//wikitag/api/getlemmas'.
So I'm trying to send a POST request with the same form data to that URL and get the JSON data from its response.
I took all the headers and form data from Fiddler and tried to send the same POST request from Python 3 using the requests package.
But even though I send the POST request with the same headers and form data, I get a response (status 200) with an empty body.
The same request sent via Postman works fine, but from Python 3 it fails every time.
# -*- coding:UTF-8 -*-
import requests


def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    cookies = {
        'BAIDUID': 'EEE35ACB030447144E615B191397065B:FG=1;PSTM=1523192637;BIDUPSID=B34DD366905D15BB907C1667346970AE;Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1522304864,1522305101,1523192946,1523253565;PSINO=2;H_PS_PSSID=1990_1438_26082_21 125_22074;BDORZ=B490B5EBF6F3CD402E515D22BCDA1598'
    }
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Length': '91',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
    }
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagID': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    req = requests.post(url=target, data=forms, verify=False, headers=headers)
    print(req.text)
    """
    html = json.loads(req.text)
    for each in html['lemmaList']:
        print('lemmaCroppedTitle:', each['lemmaCroppedTitle'])
    print(req.text)
    """


def main():
    disease_json()


if __name__ == '__main__':
    main()
The following is the correct request sent by the browser (screenshot omitted).
I modified the content-type and your request payload. I also added a method, encode_multipart_data, to transform the payload so it is consistent with multipart/form-data:
import sys
import requests


def encode_multipart_data(fields):
    boundary = '------WebKitFormBoundary7MA4YWxkTrZu0gW'
    CRLF = '\r\n'
    L = []
    for key, value in fields.items():
        L.append(boundary)
        L.append('Content-Disposition: form-data; name="%s"\r\n' % key)
        L.append(value)
    L.append(boundary + "--")
    body = CRLF.join(L)
    return body


def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # changed content-type
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com'
    }
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagId': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    payload = encode_multipart_data(forms)
    resp = requests.post(url=target, data=payload, headers=headers)
    print(resp.text)


if __name__ == '__main__':
    disease_json()
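As a side note, requests can build a multipart/form-data body itself, which avoids hand-rolling the boundary. A sketch of the same request using the files parameter, where a (None, value) tuple means "no filename", i.e. a plain form field (whether this exact header set satisfies the server is an assumption):

import requests

forms = {
    'limit': '24', 'timeout': '3000', 'filterTags': '[]', 'tagId': '75953',
    'fromLemma': 'false', 'contentLength': '40', 'page': '0',
}
# requests sets the multipart Content-Type header and boundary automatically
multipart_fields = {key: (None, value) for key, value in forms.items()}

resp = requests.post('https://baike.baidu.com/wikitag/api/getlemmas',
                     files=multipart_fields,
                     headers={'User-Agent': 'Mozilla/5.0',
                              'X-Requested-With': 'XMLHttpRequest',
                              'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953'})
print(resp.text)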
The following approach can also solve the problem.
import requests
import http.cookiejar
import json

url = "https://baike.baidu.com/wikitag/api/getlemmas"
payload = "limit=24&timeout=3000&filterTags=%5B%5D&tagId=75953&fromLemma=false&contentLength=40&page=0"
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}


def get_cookies():
    session = requests.Session()
    session.cookies = http.cookiejar.LWPCookieJar("cookie")
    response = session.post(url, headers=headers, data=payload, allow_redirects=False, verify=False)
    session.cookies.save(ignore_discard=True, ignore_expires=True)
    return response


def disease_json(times=-1):
    times += 1
    response = get_cookies()
    if response.status_code == 302:
        session = requests.session()
        session.cookies = http.cookiejar.LWPCookieJar(filename='cookie')
        session.cookies.load(ignore_discard=True)
        url = response.headers['Location']
        response = session.post(url, headers=headers, data=payload, allow_redirects=False)
    json_data = response.text
    print(json.loads(json_data))
    print(times)


if __name__ == '__main__':
    disease_json()