Why do I get code 400 when POSTing "multipart/form-data" in Scrapy (Python 3)?

I have been trying hard to submit this form, with no success.
The form is supposed to redirect and return a new URL containing the PDF.
Here is how to access the page in question:
Start with Search Page
Click on Document Type tab
Enter LP, click Search
Click View
Click Get Image
The View PDF button is the one I'm interested in.
I need to mimic the multipart form data, which looks like this:
<form name="courtform" action="http://oris.co.palm-beach.fl.us:8080/PdfServlet/PdfServlet27" method="post" enctype="multipart/form-data">
<input type="hidden" name="hostURL" value="http://oris.co.palm-beach.fl.us/or_web1/" size="60">
<input type="hidden" name="pdfPath" value="\\wcp01zfs-03.clerk.local\files2\ORISPDF\" size="60">
<input type="hidden" name="pdfURL" value="http://oris.co.palm-beach.fl.us/pdf/" size="60">
<input type="hidden" name="pages" value="1" size="60">
<!--<input type="hidden" name="pages" value="1" size="60">-->
<input type="hidden" name="id" value="22590889" size="60">
<input type="hidden" name="mpages" value="1" size="60">
<input type="hidden" name="doc_id" value="22590889" size="60">
<input type="hidden" name="page1" value="image_from_file.asp?imageurl=\\ors_fs\ORImage\O\30336\O.30336.1200.0001.tif" size="60">
<input type="hidden" name="WaterMarkText" value="1" size="60">
<input name="button" type="button" value="View PDF" onclick="javascript:ValidateAndSubmit(this.form)">
Here is part of my Scrapy code responsible for this request:
def get_image(self, response):
    # inspect_response(response, self)
    url = 'http://oris.co.palm-beach.fl.us:8080/PdfServlet/PdfServlet27'
    headers = {
        'Connection': 'keep-alive',
        'origin': "http://oris.co.palm-beach.fl.us",
        'upgrade-insecure-requests': "1",
        'dnt': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'cache-control': "max-age=0",
        'Accept-Encoding': 'gzip,deflate',
    }
    id = response.xpath("//input[@name='doc_id']/@value").extract_first()
    body = {
        'WaterMarkText': '0',
        'hostURL': 'http://oris.co.palm-beach.fl.us/or_web1/',
        'mpages': '1',
        'page1': 'image_from_file.asp?imageurl=\\ors_fs\\ORImage\\O\\30338\\O.30338.0268.0001.tif',
        'pages': '1',
        'pdfPath': '\\wcp01zfs-03.clerk.local\\files2\\ORISPDF\\',
        'pdfURL': 'http://oris.co.palm-beach.fl.us/pdf/',
    }
    body['doc_id'] = id
    body['id'] = id
    me = MultipartEncoder(fields=body, boundary='------WebKitFormBoundarygGHlhpHs08goICxO')
    me_body = me.to_string()
    headers['Content-Type'] = me.content_type
    headers['Content-Length'] = me.len
    yield scrapy.Request(url, method='POST', body=me_body, callback=self.get_pdf, headers=headers)
    yield {'body': me_body}

def get_pdf(self, response):
    inspect_response(response, self)
Whenever I run the code I get a 400 response.
How do I mimic this form correctly?
UPDATE:
It appears I do not need to provide Content-Length manually.
After I removed it, the request worked just once, and then it reverted to a 404 error.
Is the boundary supposed to be new for every request? From what I read it does not have to be, since it is just a divider with no other purpose.
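For what it's worth, a minimal sketch (not taken from the code above): if the boundary ever turns out to matter, requests_toolbelt's MultipartEncoder generates a fresh random boundary whenever the boundary argument is omitted, and me.content_type carries that boundary into the header.

from requests_toolbelt import MultipartEncoder

# Example field values only; use whatever the form actually requires.
fields = {'doc_id': '22590889', 'WaterMarkText': '0'}
me = MultipartEncoder(fields=fields)           # no boundary given: a random one is generated
headers = {'Content-Type': me.content_type}    # header carries the generated boundary
body = me.to_string()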

I had to automate the entire process of filling the form and now it seems to work just fine.
def get_image(self, response):
    # inspect_response(response, self)
    item = response.meta['item']
    url = 'http://oris.co.palm-beach.fl.us:8080/PdfServlet/PdfServlet27'
    headers = {
        'Connection': 'keep-alive',
        'origin': "http://oris.co.palm-beach.fl.us",
        'upgrade-insecure-requests': "1",
        'dnt': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'cache-control': "max-age=0",
        'Accept-Encoding': 'gzip,deflate',
    }
    body = {}
    # Generate the body from the form's hidden inputs
    for i in response.xpath("//form[@name='courtform']/input"):
        name = i.xpath(".//@name").extract_first()
        val = i.xpath(".//@value").extract_first()
        body[name] = val
    # Remove watermark from PDF
    body['WaterMarkText'] = '0'
    me = MultipartEncoder(fields=body, boundary='----WebKitFormBoundarygGHghpHs08goICxO')
    me_body = me.to_string()
    headers['Content-Type'] = me.content_type
    yield scrapy.Request(url, method='POST', body=me_body, callback=self.get_pdf, headers=headers, meta={'item': item})
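The get_pdf callback is not shown above; here is a minimal sketch of what it might look like if the servlet returns the PDF bytes directly. The filename scheme is a hypothetical, not part of the original spider.

def get_pdf(self, response):
    # Assumption: the servlet responds with the raw PDF bytes in the response body.
    item = response.meta['item']
    # Hypothetical naming scheme; adjust to whatever your item actually carries.
    filename = '{}.pdf'.format(item.get('doc_id', 'document'))
    with open(filename, 'wb') as f:
        f.write(response.body)
    yield item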

Related

Response provided for one page (200) but not for the other (401)

On the same website, a consistent valid response is provided for one page (200, url/nifty) but not for another page (401, url/oispurtscontracts).
It does provide a valid response sometimes and other times returns a 401 error.
The browser cache was cleared and the page reloaded.
Please provide a solution.
Error :
response.status_code = 401 for https://www.nseindia.com/api/live-analysis-oi-spurts-contracts
Code:
import time

import requests


def connRequest(url, headers):
    session = requests.Session()
    request = session.get(url, headers=headers)
    cookies = dict(request.cookies)
    # print(cookies)
    print(f"response.status_code = {request.status_code} for {url}")
    response = session.get(url, headers=headers, cookies=cookies).json()
    print(f"response = {response}")
    return response


# Working - Response Provided
def nifty_Working():
    url = 'https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY'
    # data = requests.get(url)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.5',
        'Accept': 'application/json'
    }
    response = connRequest(url, headers)


# 401 Error
def oiSpurtsContracts_NotWorking():
    url = 'https://www.nseindia.com/api/live-analysis-oi-spurts-contracts'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en',
        'Accept': 'application/json'
    }
    response = connRequest(url, headers)


def main():
    # Working - Response Provided
    nifty_Working()

    print()
    print()
    print()
    print()

    # 401 Error
    time.sleep(1)
    oiSpurtsContracts_NotWorking()


main()

How to connect to a site using its API with Python?

I am trying to scrape formularylookup.com, a site with information on the market for pharmaceuticals.
It requires a login:
username: -
password: -
I need the information for the medicine called Rybelsus.
When I look into Inspect -> Network -> XHR, I suspect there could be an easy way to get the required data from this page:
https://formularylookup.com/Formulary/Coverage?ProductId=237171&ProductName=Rybelsus&ChannelId=1&DrugTypeId=3&StateId=all&Options=SummaryCoverages
I identified this site, which might give an idea of how to connect to formularylookup.com, but I am very inexperienced with connecting to APIs.
Here's my code:
import requests
from bs4 import BeautifulSoup

url = "https://api.mmitnetwork.com/Formulary/v1/Products?Name=rybelsus"

params = {
    "ProductId": "237171",
    "productSearch": "Rybelsus"}

headers = {
    "authorization": "Bearer H-oa4ULGls2Cpu8U6hX4myixRoFIPxfj",
    "Access-Token": "H-oa4ULGls2Cpu8U6hX4myixRoFIPxfj",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Host": "formularylookup.com",
    "X-NewRelic-ID": "XAYCVFZSGwcGU1lXBAI="
}

res = requests.get(url, params=params, headers=headers)

soup = BeautifulSoup(res.content, "lxml")
print(soup.prettify())
Which gives me the following response:
<!DOCTYPE html>
<html>
<head>
<title>
The resource cannot be found.
</title>
<meta content="width=device-width" name="viewport"/>
<style>
body {font-family:"Verdana";font-weight:normal;font-size: .7em;color:black;}
p {font-family:"Verdana";font-weight:normal;color:black;margin-top: -5px}
b {font-family:"Verdana";font-weight:bold;color:black;margin-top: -5px}
H1 { font-family:"Verdana";font-weight:normal;font-size:18pt;color:red }
H2 { font-family:"Verdana";font-weight:normal;font-size:14pt;color:maroon }
pre {font-family:"Consolas","Lucida Console",Monospace;font-size:11pt;margin:0;padding:0.5em;line-height:14pt}
.marker {font-weight: bold; color: black;text-decoration: none;}
.version {color: gray;}
.error {margin-bottom: 10px;}
.expandable { text-decoration:underline; font-weight:bold; color:navy; cursor:hand; }
@media screen and (max-width: 639px) {
pre { width: 440px; overflow: auto; white-space: pre-wrap; word-wrap: break-word; }
}
@media screen and (max-width: 479px) {
pre { width: 280px; }
}
</style>
</head>
<body bgcolor="white">
<span>
<h1>
Server Error in '/' Application.
<hr color="silver" size="1" width="100%"/>
</h1>
<h2>
<i>
The resource cannot be found.
</i>
</h2>
</span>
<font face="Arial, Helvetica, Geneva, SunSans-Regular, sans-serif ">
<b>
Description:
</b>
HTTP 404. The resource you are looking for (or one of its dependencies) could have been removed, had its name changed, or is temporarily unavailable. Please review the following URL and make sure that it is spelled correctly.
<br/>
<br/>
<b>
Requested URL:
</b>
/Formulary/v1/Products
<br/>
<br/>
<hr color="silver" size="1" width="100%"/>
<b>
Version Information:
</b>
Microsoft .NET Framework Version:4.0.30319; ASP.NET Version:4.6.1590.0
</font>
</body>
</html>
<!--
[HttpException]: The controller for path '/Formulary/v1/Products' was not found or does not implement IController.
at System.Web.Mvc.DefaultControllerFactory.GetControllerInstance(RequestContext requestContext, Type controllerType)
at System.Web.Mvc.DefaultControllerFactory.CreateController(RequestContext requestContext, String controllerName)
at System.Web.Mvc.MvcHandler.ProcessRequestInit(HttpContextBase httpContext, IController& controller, IControllerFactory& factory)
at System.Web.Mvc.MvcHandler.BeginProcessRequest(HttpContextBase httpContext, AsyncCallback callback, Object state)
at System.Web.HttpApplication.CallHandlerExecutionStep.System.Web.HttpApplication.IExecutionStep.Execute()
at System.Web.HttpApplication.ExecuteStep(IExecutionStep step, Boolean& completedSynchronously)
-->
<!--
This error page might contain sensitive information because ASP.NET is configured to show verbose error messages using <customErrors mode="Off"/>. Consider using <customErrors mode="On"/> or <customErrors mode="RemoteOnly"/> in production environments.-->
Update: I get a 404 error. Not sure why.
The code below will help you:
import requests

headers = {
    'Accept': '*/*',
    'X-Requested-With': 'XMLHttpRequest',
    'Access-Token': '7Lq-KkDx2fCO_3kG90pLEpBS9Ssh62IQ',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
    'Is-Session-Expired': 'false',
    'Referer': 'https://formularylookup.com/',
}

response = requests.get('https://formularylookup.com/Formulary/Coverage?ProductId=237171&ProductName=Rybelsus&ChannelId=1&DrugTypeId=3&StateId=AL&Options=SummaryCoverages', headers=headers)
print(response.json())
Note: 'Is-Session-Expired': 'false' is very important in the header, otherwise you'll get a 404 error.

Why am I being detected as a robot when I am replicating the exact request a browser is making?

The website "https://www.interlinecenter.com/" makes a request to "http://cs.cruisebase.com/cs/forms/hotdeals.aspx?skin=605&nc=y" to load HTML content in an iframe. I am making the exact same request, using the same headers the browser sends, but I am not getting the same content.
Here is the code I am using:
import requests
from lxml import html

url = 'http://cs.cruisebase.com/cs/forms/hotdeals.aspx?skin=605&nc=y'
header = {
    'Host': 'cs.cruisebase.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.interlinecenter.com/',
    'Connection': 'keep-alive',
    'Cookie': 'visid_incap_312345=yt2dprI6SuGoy44xQsnF36dOwV0AAAAAQUIPAAAAAAAqm0pG5WAWOGjtyY8GOrLv; __utma=15704100.1052110012.1572947038.1574192877.1575447075.6; __utmz=15704100.1575447075.6.6.utmcsr=interlinecenter.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ASP.NET_SessionId=pzd3a0l5kso41hhbqf3jiqlg; nlbi_312345=/7dzbSeGvDjg2/oY/eQfhwAAAACv806Zf3m7TsjHAou/y177; incap_ses_1219_312345=tMxeGkIPugj4d1gaasLqECHE5l0AAAAAg1IvjaYhEfuSIYLXtc2f/w==; LastVisitedClient=605; AWSELB=85D5DF550634E967F245F317B00A8C32EB84DA2B6B927E6D5CCB7C26C3821788BFC50D95449A1BA0B0AFD152140A70F5EA06CBB8492B21E10EC083351D7EBC4C68F086862A; incap_ses_500_312345=6PJ9FxwJ3gh0vta6kVvwBthz510AAAAAvUZPdshu8GVWM2sbkoUXmg==; __utmb=15704100.2.10.1575447075; __utmc=15704100; __utmt_tt=1',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

response = requests.get(url, timeout=10, headers=header)
byte_data = response.content
source_code = html.fromstring(byte_data)
print(response)
print(byte_data)
This is the response I am getting:
<Response [200]>
<html style="height:100%"><head><META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW"><meta name="format-detection" content="telephone=no"><meta name="viewport" content="initial-scale=1.0"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"></head><body style="margin:0px;height:100%"><iframe id="main-iframe" src="/_Incapsula_Resource?SWUDNSAI=9&xinfo=10-99927380-0%200NNN%20RT%281575456049298%202%29%20q%280%20-1%20-1%200%29%20r%281%20-1%29%20B12%284%2c316%2c0%29%20U2&incident_id=500000240101726326-477561257670738314&edet=12&cinfo=04000000&rpinfo=0" frameborder=0 width="100%" height="100%" marginheight="0px" marginwidth="0px">Request unsuccessful. Incapsula incident ID: 500000240101726326-477561257670738314</iframe></body></html>
I need to extract/scrape data at "https://cs.cruisebase.com/cs/forms/hotdeals.aspx?skin=605&nc=y".
Note: I don't want to use the Selenium webdriver to get the data. Any help will be much appreciated, thanks!
Did you try getting the headers by loading the target URL directly?
I sent a GET request to https://cs.cruisebase.com/cs/forms/hotdeals.aspx?skin=605&nc=y with the following headers, and I was able to get the complete response.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8,hi;q=0.7,la;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'ENTER COOKIES',
    'DNT': '1',
    'Host': 'cs.cruisebase.com',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
I have left the Cookie field blank, you will have to enter cookies otherwise the page won't load. You can get the cookies from Chrome.
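For reference, here is a rough sketch (not from the answer above) of how the copied cookie string could be wired into the request; the cookie values are placeholders you would replace with whatever Chrome's DevTools shows for your own session.

import requests

# Placeholder: paste the full "Cookie" request header value copied from Chrome DevTools.
raw_cookie = 'visid_incap_312345=...; incap_ses_500_312345=...; ASP.NET_SessionId=...'

headers = {
    'Host': 'cs.cruisebase.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Cookie': raw_cookie,
}

# Fetch the iframe URL directly with the browser-copied cookies attached.
response = requests.get('https://cs.cruisebase.com/cs/forms/hotdeals.aspx?skin=605&nc=y',
                        headers=headers, timeout=10)
print(response.status_code)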

How to show the hidden contents under "View More" when using Selenium - Python

from selenium import webdriver

driver = webdriver.Chrome(r'XXXX\chromedriver.exe')
FB_bloomberg_URL = 'https://www.bloomberg.com/quote/FB:US'
driver.get(FB_bloomberg_URL)
board_members = driver.find_elements_by_xpath('//*[@id="root"]/div/div/section[3]/div[10]/div[1]/div[2]/div/div[2]')[0]
board = board_members.text
board.split('\n')
I wrote the code above to scrape the board information from Bloomberg for Facebook. But I have trouble extracting all the board members because some are hidden behind the "View More" link. How can I extract all the names?
Thanks for the help.
You can do the whole thing with requests and grab the appropriate cookie from a prior GET to pass to the API. The API can be found in the network tab when clicking the View More link and inspecting the web traffic.
import requests

headers = {
    'dnt': '1',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'accept': '*/*',
    'referer': 'https://www.bloomberg.com/quote/FB:US',
    'authority': 'www.bloomberg.com',
    'cookie': ''
}

with requests.Session() as s:
    r = s.get('https://www.bloomberg.com/quote/FB:US')
    headers['cookie'] = s.cookies.get_dict()['_pxhd']
    r = s.get('https://www.bloomberg.com/markets2/api/peopleForCompany/11092218', headers=headers).json()
    board_members = [item['name'] for item in r['boardMembers']]
    print(board_members)

Python 3 requests posts correctly but gets nothing back (via the browser it is OK)

When I visit 'https://baike.baidu.com/wikitag/taglist?tagId=75953' in Chrome, through Fiddler I find that the browser sends a POST request to 'https://baike.baidu.com//wikitag/api/getlemmas'.
So I'm trying to send a POST request with the form data to the URL 'https://baike.baidu.com//wikitag/api/getlemmas' and get the JSON data from its response.
I captured all the headers and form data through Fiddler and tried to send the same POST request from Python 3 using the requests package.
But even though I send the POST request with the same headers and form data, I get a response (status 200) with an empty body.
The same request sent via Postman works fine, but from Python 3 it fails every time.
# -*- coding:UTF-8 -*-
import requests


def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'

    cookies = {
        'BAIDUID': 'EEE35ACB030447144E615B191397065B:FG=1;PSTM=1523192637;BIDUPSID=B34DD366905D15BB907C1667346970AE;Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1522304864,1522305101,1523192946,1523253565;PSINO=2;H_PS_PSSID=1990_1438_26082_21 125_22074;BDORZ=B490B5EBF6F3CD402E515D22BCDA1598'
    }

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Length': '91',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
    }

    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagID': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }

    req = requests.post(url=target, data=forms, verify=False, headers=headers)
    print(req.text)
    """
    html = json.loads(req.text)
    for each in html['lemmaList']:
        print('lemmaCroppedTitle:', each['lemmaCroppedTitle'])
        print(req.text)
    """


def main():
    disease_json()


if __name__ == '__main__':
    main()
The following is the correct request as sent by the browser.
I modified the content-type and your request payload, and added the method encode_multipart_data to transform the payload so it is consistent with multipart/form-data:
import sys
import requests


def encode_multipart_data(fields):
    boundary = '------WebKitFormBoundary7MA4YWxkTrZu0gW'
    CRLF = '\r\n'
    L = []
    for key, value in fields.items():
        L.append(boundary)
        L.append('Content-Disposition: form-data; name="%s"\r\n' % key)
        L.append(value)
    L.append(boundary + "--")
    body = CRLF.join(L)
    return body


def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # changed content-type
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com'
    }

    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagId': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }

    payload = encode_multipart_data(forms)
    resp = requests.post(url=target, data=payload, headers=headers)
    print(resp.text)


if __name__ == '__main__':
    disease_json()
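As an aside (not part of either answer), requests can also build the multipart/form-data body itself when plain form fields are passed through the files parameter as (None, value) tuples, so the boundary and Content-Type header are generated automatically. A minimal sketch with the same fields:

import requests

url = 'https://baike.baidu.com/wikitag/api/getlemmas'
# (None, value) tells requests to send a plain form field rather than a file upload.
multipart_fields = {
    'limit': (None, '24'),
    'timeout': (None, '3000'),
    'filterTags': (None, '[]'),
    'tagId': (None, '75953'),
    'fromLemma': (None, 'false'),
    'contentLength': (None, '40'),
    'page': (None, '0'),
}
resp = requests.post(url, files=multipart_fields)
print(resp.text)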
This approach can also solve the problem:
import requests
import http.cookiejar
import json

url = "https://baike.baidu.com/wikitag/api/getlemmas"
payload = "limit=24&timeout=3000&filtetTags=%5B%5D&tagId=75953&fromLemma=false&contentLegth=40&page=0"
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181\
        Safari/537.36"
}


def get_cookies():
    session = requests.Session()
    session.cookies = http.cookiejar.LWPCookieJar("cookie")
    response = session.post(url, headers=headers, data=payload, allow_redirects=False, verify=False)
    session.cookies.save(ignore_discard=True, ignore_expires=True)
    return response


def disease_json(times=-1):
    times += 1
    response = get_cookies()
    if response.status_code == 302:
        session = requests.session()
        session.cookies = http.cookiejar.LWPCookieJar(filename='cookie')
        session.cookies.load(ignore_discard=True)
        url = response.headers['Location']
        response = session.post(url, headers=headers, data=payload, allow_redirects=False)
        json_data = response.text
        print(json.loads(json_data))
        print(times)


if __name__ == '__main__':
    disease_json()
