BeautifulSoup Python web scraping Missing html Main Body - python-3.x

I am using Beautiful Soup to scrape this web page: https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03
Result: I get the JavaScript and CSS.
Desired output: I need the main HTML.
I used this code:
import requests
from bs4 import BeautifulSoup

# Target page. Note: everything after '#' is a URL fragment, which is never
# sent to the server — requests only receives the JS/CSS bootstrap shell,
# not the JavaScript-rendered race data.
url = 'https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03'
# Browser-like User-Agent so the site does not reject the request outright.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

# FIX: the original contained a copy/paste artifact — the url/headers/get
# lines were pasted twice, fused mid-line into a syntax error. Deduplicated.
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

I’m afraid you won’t be able to get it directly using BeautifulSoup, because the page loads first and then JavaScript loads the data.
That is one of the library's limitations; you may need to use Selenium.
Please check the answers on this question.

I think what you're looking for is this:
# Fetch the page; .text is the decoded response body (HTML markup as str).
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
It will contain the text from the page, including the HTML tags.

Related

How to scrape an article which requires login to view full content using python?

I am trying to scrape an article from The Wall Street Journal and it requires log-in to view the whole content. So, I have written a code like the below using Python Requests:
import requests
from bs4 import BeautifulSoup
import re
import base64
import json
# NOTE: <username>, <password> and <connection_name> below are redacted
# placeholders — the snippet is not runnable as-is; fill them in first.
username= <username>
password= <password>
base_url= "https://accounts.wsj.com"
# One Session so auth cookies persist across the whole multi-step login flow.
session = requests.Session()
r = session.get("{}/login".format(base_url))
soup = BeautifulSoup(r.text, "html.parser")
# The login page embeds its OAuth client configuration Base64-encoded inside
# a JavaScript Base64.decode('...') call; extract that payload with a regex.
credentials_search = re.search("Base64\.decode\('(.*)'", r.text, re.IGNORECASE)
base64_decoded = base64.b64decode(credentials_search.group(1))
credentials = json.loads(base64_decoded)
connection = <connection_name>
# First leg of the SSO flow: post the credentials together with the
# client_id/state/nonce/scope values extracted from the login page above.
r = session.post(
'https://sso.accounts.dowjones.com/usernamepassword/login',
data = {
"username": username,
"password": password,
"connection": connection,
"client_id": credentials["clientID"],
"state": credentials["internalOptions"]["state"],
"nonce": credentials["internalOptions"]["nonce"],
"scope": credentials["internalOptions"]["scope"],
"tenant": "sso",
"response_type": "code",
"protocol": "oauth2",
"redirect_uri": "https://accounts.wsj.com/auth/sso/login"
})
# The response is an HTML auto-submit form; collect every named <input>
# (name -> value) so it can be replayed to the callback endpoint.
soup = BeautifulSoup(r.text, "html.parser")
login_result = dict([
(t.get("name"), t.get("value"))
for t in soup.find_all('input')
if t.get("name") is not None
])
# Second leg: submit the captured form fields to complete the login and
# let the session pick up the authenticated cookies.
r = session.post(
'https://sso.accounts.dowjones.com/login/callback',
data = login_result,
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"},
)
# article get request
r = session.get(
"https://www.wsj.com/articles/singapore-prime-minister-lee-rejects-claims-he-misused-state-powers-in-family-feud-1499094761",
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"}
)
print(r.text)
I am able to log in through the requests, but I am still not getting the full article to scrape. Can anyone help me with this? Thanks in advance :-)
An easy and reliable solution is using the selenium webdriver.
With selenium you create an automated browser window which opens the website and from there you can make it choose the elements to log in. Then the content loads in as usual, like when you look for it manually on your browser.
Then you can soup that page with BeautifulSoup.
from bs4 import BeautifulSoup
from selenium import webdriver

# FIX: use a raw string for Windows paths — in a plain string literal
# "\g" / "\c" are invalid escape sequences (DeprecationWarning now, a
# SyntaxError in future Python versions).
driver = webdriver.Firefox(executable_path=r"C:\Program Files (x86)\geckodriver.exe")
# Download the driver for your desired browser and place it in any path
# for Chrome it's: driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")
# open your website link
driver.get("https://www.your-url.com")
# then soup the page with BS
html = driver.page_source
# FIX: name the parser explicitly — bare BeautifulSoup(html) emits a
# GuessedAtParserWarning and may pick different parsers on different machines.
page_soup = BeautifulSoup(html, "html.parser")
From there you can use the "page_soup" as you usually would.
Any questions? :)

Cannot download an Excel file using Python requests: I can't get the third step — posting the request that downloads the Excel file — to work. Here is my try.

Here is my attempt to download the Excel file.
How do I make it work? Can someone please help me fix the last call?
import requests
from bs4 import BeautifulSoup

url = "http://lijekovi.almbih.gov.ba:8090/SpisakLijekova.aspx"
useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.76"
headers = {
    "User-Agent": useragent
}
session = requests.session()  # session keeps the ASP.NET cookies between calls
r = session.get(url, headers=headers)  # request to get cookies
soup = BeautifulSoup(r.text, "html.parser")  # parsing values
# ASP.NET WebForms hidden fields — every postback must echo these back.
viewstate = soup.find('input', {'id': '__VIEWSTATE'}).get('value')
viewstategenerator = soup.find('input', {'id': '__VIEWSTATEGENERATOR'}).get('value')
eventvalidation = soup.find('input', {'id': '__EVENTVALIDATION'}).get('value')
# NOTE(review): building the Cookie header by hand is redundant once every
# request goes through `session` (the session sends its jar automatically);
# kept because the copied browser headers include it.
cookies = session.cookies.get_dict()
cookie = ";".join(k + "=" + v for k, v in cookies.items())
# headers copied from the browser's AJAX request
headers = {
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': useragent,
    'X-KL-Ajax-Request': 'Ajax_Request',
    'X-MicrosoftAjax': 'Delta=true',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': cookie
}
# Postback that switches the report/grid state before the export.
data = {
    'ctl00$smMain': 'ctl00$MainContent$ReportGrid$ctl103$ReportGrid_top_4',
    '__EVENTTARGET': 'ctl00$MainContent$ReportGrid$ctl103$ReportGrid_top_4',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    '__EVENTVALIDATION': eventvalidation,
    '__ASYNCPOST': 'true'
}
# BUG FIX: the original called module-level requests.get(url, ..., data=data).
# A GET never performs the __doPostBack, and bypassing `session` drops the
# ASP.NET cookies — the postback must be a POST on the same session.
result = session.post(url, headers=headers, data=data)
print(result.headers)
# NOTE(review): MS-AJAX "delta" responses are pipe-delimited and typically
# carry refreshed hidden fields (...|hiddenField|__VIEWSTATE|<value>|...).
# Pick them up if present so the export posts current tokens; this loop is a
# no-op when the markers are absent.
delta = result.text.split('|')
for i, token in enumerate(delta[:-1]):
    if token == '__VIEWSTATE':
        viewstate = delta[i + 1]
    elif token == '__VIEWSTATEGENERATOR':
        viewstategenerator = delta[i + 1]
    elif token == '__EVENTVALIDATION':
        eventvalidation = delta[i + 1]
data = {
    '__EVENTTARGET': 'ctl00$MainContent$btnExport',
    '__VIEWSTATE': viewstate,
    # FIX: these two were missing from the original export payload even
    # though WebForms validation requires them alongside __VIEWSTATE.
    '__VIEWSTATEGENERATOR': viewstategenerator,
    '__EVENTVALIDATION': eventvalidation,
}
# Remove the AJAX-only headers for the final full-page POST that returns
# the Excel file itself.
del headers['X-KL-Ajax-Request']
del headers['X-MicrosoftAjax']
del headers['X-Requested-With']
# BUG FIX: use the session here too; module-level requests.post would send
# the export request without the established ASP.NET session cookies.
result = session.post(url, headers=headers, data=data, allow_redirects=True)
print(result.headers)
print(result.status_code)
with open("test.xlsx", "wb") as f:
    f.write(result.content)
I am trying to export the Excel file without Selenium's help, but I cannot get the last step to work. I need help converting the XMLHttpRequest flow to pure requests in Python, without any Selenium.

Getting response 403 when I'm trying to crawl; the user agent doesn't help, in Python 3

I'm trying to crawl this website and get the message:
"You don't have permission to access"
Is there a way to bypass this? I have already tried user agents and urlopen.
Here is my code:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from urllib.request import Request, urlopen

url = 'https://www.oref.org.il/12481-he/Pakar.aspx'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}

# Fetch the page with a browser-like User-Agent and parse the raw body.
response = requests.get(url, headers=header)
soup = BeautifulSoup(response.content, 'html.parser')
print(response)
output:
<Response [403]>
also tried to do that:
# Same fetch via urllib, again spoofing a desktop-browser User-Agent.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'})
# NOTE(review): per the output below this still raises HTTP Error 403 —
# the server's blocking is apparently not defeated by the User-Agent alone.
webpage = urlopen(req).read()
output:
HTTP Error 403: Forbidden
I am still blocked and get response 403. Can anyone help?

Scraping a Video from a website and downloading it as mp4

I am trying to scrape this website and download the soccer goal that is in it. I saw this Stack Overflow post and tried the solution, but it still does not work.
Here is the code I have done
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4p'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

# Stream the response body and copy it to disk chunk by chunk.
with open('video.mp4', 'wb') as out_file:
    resp = requests.get(url, headers=headers, stream=True)
    print(resp)
    for part in resp.iter_content(chunk_size=1024):
        if part:
            out_file.write(part)
When I printed the request it showed 403, and because of that the video will not open or download. Any thoughts on how I can download the video?
Use this below url.
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4?Expires=1621994280&Signature=IqySuJxyVi9pCmC~JUhl-iyp-LmG6OiAFfQeu-~-a55osCfu9VrksEhzaQzJlMxAHcSt1R4j9Pt-G8sblQeFt3UtGqY-neHJkC4mUxuHjxGWAWdksyiAxkMb8DYRLkvIseUfkbKbeO6Dt807QwMkspFmXYdzljm8DLho6nMQfC--jtfy8B2gONhA9YUmK2o~fUHwTHzTXXqNGct2hQl-B9cFLDBdj8LXWTj-75YInwWxLwtoenKK~qLahGtJXKXvxTVltxMvUYXXvP9F~WfhNIhNqns1JKrrrqJ~N1XunZHCv~IVJyzOEvrn2G4J5LMIn~dcEZ9frV3APHsE4D~HQA__&Key-Pair-Id=APKAIEYUVEN4EVB2OKEQ'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

# Stream the signed CDN URL straight into a local file, 1 KiB at a time.
with open('video.mp4', 'wb') as out_file:
    resp = requests.get(url, headers=headers, stream=True)
    print(resp)
    for part in resp.iter_content(chunk_size=1024):
        if part:
            out_file.write(part)
This is how you extract the video link from Streamable if you want to in a programmatic way
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
url = "https://streamable.com/a50s3e"
page = requests.get(url)
dom = BeautifulSoup(page.text, 'html.parser')

# The first <video> tag's src attribute holds a protocol-relative CDN link.
videos = dom.find_all('video')
src = videos[0].get("src")
url = "https:" + src

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

# Stream the extracted video URL into a local file chunk by chunk.
with open('video.mp4', 'wb') as out_file:
    resp = requests.get(url, headers=headers, stream=True)
    print(resp)
    for part in resp.iter_content(chunk_size=1024):
        if part:
            out_file.write(part)

BeautifulSoup find() is returning none

import requests
from bs4 import BeautifulSoup

URL = 'https://www.amazon.de/BenQ-GL2580H-Monitor-Eye-Care-Reaktionszeit/dp/B073NTJHYY/ref=sr_1_3?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&dchild=1&keywords=bildschirm&qid=1597391122&sr=8-3'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}

# Fetch the product page and look up title/price nodes by element id.
response = requests.get(URL, headers=headers)
document = BeautifulSoup(response.content, 'html.parser')
title = document.find(id="productTitle")
price = document.find(id="priceblock_ourprice")
print("Titel:", title, "\n", "Preis:", price)
Output is always:
Titel: None
Preis: None
I already checked the preceding steps, and everything works fine until it reaches the find function.
I have never asked a question before, so forgive me if I made mistakes.
Thanks for the help.
You have to use a different parser.
Try making the following change:
soup = BeautifulSoup(page.content, 'html.parser')
to
soup = BeautifulSoup(page.content, 'lxml')

Resources