BeautifulSoup find() is returning none - python-3.x

import requests
from bs4 import BeautifulSoup

# Amazon.de product page to scrape.
URL = 'https://www.amazon.de/BenQ-GL2580H-Monitor-Eye-Care-Reaktionszeit/dp/B073NTJHYY/ref=sr_1_3?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&dchild=1&keywords=bildschirm&qid=1597391122&sr=8-3'

# Desktop-browser User-Agent so Amazon serves regular HTML instead of a
# bot-detection page.
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}

response = requests.get(URL, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Look up the product title and price nodes by their element ids.
title = soup.find(id="productTitle")
price = soup.find(id="priceblock_ourprice")
print("Titel:", title, "\n", "Preis:", price)
Output is always:
Titel: None
Preis: None
I already checked the steps before, but everything is working fine until it reaches the find function.
I have never asked a question before, so forgive me if I made mistakes.
Thanks for the help.

You have to use a different parser.
Try making the following change:
soup = BeautifulSoup(page.content, 'html.parser')
to
soup = BeautifulSoup(page.content, 'lxml')

Related

WebScraping / Identical sites not working?

I would like to scrape the header element from both of these links -
To me these 2 sites look absolutely identical - pics see below
Why does the scraping work only for the second link and not for the first?
import time
import requests
from bs4 import BeautifulSoup

# not working
first_link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
response = requests.get(first_link)
time.sleep(1)
document = BeautifulSoup(response.content, "html.parser")
header = document.find("header")
print(f"First Link: {header}")

# working
second_link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
response = requests.get(second_link)
time.sleep(1)
document = BeautifulSoup(response.content, "html.parser")
header = document.find("header")
print(f"Second Link: {len(header)}")
Working:
Not Working:
The page is sometimes rendered by JavaScript, so requests alone won't see that content.
You can use a while loop to check if header appears in the soup and then break
import requests
from bs4 import BeautifulSoup

# Browser-like headers. BUG FIX: the original defined this dict but never
# passed it to requests.get(), so every request went out with the default
# python-requests User-Agent.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}

link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

# Re-fetch the page until a <header> element shows up in the response
# (the server sometimes returns a shell page without the rendered content).
while True:
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    header = soup.find("header")
    if header:
        break

print(header)
Try this to get whatever fields you wish to grab from those links. Currently it fetches the title. You can modify res.json()['data'][0]['attributes']['name'] to grab any field of your interest. Make sure to put the urls within this list urls_to_scrape.
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

# App Store pages to pull fields for.
urls_to_scrape = {
    'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
    'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4'
}

# Any app page works for harvesting the bearer token embedded in its markup.
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'

# Query string the amp-api endpoint expects.
params = {
    'platform': 'web',
    'additionalPlatforms': 'appletv,ipad,iphone,mac',
    'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
    'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
    'l': 'en-us',
    'limit[merchandised-in-apps]': '20',
    'omit[resource]': 'autos',
    'sparseLimit[apps:related-editorial-items]': '5'
}

with requests.Session() as session:
    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'

    # The token lives URL-encoded inside a <meta> tag's content attribute.
    landing = session.get(base_url)
    soup = BeautifulSoup(landing.text, "lxml")
    token_raw = soup.select_one("[name='web-experience-app/config/environment']").get("content")
    token = json.loads(unquote(token_raw))['MEDIA_API']['token']

    # Subsequent API calls authenticate with the harvested bearer token.
    session.headers['Accept'] = 'application/json'
    session.headers['Referer'] = 'https://apps.apple.com/'
    session.headers['Authorization'] = f'Bearer {token}'

    for url in urls_to_scrape:
        # Extract the numeric app id from ".../idNNNN?uo=4".
        id_ = url.split("/")[-1].strip("id").split("?")[0]
        api_response = session.get(link.format(id_), params=params)
        title = api_response.json()['data'][0]['attributes']['name']
        print(title)

parse text from span class using beautiful soup

I'm using Beautiful Soup to try to scrape the Morningstar rating out of Yahoo Finance. I have the span class below getting returned. Can anyone suggest how to return the "★★★★★" as a string from the soup below?
url:
https://finance.yahoo.com/quote/RFKTX?p=RFKTX
input:
<td class="Ta(end) Fw(600) Lh(14px)" data-test="MORNING_STAR_RATING-value"><span class="D(ib)">★★★★★</span><span class="D(ib) C($c-fuji-grey-d)"></span></td>
output:
★★★★★
This should do it:
import requests
from bs4 import BeautifulSoup

quote_url = 'https://finance.yahoo.com/quote/RFKTX?p=RFKTX'

with requests.Session() as session:
    # Browser User-Agent so Yahoo serves the full page markup.
    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    response = session.get(quote_url)
    soup = BeautifulSoup(response.text, "html.parser")
    # The star string is the first <span> inside the rating table cell.
    rating = soup.select_one("td[data-test='MORNING_STAR_RATING-value'] > span").text
    print(rating)

Scraping a Video from a website and downloading it as mp4

I am trying to scrape this website and download the soccer goal that is in it. I saw this Stackoverflow post and I tried the solution and it still does not work.
Here is the code I have done
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4p'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

# Stream the response and copy it to disk in 1 KiB chunks.
response = requests.get(url, headers=headers, stream=True)
print(response)
with open('video.mp4', 'wb') as f_out:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)
when I printed the request it showed me 403 and because of that the video will not open or get downloaded, any thoughts on how I can download the video?
Use this below url.
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here (signed CDN URL with expiry + signature
# query parameters, which is what makes the download authorized)
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4?Expires=1621994280&Signature=IqySuJxyVi9pCmC~JUhl-iyp-LmG6OiAFfQeu-~-a55osCfu9VrksEhzaQzJlMxAHcSt1R4j9Pt-G8sblQeFt3UtGqY-neHJkC4mUxuHjxGWAWdksyiAxkMb8DYRLkvIseUfkbKbeO6Dt807QwMkspFmXYdzljm8DLho6nMQfC--jtfy8B2gONhA9YUmK2o~fUHwTHzTXXqNGct2hQl-B9cFLDBdj8LXWTj-75YInwWxLwtoenKK~qLahGtJXKXvxTVltxMvUYXXvP9F~WfhNIhNqns1JKrrrqJ~N1XunZHCv~IVJyzOEvrn2G4J5LMIn~dcEZ9frV3APHsE4D~HQA__&Key-Pair-Id=APKAIEYUVEN4EVB2OKEQ'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

# Stream the response and copy it to disk in 1 KiB chunks.
response = requests.get(url, headers=headers, stream=True)
print(response)
with open('video.mp4', 'wb') as f_out:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)
This is how you extract the video link from Streamable if you want to in a programmatic way
import requests
from bs4 import BeautifulSoup

# Streamable page that embeds the video.
page_url = "https://streamable.com/a50s3e"
page = requests.get(page_url)
soup = BeautifulSoup(page.text, 'html.parser')

# Grab the first <video> tag; its src is a protocol-relative CDN URL.
videos = soup.find_all('video')
src = videos[0].get("src")
url = "https:" + src

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

# Stream the video and copy it to disk in 1 KiB chunks.
download = requests.get(url, headers=headers, stream=True)
print(download)
with open('video.mp4', 'wb') as f_out:
    for chunk in download.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)

BeautifulSoup Python web scraping Missing html Main Body

i am using Beutifull soup to scrape this web page: https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03
Result: I get the JavaScript and CSS
Desired output: i need the main html
i used this code
import requests
from bs4 import BeautifulSoup

# Results page for one race/dog. NOTE: everything after '#' is a URL fragment,
# which is never sent to the server; the visible results are rendered
# client-side, so requests only ever receives the page shell.
url = 'https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

# BUG FIX: the original pasted the url/headers/get sequence twice, with the
# two copies fused on one line (`page = requests.get(url,headers=headers)url
# = ...`) — a SyntaxError. Keep a single copy of each statement.
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
I’m afraid you won’t be able to get it directly using BeautifulSoup because the page loads then a javascript loads data.
It’s one of the component’s limitations, you may need to use selenium.
please check the answers on this question
I think what you looking for is this:
# Fragment from the answer: `url` is the page address defined in the
# question's code. Using page.text (decoded str) rather than page.content
# (raw bytes) for the parser input.
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
It will contain the text from the page including html tags

Scraping store locations from a complex website

I am new to web scraping and I need to scrape store locations from the given website. The information I need includes location title, address, city, state, country, phone. So far I have extracted the webpage but I don't know how to go forward
url = 'https://www.rebounderz.com/all-locations/'
# BUG FIX: the User-Agent literal was split across physical lines with no
# continuation, which is a SyntaxError; use implicit string concatenation to
# keep the line readable while producing one string.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 '
                         'Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Please guide me on how I can get the required information. I have searched other answers and looked into tutorials too, but the structure of this website has confused me.
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

url = "https://www.rebounderz.com/all-locations/"

# Skip certificate verification for this request.
context = ssl._create_unverified_context()

headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17'

request = urllib.request.Request(url, headers=headers)
html = urlopen(request, context=context)
soup = BeautifulSoup(html, 'lxml')

# Each location card is a div with class "size1of3": the <h5> carries the
# location title, the first <p> the address details.
for location in soup.find_all('div', {"class": "size1of3"}):
    print(location.find("h5").get_text())
    print(location.find("p").get_text())

Resources