WebScraping / Identical sites not working? - web

I would like to scrape the header element from both of these links.
To me these two sites look absolutely identical (see the pictures below).
Why is only the scraping for the second link working and not for the first?
import time
import requests
from bs4 import BeautifulSoup
# Not working: for this link the <header> lookup below is reported to fail.
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
page = requests.get (link)
# The response has already been received when this pause runs, so it does not
# give the page any additional time to load.
time.sleep (1)
soup = BeautifulSoup (page.content, "html.parser")
# Look up the first <header> tag in the parsed HTML.
erg = soup.find("header")
print(f"First Link: {erg}")
# Working: the same steps are reported to succeed for this second link.
link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
page = requests.get (link)
time.sleep (1)
soup = BeautifulSoup (page.content, "html.parser")
erg = soup.find("header")
# NOTE(review): unlike the first print, this one calls len() on the result, so
# it would raise instead of printing if no <header> was found here.
print(f"Second Link: {len(erg)}")
Working:
Not Working:

The page content is sometimes loaded by JavaScript, which requests does not support.
You can use a while loop to check if header appears in the soup and then break
import time

import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

# The <header> element is not present in every response, so the page is
# requested repeatedly. The retries are bounded and spaced out so a page that
# never contains the element cannot turn this into an endless request loop.
MAX_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 1

header = None
for attempt in range(1, MAX_ATTEMPTS + 1):
    # Actually send the browser-like User-Agent defined above.
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    header = soup.find("header")
    if header is not None:
        break
    if attempt < MAX_ATTEMPTS:
        time.sleep(RETRY_DELAY_SECONDS)

# Prints None when the header never showed up within MAX_ATTEMPTS tries.
print(header)

Try this to get whatever fields you wish to grab from those links. Currently it fetches the title. You can modify res.json()['data'][0]['attributes']['name'] to grab any other field of interest. Make sure to put the URLs in urls_to_scrape.
import json
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

# A list (not a set) so the titles are printed in the order given here.
urls_to_scrape = [
    'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
    'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4',
]

# Any app page works here; it is only fetched to read the API token that is
# embedded in its HTML.
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'
params = {
    'platform': 'web',
    'additionalPlatforms': 'appletv,ipad,iphone,mac',
    'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
    'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
    'l': 'en-us',
    'limit[merchandised-in-apps]': '20',
    'omit[resource]': 'autos',
    'sparseLimit[apps:related-editorial-items]': '5',
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'

    # The page carries a URL-encoded JSON config in a <meta> tag; the bearer
    # token for the catalog API is read from its MEDIA_API.token entry.
    res = s.get(base_url)
    soup = BeautifulSoup(res.text, "lxml")
    token_raw = soup.select_one("[name='web-experience-app/config/environment']").get("content")
    token = json.loads(unquote(token_raw))['MEDIA_API']['token']

    s.headers['Accept'] = 'application/json'
    s.headers['Referer'] = 'https://apps.apple.com/'
    s.headers['Authorization'] = f'Bearer {token}'

    for url in urls_to_scrape:
        # Pull the numeric app id out of ".../id1179108009?uo=4". A regex is
        # used because str.strip("id") removes any leading/trailing "i"/"d"
        # characters rather than the literal "id" prefix.
        match = re.search(r"/id(\d+)", url)
        if match is None:
            raise ValueError(f"No app id found in URL: {url}")
        id_ = match.group(1)

        res = s.get(link.format(id_), params=params)
        title = res.json()['data'][0]['attributes']['name']
        print(title)

Related

parse text from span class using beautiful soup

I'm using beautiful soup to try to scrape the morning star rating out of yahoo finance. I have the span class below getting returned. Can anyone suggest how to return the "★★★★★" as a string from the soup below?
url:
https://finance.yahoo.com/quote/RFKTX?p=RFKTX
input:
<td class="Ta(end) Fw(600) Lh(14px)" data-test="MORNING_STAR_RATING-value"><span class="D(ib)">★★★★★</span><span class="D(ib) C($c-fuji-grey-d)"></span></td>
output:
★★★★★
This should do it:
import requests
from bs4 import BeautifulSoup

link = 'https://finance.yahoo.com/quote/RFKTX?p=RFKTX'

with requests.Session() as session:
    # Present a desktop-browser User-Agent for the request.
    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    response = session.get(link)
    soup = BeautifulSoup(response.text, "html.parser")

    # The rating is the text of the first <span> directly inside the
    # MORNING_STAR_RATING-value table cell.
    rating_span = soup.select_one("td[data-test='MORNING_STAR_RATING-value'] > span")
    rating = rating_span.text
    print(rating)

How to get a link with web scraping

I would like to create a web scraping with some Python library (Beautiful Soup, for example) to collect the YouTube links on this page:
https://www.last.fm/tag/rock/tracks
Basically, I want to download the title of the song, the name of the artist and the link to Youtube. Can anyone help me with some code?
Here's how you can do it:
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

url = 'https://www.last.fm/tag/rock/tracks'
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3"
}

# One request is enough; the page is fetched once and parsed.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

links = []
# Each element with class "chartlist-name" wraps an <a> whose href is the
# site-relative track URL.
for name_cell in soup.find_all(class_='chartlist-name'):
    anchor = name_cell.find('a')
    if anchor is None or not anchor.get('href'):
        # Skip entries without a link instead of crashing on them.
        continue
    # urljoin avoids the doubled slash that plain string concatenation
    # produces when the href already starts with "/".
    links.append(urljoin('https://www.last.fm/', anchor['href']))

print(links)
With the function soup.find_all you find all the tags with the class "chartlist-name".
The for loop is used to strip the HTML tags and to append the links to the "links" list.
In the future, provide some code to show what you have attempted.
I have expanded on Fabix's answer. The following code gets the YouTube link, song name, and artist for all 20 pages on the source website.
from bs4 import BeautifulSoup
import requests

master_url = 'https://www.last.fm/tag/rock/tracks?page={}'
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3"
}

# Number of result pages to walk through.
TOTAL_PAGES = 20

# range() excludes its end value, so go up to TOTAL_PAGES + 1 to include the
# last page (range(1, 20) would stop after page 19).
for page_number in range(1, TOTAL_PAGES + 1):
    response = requests.get(master_url.format(page_number), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    for chart_item in soup.find_all(class_='chartlist-row'):
        # First <a> in the row; presumably the play link pointing at YouTube.
        # TODO confirm against the live markup.
        youtube_link = chart_item.find('a')['href']
        artist = chart_item.find('td', {'class': 'chartlist-artist'}).find('a').text
        song_name = chart_item.find('td', {'class': 'chartlist-name'}).find('a').text
        print(f'{song_name}, {artist}, {youtube_link}')

BeautifulSoup find() is returning none

import requests
from bs4 import BeautifulSoup
# Product page whose title and price should be extracted.
URL = 'https://www.amazon.de/BenQ-GL2580H-Monitor-Eye-Care-Reaktionszeit/dp/B073NTJHYY/ref=sr_1_3?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&dchild=1&keywords=bildschirm&qid=1597391122&sr=8-3'
# Send a desktop-browser User-Agent along with the request.
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}
page = requests.get(URL, headers=headers)
# Parse the raw response body with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')
# Look up the title and price elements by their id attributes; the question
# reports that both lookups come back as None.
title = soup.find(id="productTitle")
price = soup.find(id="priceblock_ourprice")
print("Titel:",title,"\n","Preis:",price)
Output is always:
Titel: None
Preis: None
I already checked the steps before, but everything is working fine until it reaches the find function.
I have never asked a question before, so forgive me if I made mistakes.
Thanks for the help.
You have to use a different parser.
Try making the following change:
soup = BeautifulSoup(page.content, 'html.parser')
to
soup = BeautifulSoup(page.content, 'lxml')

Scraping store locations from a complex website

I am new to web scraping and I need to scrape store locations from the given website. The information I need includes location title, address, city, state, country, phone. So far I have extracted the webpage but I don't know how to go forward
import requests
from bs4 import BeautifulSoup

url = 'https://www.rebounderz.com/all-locations/'
# Adjacent string literals are concatenated by Python, which keeps the long
# User-Agent readable without breaking a single literal across lines (a plain
# quoted string cannot contain a raw line break).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 '
        'Safari/537.36'
    )
}
# Download the locations page and parse it for further extraction.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Please guide me how can I get the required information. I have searched other answers and looked into tutorials too but the structure of this website has made me confused.
import ssl
import urllib.request
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = "https://www.rebounderz.com/all-locations/"

# Use the default, certificate-verifying TLS context. The private
# ssl._create_unverified_context() helper disables verification entirely and
# should only be a last resort for a host with a known-broken certificate.
context = ssl.create_default_context()

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17'
}
request = urllib.request.Request(url, headers=headers)

# The context manager closes the HTTP response once the document is parsed.
with urlopen(request, context=context) as html:
    soup = BeautifulSoup(html, 'lxml')

# Each location lives in a <div class="size1of3">: the <h5> holds the title and
# the first <p> the address block.
for div in soup.find_all('div', {"class": "size1of3"}):
    for tag in (div.find("h5"), div.find("p")):
        # Some matching divs may lack one of the tags; print what is there
        # instead of failing on a missing element.
        if tag is not None:
            print(tag.get_text())

how to extract links and handle the page as it is loading again and again using python beautifulsoup

I am trying to extract links and want to handle the page loading again and again, but I cannot even get the links.
code:
from bs4 import BeautifulSoup
import requests

# Fetch the category page and parse it with the lxml parser.
r = requests.get('http://www.indiabusinessguide.in/business-categories/agriculture/agricultural-equipment.html')
soup = BeautifulSoup(r.text, 'lxml')

# Print the target of every anchor carrying the "link_orange" class.
for anchor in soup.find_all('a', class_='link_orange'):
    print(anchor['href'])
please help me to handle this loading and extraction of links.
Try using the lxml library. Response is received by posting a request to the url using Requests.
import requests
import lxml
from lxml import html
contact_list = []

# The request headers are the same for every page, so they are built once.
# NOTE(review): the Cookie is a hard-coded PHP session id, presumably captured
# from a browser session; it may need to be replaced with a current one.
_AJAX_HEADERS = {
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Cookie": "PHPSESSID=2q0tk3fi1kid0gbdfboh94ed56",
}


def scrape(url, pages):
    """Collect the hrefs of all "link_orange" anchors into ``contact_list``.

    Posts ``page=<n>`` to ``url`` for n = 1 .. pages - 1 (``pages`` is an
    exclusive upper bound) and appends every matching link target to the
    module-level ``contact_list``.

    Args:
        url: Endpoint that returns one page of listings per POST.
        pages: Stop value for the page counter; the last page requested is
            ``pages - 1``.
    """
    for page in range(1, pages):
        r = requests.post(url, headers=_AJAX_HEADERS, data={"page": f"{page}"})
        tree = html.fromstring(r.content)
        # XPath addresses attributes with "@"; "#class" is not valid XPath and
        # makes the query fail instead of matching the anchors.
        links = tree.xpath('//a[@class="link_orange"]')
        contact_list.extend(link.get('href') for link in links)


url = "http://www.indiabusinessguide.in/ajax_advertiselist.php"
scrape(url, 10)
print(contact_list)
print(len(contact_list))

Resources