Scraping store locations from a complex website - python-3.x

I am new to web scraping and I need to scrape store locations from the given website. The information I need includes the location title, address, city, state, country, and phone number. So far I have fetched the webpage, but I don't know how to proceed.
# Fetch the locations page with a browser-like User-Agent and parse it.
import requests
from bs4 import BeautifulSoup

url = 'https://www.rebounderz.com/all-locations/'
# The User-Agent must be a single string value. Adjacent literals inside the
# parentheses are concatenated, so the long value can span several lines
# without leaving an unterminated string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 '
        'Safari/537.36'
    )
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Please guide me on how I can get the required information. I have searched other answers and looked at tutorials too, but the structure of this website has confused me.

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
# Download the "all locations" page and print the <h5> text and the first
# <p> text of every <div class="size1of3"> block.
url = "https://www.rebounderz.com/all-locations/"
# NOTE(review): ssl._create_unverified_context() is a private helper that
# turns off TLS certificate verification. It is kept so the request behaves
# as before; prefer ssl.create_default_context() if the site's certificate
# validates on your machine.
context = ssl._create_unverified_context()
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17',
}
request = urllib.request.Request(url, headers=headers)
# Close the HTTP response as soon as the document has been parsed.
with urlopen(request, context=context) as html:
    soup = BeautifulSoup(html, 'lxml')

# Presumably each location lives in one of these divs - confirm against the
# live markup.
divs = soup.find_all('div', {"class": "size1of3"})
for div in divs:
    heading = div.find("h5")
    details = div.find("p")
    # find() returns None when the tag is absent; skip such blocks instead of
    # failing with AttributeError on None.
    if heading is None or details is None:
        continue
    print(heading.get_text())
    print(details.get_text())

Related

WebScraping / Identical sites not working?

I would like to scrape the header element from both of these links.
To me these two sites look absolutely identical (see the pictures below).
Why does the scraping work only for the second link and not for the first?
import time
import requests
from bs4 import BeautifulSoup
def _find_header(link):
    """Request *link*, wait one second, and return the first <header> tag.

    Returns None when the parsed document contains no <header> element.
    """
    page = requests.get(link)
    time.sleep(1)
    return BeautifulSoup(page.content, "html.parser").find("header")


# not working
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
erg = _find_header(link)
print(f"First Link: {erg}")

# working
link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
erg = _find_header(link)
print(f"Second Link: {len(erg)}")
Working:
Not Working:
The page is sometimes loaded by JavaScript, which requests does not execute.
You can use a while loop to check whether the header appears in the soup and break once it does.
import requests
from bs4 import BeautifulSoup
# Re-request the page until its <header> element shows up in the HTML, with a
# bounded number of attempts and a pause between them.
import time

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

# Retry limits (example values): without them the loop would hammer the
# server forever if the header never appears.
max_attempts = 10
retry_delay = 1  # seconds between attempts

header = None
for attempt in range(1, max_attempts + 1):
    # Send the browser-like headers defined above with every request.
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    header = soup.find("header")
    if header is not None:
        break
    if attempt < max_attempts:
        time.sleep(retry_delay)

# Prints None when every attempt came back without a <header>.
print(header)
Try this to get whatever fields you wish to grab from those links. Currently it fetches the title. You can modify res.json()['data'][0]['attributes']['name'] to grab any field of interest. Make sure to put the URLs within urls_to_scrape.
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
# Look up each App Store URL through the amp-api JSON endpoint and print the
# app's name. The bearer token the API needs is read from the config <meta>
# tag of a regular app page.

# A list (not a set) so the titles are printed in the order given here.
urls_to_scrape = [
    'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
    'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4',
]
# Page fetched once, only to obtain the API token.
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
# API endpoint; the placeholder takes the numeric app id.
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'
params = {
    'platform': 'web',
    'additionalPlatforms': 'appletv,ipad,iphone,mac',
    'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
    'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
    'l': 'en-us',
    'limit[merchandised-in-apps]': '20',
    'omit[resource]': 'autos',
    'sparseLimit[apps:related-editorial-items]': '5',
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    res = s.get(base_url)
    soup = BeautifulSoup(res.text, "lxml")

    config_tag = soup.select_one("[name='web-experience-app/config/environment']")
    if config_tag is None:
        # select_one() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on None.
        raise RuntimeError("config meta tag not found on " + base_url)
    # The tag's content attribute is URL-quoted JSON.
    token_raw = config_tag.get("content")
    token = json.loads(unquote(token_raw))['MEDIA_API']['token']

    s.headers['Accept'] = 'application/json'
    s.headers['Referer'] = 'https://apps.apple.com/'
    s.headers['Authorization'] = f'Bearer {token}'

    for url in urls_to_scrape:
        # Last path segment without the query string, e.g. "id1179108009".
        last_segment = url.split("/")[-1].split("?")[0]
        # Drop the literal "id" prefix. str.strip("id") would instead strip
        # any run of the characters 'i' and 'd' from both ends.
        id_ = last_segment[2:] if last_segment.startswith("id") else last_segment
        res = s.get(link.format(id_), params=params)
        title = res.json()['data'][0]['attributes']['name']
        print(title)

parse text from span class using beautiful soup

I'm using Beautiful Soup to try to scrape the Morningstar rating from Yahoo Finance. I have the span class below getting returned. Can anyone suggest how to return the "★★★★★" as a string from the soup below?
url:
https://finance.yahoo.com/quote/RFKTX?p=RFKTX
input:
<td class="Ta(end) Fw(600) Lh(14px)" data-test="MORNING_STAR_RATING-value"><span class="D(ib)">★★★★★</span><span class="D(ib) C($c-fuji-grey-d)"></span></td>
output:
★★★★★
This should do it:
import requests
from bs4 import BeautifulSoup
# Fetch the quote page and print the text of the first <span> directly inside
# the cell marked data-test="MORNING_STAR_RATING-value".
link = 'https://finance.yahoo.com/quote/RFKTX?p=RFKTX'
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    res = s.get(link)

soup = BeautifulSoup(res.text, "html.parser")
rating_tag = soup.select_one("td[data-test='MORNING_STAR_RATING-value'] > span")
# select_one() returns None when the cell is missing from the response;
# print None in that case instead of raising AttributeError.
rating = rating_tag.text if rating_tag is not None else None
print(rating)

How to get a link with web scraping

I would like to create a web scraping with some Python library (Beautiful Soup, for example) to collect the YouTube links on this page:
https://www.last.fm/tag/rock/tracks
Basically, I want to download the title of the song, the name of the artist and the link to Youtube. Can anyone help me with some code?
Here's how you can do it:
from bs4 import BeautifulSoup
import requests
# Collect the absolute URL of the first link inside every element with the
# class "chartlist-name" on the tag page, then print the list.
from urllib.parse import urljoin

url = 'https://www.last.fm/tag/rock/tracks'
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3"
}

# One request is enough; the page was previously downloaded twice.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

links = []
for name_cell in soup.find_all(class_='chartlist-name'):
    anchor = name_cell.find('a')
    # Skip cells without a usable link instead of failing on None / KeyError.
    if anchor is None or not anchor.get('href'):
        continue
    # urljoin resolves the href against the site root, so an href that starts
    # with "/" does not produce "https://www.last.fm//...".
    links.append(urljoin('https://www.last.fm/', anchor['href']))
print(links)
With the function soup.find_all you find all the tags with the class "chartlist-name".
The for loop strips away the HTML tags and appends the links to the "links" list.
In the future, provide some code to show what you have attempted.
I have expanded on Fabix's answer. The following code gets the YouTube link, song name, and artist for all 20 pages on the source website.
from bs4 import BeautifulSoup
import requests
# Walk the paginated tag listing and print "song, artist, link" for every
# chart row on each page.
master_url = 'https://www.last.fm/tag/rock/tracks?page={}'
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3"
}

# The listing is described as having 20 pages. range() excludes its stop
# value, so the stop must be 21 for page 20 to be fetched.
for i in range(1, 21):
    response = requests.get(master_url.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    for chart_item in soup.find_all(class_='chartlist-row'):
        # First <a> in the row; the answer treats its href as the YouTube
        # link - confirm against the live markup.
        youtube_link = chart_item.find('a')['href']
        artist = chart_item.find('td', {'class': 'chartlist-artist'}).find('a').text
        song_name = chart_item.find('td', {'class': 'chartlist-name'}).find('a').text
        print('{}, {}, {}'.format(song_name, artist, youtube_link))

BeautifulSoup find() is returning none

import requests
from bs4 import BeautifulSoup
# Request an Amazon product page with a desktop browser User-Agent and print
# the elements whose ids are "productTitle" and "priceblock_ourprice".
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}
URL = 'https://www.amazon.de/BenQ-GL2580H-Monitor-Eye-Care-Reaktionszeit/dp/B073NTJHYY/ref=sr_1_3?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&dchild=1&keywords=bildschirm&qid=1597391122&sr=8-3'

page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

# Look up both elements by id, in the same order as before; each is None when
# the id is not present in the parsed document.
title, price = [
    soup.find(id=element_id)
    for element_id in ("productTitle", "priceblock_ourprice")
]
print("Titel:", title, "\n", "Preis:", price)
Output is always:
Titel: None
Preis: None
I already checked the preceding steps, and everything works fine until it reaches the find function.
I have never asked a question before, so forgive me if I made mistakes.
Thanks for the help.
You have to use a different parser.
Try making the following change:
soup = BeautifulSoup(page.content, 'html.parser')
to
soup = BeautifulSoup(page.content, 'lxml')

how to click the button with beautifulsoup

I want to click the button before scraping the data.
My code:
import requests
from bs4 import BeautifulSoup
# Download the tracking page with a mobile browser User-Agent and print the
# parsed document.
url = "https://berdu.id/cek-resi?courier=sicepat&code=000361453759"
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; nl-nl; SAMSUNG GT-I9505 Build/JSS15J) AppleWebKit/537.36 (KHTML, like Gecko) Version/1.5 Chrome/28.0.1500.94 Mobile Safari/537.36',
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
print(soup)
How do I do this correctly?

Resources