Web-scraping Yell.com in Python - python-3.x

After reading a lot, I tried to take my first steps in web scraping on the Yell website with urllib and requests, but I get the same result in both cases (404 Not Found).
The url is:
url = https://www.yell.com/
What I have tried:
urllib package
import urllib.request
f = urllib.request.urlopen(url)
print(f.read(100))
and
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
opener.open(url)
requests package
url = 'www.yell.com'
response = requests.get(url)
and
headers = {'Accept': 'text/html'}
response = requests.get(url, headers=headers)
But I still get the 404 error.

Try this using urllib
import urllib.request
url = 'https://www.yell.com/'
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read())
I would suggest using requests + beautifulsoup4:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
it will make your scraping life easier

# You can also use selenium to avoid HTTP errors.
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request

main_url = 'https://www.yell.com/'

# Option 1: let a real browser load the page and hand the HTML to BeautifulSoup.
driver = webdriver.Chrome(r'write chromedriver path')
try:
    driver.get(main_url)
    res = driver.execute_script("return document.documentElement.outerHTML")
finally:
    # Always shut the browser down, even if loading the page fails;
    # otherwise the chromedriver/Chrome processes are left running.
    driver.quit()
soup = BeautifulSoup(res, 'html.parser')

# Option 2: plain urllib request with a browser-like User-Agent header.
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
request = urllib.request.Request(main_url, headers=headers)
# The context manager closes the HTTP response once it has been read.
with urllib.request.urlopen(request) as response:
    print(response.read())

Related

requests_html does not render javascript webpages

Why doesn't my code render the JavaScript pages when using requests_html and BeautifulSoup?
from requests_html import HTMLSession
from bs4 import BeautifulSoup
session = HTMLSession()
def track(num):
    """Fetch the tracking page for *num* and return its checkpoints.

    Returns a list of ``{'Date': ..., 'Description': ...}`` dicts, one per
    ``info-checkpoint`` element in the response. The list is empty when the
    status element is missing or its text reads ``Not Found``.
    """
    url = f'https://www.trackingmore.com/track/en/{num}'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0'}
    r = session.post(url, headers=headers)
    # NOTE: JavaScript rendering is left disabled, exactly as in the question.
    # r.html.render(timeout=20)
    soup = BeautifulSoup(r.content, 'lxml')
    st = soup.find('div', class_="track-status uk-flex")
    if st is None:
        # No status element in the response: nothing to report.
        return []
    print(st.text)
    # Compare the element's text; comparing the Tag object itself to a string
    # is never equal, so the "Not Found" case was never detected.
    if st.text.strip() == 'Not Found':
        return []
    res = []
    for checkpoint in soup.find_all('div', class_="info-checkpoint"):
        date = checkpoint.find('div', class_='info-date').text.strip()
        desc = checkpoint.find('div', class_='info-desc').text.strip()
        res.append({
            # Drop non-breaking spaces; keep only the first 19 characters of the date.
            'Date': date.replace('\xa0', '')[:19],
            'Description': desc.replace('\xa0', ''),
        })
    return res
The output looks like this; I can't get the values that the JavaScript templates are supposed to fill in:
[{'Date': '{{info.Date}} {{inf', 'Description': '{{info.StatusDescription}}'}, {'Date': '{{info.Date}} {{inf', 'Description': '{{info.StatusDescription}}'}]

WebScraping / Identical sites not working?

I would like to scrape the header element from both of these links.
To me these two sites look absolutely identical (see the pictures below).
Why does the scraping work only for the second link and not for the first?
import time
import requests
from bs4 import BeautifulSoup
# not working
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
page = requests.get (link)
time.sleep (1)
soup = BeautifulSoup (page.content, "html.parser")
erg = soup.find("header")
print(f"First Link: {erg}")
# working
link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
page = requests.get (link)
time.sleep (1)
soup = BeautifulSoup (page.content, "html.parser")
erg = soup.find("header")
print(f"Second Link: {len(erg)}")
Working:
Not Working:
The page is sometimes rendered by JavaScript, which requests does not execute, so the header may be missing from the response.
You can retry in a loop until the header appears in the soup, then break:
import time

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

# Upper bound on retries so the script cannot spin forever if the
# <header> element never shows up in the response.
MAX_ATTEMPTS = 10

header = None
for _ in range(MAX_ATTEMPTS):
    # Pass the headers defined above (they were previously built but unused).
    response = requests.get(link, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    header = soup.find("header")
    if header:
        break
    # Short pause between attempts to avoid hammering the server.
    time.sleep(1)

# Prints None if the header was not found within MAX_ATTEMPTS tries.
print(header)
Try this to get whatever fields you wish to grab from those links. Currently it fetches the title. You can modify res.json()['data'][0]['attributes']['name'] to grab any field of your interest. Make sure to put the URLs in the urls_to_scrape collection.
import json
import re

import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

# App Store pages to look up; a list keeps the output order deterministic.
urls_to_scrape = [
    'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
    'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4',
]

# Any App Store page works here; it is only fetched to read the API token
# embedded in its <meta> configuration tag.
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'

# Matches the numeric app id in ".../id1179108009?uo=4".
APP_ID_RE = re.compile(r'/id(\d+)')

params = {
    'platform': 'web',
    'additionalPlatforms': 'appletv,ipad,iphone,mac',
    'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
    'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
    'l': 'en-us',
    'limit[merchandised-in-apps]': '20',
    'omit[resource]': 'autos',
    'sparseLimit[apps:related-editorial-items]': '5'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'

    # Step 1: read the bearer token from the page's environment <meta> tag.
    res = s.get(base_url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "lxml")
    token_raw = soup.select_one("[name='web-experience-app/config/environment']").get("content")
    token = json.loads(unquote(token_raw))['MEDIA_API']['token']

    # Step 2: call the JSON API with that token for every app URL.
    s.headers['Accept'] = 'application/json'
    s.headers['Referer'] = 'https://apps.apple.com/'
    s.headers['Authorization'] = f'Bearer {token}'

    for url in urls_to_scrape:
        # Extract the id with a regex: str.strip("id") removes any leading or
        # trailing 'i'/'d' characters, not the literal "id" prefix.
        match = APP_ID_RE.search(url)
        if match is None:
            print(f"Skipping URL without an app id: {url}")
            continue
        res = s.get(link.format(match.group(1)), params=params)
        res.raise_for_status()
        title = res.json()['data'][0]['attributes']['name']
        print(title)

Download Specific file from Website with BeautifulSoup

Following the documentation of BeautifulSoup, I am trying to download a specific file from a webpage. First trying to find the link that contains the file name:
import re
import requests
from bs4 import BeautifulSoup
url = requests.get("https://www.bancentral.gov.do/a/d/2538-mercado-cambiario")
parsed = BeautifulSoup(url.text, "html.parser")
link = parsed.find("a", text=re.compile("TASA_DOLAR_REFERENCIA_MC.xls"))
path = link.get('href')
print(f"{path}")
But with no success. Then trying to print every link on that page, I get no links:
import re
import requests
from bs4 import BeautifulSoup
url = requests.get("https://www.bancentral.gov.do/a/d/2538-mercado-cambiario")
parsed = BeautifulSoup(url.text, "html.parser")
link = parsed.find_all('a')
for links in parsed.find_all("a href"):
print(links.get('a href'))
It looks like the url of the file is dynamic, it adds a ?v=123456789 parameter to the end of the url, like the file version, that's why I need to download the file using the file name.
(Eg https://cdn.bancentral.gov.do/documents/estadisticas/mercado-cambiario/documents/TASA_DOLAR_REFERENCIA_MC.xls?v=1612902983415)
Thanks.
Actually, you are dealing with a dynamic JavaScript page whose content is loaded via an XHR request once the page loads.
Below is a direct call to the back-end API, which identifies the requested content by its page id (2538); from the response we can then extract your desired URL.
import requests
from bs4 import BeautifulSoup
def main(url, page_id="2538", filename="data.xls"):
    """Download the TASA_DOLAR_REFERENCIA_MC spreadsheet via the content API.

    Args:
        url: Back-end endpoint that returns the rendered article as JSON.
        page_id: Id of the page whose content is requested (defaults to the
            "mercado cambiario" page, 2538).
        filename: Path the downloaded file is written to.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        LookupError: If the article contains no link to the spreadsheet.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
    }
    with requests.Session() as req:
        req.headers.update(headers)
        data = {
            "id": page_id,
            "languageName": "es"
        }
        r = req.post(url, data=data)
        r.raise_for_status()
        # The article HTML is embedded as a string inside the JSON response.
        soup = BeautifulSoup(r.json()['result']['article']['content'], 'lxml')
        # Match on the file name only, because the href carries a changing
        # "?v=..." version suffix.
        anchor = soup.select_one('a[href*=TASA_DOLAR_REFERENCIA_MC]')
        if anchor is None:
            raise LookupError('No link to TASA_DOLAR_REFERENCIA_MC found in the article')
        r = req.get(anchor['href'])
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
if __name__ == "__main__":
main('https://www.bancentral.gov.do/Home/GetContentForRender')

how to extract links and handle the page as it is loading again and again using python beautifulsoup

I am trying to extract links and want to handle the page loading, but I cannot even get the links.
code:
from bs4 import BeautifulSoup
import requests
r = requests.get('http://www.indiabusinessguide.in/business-categories/agriculture/agricultural-equipment.html')
soup = BeautifulSoup(r.text,'lxml')
links = soup.find_all('a',class_='link_orange')
for link in links:
print(link['href'])
please help me to handle this loading and extraction of links.
Try using the lxml library. The links are returned in the response to a POST request sent to the URL with Requests.
import requests
import lxml
from lxml import html
contact_list = []
def scrape(url, pages):
    """POST to *url* for page numbers 1 .. pages - 1 and collect link targets.

    The ``href`` of every ``<a class="link_orange">`` element in each response
    is appended to the module-level ``contact_list``. Note that *pages* is an
    exclusive upper bound, so ``scrape(url, 10)`` requests pages 1 to 9.
    """
    # The headers are the same for every page, so build them once.
    headers = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "Cookie": "PHPSESSID=2q0tk3fi1kid0gbdfboh94ed56",
    }
    for page in range(1, pages):
        data = {
            "page": f"{page}"
        }
        r = requests.post(url, headers=headers, data=data)
        tree = html.fromstring(r.content)
        # XPath selects attributes with "@"; "#class" is not valid XPath.
        links = tree.xpath('//a[@class="link_orange"]')
        contact_list.extend(link.get('href') for link in links)
url = "http://www.indiabusinessguide.in/ajax_advertiselist.php"
scrape(url, 10)
print(contact_list)
print(len(contact_list))

Python 3 - Add custom headers to urllib.request Request

In Python 3, the following code obtains the HTML source for a webpage.
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
response = urllib.request.urlopen(url)
response.read()
How can I add the following custom header to the request when using urllib.request?
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
The request headers can be customized by first creating a request object then supplying it to urlopen.
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
req = urllib.request.Request(url, headers=hdr)
response = urllib.request.urlopen(req)
response.read()
Source: Python 3.4 Documentation
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
response = urllib.request.urlopen("url")
response.read()
Should you wish to learn about the details you can refer to the python documentation: https://docs.python.org/3/library/urllib.request.html
# urllib.request.urlopen() opens the specified URL, but it has no ``headers``
# parameter. Custom headers are attached to a urllib.request.Request object,
# which is then passed to urlopen() in place of the bare URL.
from urllib.request import Request, urlopen

url = "https://docs.python.org/3.4/howto/urllib2.html"
header = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
request = Request(url, headers=header)
response = urlopen(request)
response.read()

Resources