Why doesn't my code render JavaScript pages using requests_html and BeautifulSoup?
from requests_html import HTMLSession
from bs4 import BeautifulSoup

session = HTMLSession()

def track(num):
    url = f'https://www.trackingmore.com/track/en/{num}'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0'}
    r = session.post(url, headers=headers)
    # r.html.render(timeout=20)
    res = []
    soup = BeautifulSoup(r.content, 'lxml')
    st = soup.find('div', class_="track-status uk-flex")
    print(st.text)
    if st != 'Not Found':
        checkpoint = soup.find_all('div', class_="info-checkpoint")
        for i in checkpoint:
            date = i.find('div', class_='info-date').text.strip()
            desc = i.find('div', class_='info-desc').text.strip()
            res.append({
                'Date': date.replace('\xa0', '')[:19],
                'Description': desc.replace('\xa0', '')
            })
        return res
    else:
        return res
The output looks like this; I can't get the values that the JavaScript templates are supposed to fill in:
[{'Date': '{{info.Date}} {{inf', 'Description': '{{info.StatusDescription}}'}, {'Date': '{{info.Date}} {{inf', 'Description': '{{info.StatusDescription}}'}]
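Those {{info.Date}} placeholders are client-side templates, so the values only exist after the page's JavaScript has run. A minimal sketch of rendering first and then parsing the rendered markup (assuming Chromium can be downloaded by requests_html's pyppeteer backend; the tracking number below is hypothetical):
from requests_html import HTMLSession
from bs4 import BeautifulSoup

session = HTMLSession()
r = session.get('https://www.trackingmore.com/track/en/1234567890')  # hypothetical tracking number
r.html.render(timeout=20)                   # executes the page's JavaScript in headless Chromium
soup = BeautifulSoup(r.html.html, 'lxml')   # parse the rendered markup, not r.content
status = soup.find('div', class_='track-status uk-flex')
print(status.text if status else 'Not Found')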
I would like to scrape the header element from both of these links. To me the two pages look absolutely identical (screenshots of both pages omitted here). Why does the scraping work only for the second link and not for the first?
import time
import requests
from bs4 import BeautifulSoup

# not working
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, "html.parser")
erg = soup.find("header")
print(f"First Link: {erg}")

# working
link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, "html.parser")
erg = soup.find("header")
print(f"Second Link: {len(erg)}")
The page is sometimes served as a JavaScript-rendered shell, which requests cannot execute. You can use a while loop to re-request until the header appears in the soup, then break:
import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}

link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

while True:
    # Re-request until the server returns a response that already contains the <header>
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    header = soup.find("header")
    if header:
        break

print(header)
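If the server never returns a pre-rendered page, the loop above will never terminate. A bounded-retry variant of the same idea (the cap of ten attempts is an arbitrary choice):
import requests
from bs4 import BeautifulSoup

link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

header = None
for attempt in range(10):  # arbitrary cap instead of looping forever
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    header = soup.find("header")
    if header:
        break

print(header if header else "header never appeared")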
Try this to get whatever fields you wish to grab from those links. Currently it fetches the title. You can modify res.json()['data'][0]['attributes']['name'] to grab any field of your interest. Make sure to put the URLs in the urls_to_scrape collection.
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
urls_to_scrape = {
    'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
    'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4'
}
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'
params = {
    'platform': 'web',
    'additionalPlatforms': 'appletv,ipad,iphone,mac',
    'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
    'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
    'l': 'en-us',
    'limit[merchandised-in-apps]': '20',
    'omit[resource]': 'autos',
    'sparseLimit[apps:related-editorial-items]': '5'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    # Fetch any app page once to pull the bearer token the site embeds in a meta tag
    res = s.get(base_url)
    soup = BeautifulSoup(res.text, "lxml")
    token_raw = soup.select_one("[name='web-experience-app/config/environment']").get("content")
    token = json.loads(unquote(token_raw))['MEDIA_API']['token']
    s.headers['Accept'] = 'application/json'
    s.headers['Referer'] = 'https://apps.apple.com/'
    s.headers['Authorization'] = f'Bearer {token}'
    # Hit the JSON API directly for each app id instead of scraping the rendered page
    for url in urls_to_scrape:
        id_ = url.split("/")[-1].strip("id").split("?")[0]
        res = s.get(link.format(id_), params=params)
        title = res.json()['data'][0]['attributes']['name']
        print(title)
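If you want more than the title, a small hypothetical helper like the one below can pull several attributes at once. Field names other than 'name' are assumptions based on the extend parameter above, so inspect res.json() first to confirm they exist:
def extract_fields(payload, fields=('name', 'description', 'websiteUrl')):
    # 'payload' is the parsed JSON from res.json() in the loop above.
    # Field names besides 'name' are assumptions; check the actual response.
    attrs = payload['data'][0]['attributes']
    return {field: attrs.get(field) for field in fields}

# Hypothetical usage inside the for loop above:
#     print(extract_fields(res.json()))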
I'm trying to log in to a website using Python requests for automation purposes, and I'm getting the error 'str' object has no attribute 'text'.
My code is:
from BeautifulSoup import BeautifulSoup

logging.basicConfig(filename='app.log', level=logging.INFO)

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

def post_request_curl(data):
    try:
        with requests.Session() as s:
            login_data = data['login_details']
            r = s.get(url=data['url'], headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            login_data['form_build_id'] = soup.find_all('input', attrs={'name': 'form_build_id'})['value']
            r = s.post(url=data['url'], data=login_data, headers=headers)
            return r
    except Exception as e:
        logging.error('Error occurred ' + str(e))
Try:
from bs4 import BeautifulSoup
import requests
import logging

logging.basicConfig(filename='app.log', level=logging.INFO)

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

def post_request_curl(data):
    try:
        with requests.Session() as s:
            login_data = data['login_details']
            r = s.get(url=data['url'], headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            # find (not find_all) returns a single Tag that can be indexed by attribute name
            login_data['form_build_id'] = soup.find('input', attrs={'name': 'form_build_id'})['value']
            r = s.post(url=data['url'], data=login_data, headers=headers)
            return r
    except Exception as e:
        logging.error('Error occurred ' + str(e))
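A hypothetical call showing the shape of the argument the function expects; the URL and the field names inside login_details are placeholders that depend entirely on the site being automated:
# Hypothetical usage; field names are placeholders, not the real form's names.
data = {
    'url': 'https://example.com/user/login',
    'login_details': {
        'name': 'my_username',
        'pass': 'my_password',
        'form_id': 'user_login_form',
    },
}

response = post_request_curl(data)
print(response.status_code if response is not None else 'request failed')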
After reading a LOT, I have tried to take my first step in web scraping on the yell website with urllib and requests, but I get the same result in both cases (404 Not Found).
The URL is:
url = 'https://www.yell.com/'
What I have tried:
urllib package
import urllib.request
f = urllib.request.urlopen(url)
print(f.read(100))
and
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
opener.open(url)
requests package
url = 'www.yell.com'
response = requests.get(url)
and
headers = {'Accept': 'text/html'}
response = requests.get(url, headers=headers)
But I still end up with the 404 error.
Try this using urllib
import urllib.request
url = 'https://www.yell.com/'
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read())
I would suggest using requests + beautifulsoup4 (https://www.crummy.com/software/BeautifulSoup/bs4/doc/); it will make your scraping life easier.
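A minimal sketch of that combination, assuming yell.com accepts a browser-like User-Agent (it may still block automated clients):
import requests
from bs4 import BeautifulSoup

url = 'https://www.yell.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

response = requests.get(url, headers=headers)
response.raise_for_status()  # raises on a 4xx/5xx instead of silently continuing

soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.get_text(strip=True) if soup.title else 'no <title> found')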
You can also use Selenium to avoid HTTP errors:
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request

main_url = 'https://www.yell.com/'

# Let a real browser load the page, then hand the rendered HTML to BeautifulSoup
driver = webdriver.Chrome(r'write chromedriver path')
driver.get(main_url)
res = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(res, 'html.parser')

# Plain urllib with a browser-like User-Agent also works for this URL
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
request = urllib.request.Request(main_url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read())
Trying to extract links and want to handle the page's loading, but I'm not even getting the links.
Code:
from bs4 import BeautifulSoup
import requests

r = requests.get('http://www.indiabusinessguide.in/business-categories/agriculture/agricultural-equipment.html')
soup = BeautifulSoup(r.text, 'lxml')
links = soup.find_all('a', class_='link_orange')
for link in links:
    print(link['href'])
Please help me handle this loading and extract the links.
Try using the lxml library. The listing is loaded via an AJAX endpoint, so the response is received by posting a request to that URL with requests, passing the page number as form data.
import requests
from lxml import html

contact_list = []

def scrape(url, pages):
    for page in range(1, pages):
        headers = {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "Cookie": "PHPSESSID=2q0tk3fi1kid0gbdfboh94ed56",
        }
        data = {
            "page": f"{page}"
        }
        r = requests.post(url, headers=headers, data=data)
        tree = html.fromstring(r.content)
        # XPath attribute selectors use @, not #
        links = tree.xpath('//a[@class="link_orange"]')
        for link in links:
            contact_list.append(link.get('href'))

url = "http://www.indiabusinessguide.in/ajax_advertiselist.php"
scrape(url, 10)
print(contact_list)
print(len(contact_list))
In Python 3, the following code obtains the HTML source for a webpage.
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
response = urllib.request.urlopen(url)
response.read()
How can I add the following custom header to the request when using urllib.request?
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
The request headers can be customized by first creating a request object, then supplying it to urlopen:
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
req = urllib.request.Request(url, headers=hdr)
response = urllib.request.urlopen(req)
response.read()
Source: Python 3.4 Documentation
import urllib.request

url = "https://docs.python.org/3.4/howto/urllib2.html"

# Install an opener whose headers are applied to every subsequent urlopen call
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
response.read()
Should you wish to learn the details, you can refer to the Python documentation: https://docs.python.org/3/library/urllib.request.html
# urlopen does not accept a headers argument directly;
# wrap the URL and headers in a Request object and pass that to urlopen.
from urllib.request import Request, urlopen

url = "https://docs.python.org/3.4/howto/urllib2.html"
header = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }

response = urlopen(Request(url, headers=header))
response.read()