from selenium import webdriver
from bs4 import BeautifulSoup
import time
################import the chrome web driver and define the location###############
driver = webdriver.Chrome(executable_path='C:/Users/../Downloads/cd79/chromedriver.exe')
###################################################################################
###########open the web page and print the title##############
page = driver.get("https://kjustin765.wixsite.com/website")
print(driver.title)
driver.maximize_window()
time.sleep(5)
while True:
    soup = BeautifulSoup(page.content, 'html.parser')
    button1 = soup.find('span', class_='pWNha').text
    if 'Yes' in button1:
        driver.refresh()
    else:
        button1.click()
Why is the page being returned as None?
Here is the error:
soup = BeautifulSoup(page.content, 'html.parser')
AttributeError: 'NoneType' object has no attribute 'content'
To get the correct data, use driver.page_source instead of page.content (driver.get() returns None, so page has no attributes to read):
soup = BeautifulSoup(driver.page_source, 'html.parser')
The .content attribute comes from the requests library, if you use that to request the page. For example:
import requests
page = requests.get(my_url).content
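Putting it together, a minimal corrected sketch of the original loop (the Selenium click locator and the break are assumptions added here, since a BeautifulSoup node cannot be clicked):
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome(executable_path='C:/Users/../Downloads/cd79/chromedriver.exe')
driver.get("https://kjustin765.wixsite.com/website")
driver.maximize_window()
time.sleep(5)

while True:
    # Parse the rendered HTML; driver.get() itself returns None
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    button1 = soup.find('span', class_='pWNha')
    if button1 is None or 'Yes' in button1.text:
        driver.refresh()
        time.sleep(5)
    else:
        # A BeautifulSoup tag can't be clicked; click through Selenium instead
        driver.find_element_by_class_name('pWNha').click()
        break  # assumed: stop once the button has been clicked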
I have this element: <span class="text-robux-lg wait-for-i18n-format-render">515</span>
and I want to extract 515. I can't get requests to work, and I can use Selenium, but that is very slow, since I want the value checked every 0.1 seconds. I tried requests but couldn't find anything on how to do it.
Does anyone know how to do it? Maybe BeautifulSoup or other requests modules?
I tried this:
import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.roblox.com/catalog/20573078/Shaggy')
soup = BeautifulSoup(response.text, 'html.parser')
mydivs = soup.find_all("a", {"class": "text-robux-lg wait-for-i18n-format-render"})
print(mydivs)
but it doesn't work; it just prints [].
It doesn't work because the tag is <span>, not <a>:
import requests
from bs4 import BeautifulSoup
url = 'https://www.roblox.com/catalog/20573078/Shaggy'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
t = soup.find("span", {"class": "text-robux-lg wait-for-i18n-format-render"}).text
print(t)
Prints:
515
Alternatively, you can use a CSS selector:
print(soup.select_one('[data-expected-price]')['data-expected-price'])
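If you need to poll the value roughly every 0.1 seconds, here is a minimal requests-based sketch (the polling loop and interval are assumptions added here, not part of the answer above):
import time
import requests
from bs4 import BeautifulSoup

url = 'https://www.roblox.com/catalog/20573078/Shaggy'
while True:
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    tag = soup.find('span', {'class': 'text-robux-lg wait-for-i18n-format-render'})
    if tag:
        print(tag.text)
    # Note: each HTTP round trip will likely take longer than 0.1 s anyway
    time.sleep(0.1)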
How do I get the PDFs found in the iframe of this URL?
(1) The following code throws an error.
import requests, re
from bs4 import BeautifulSoup
url = r'https://www.d88a.org/domain/102'
headers = {'User-Agent': 'C19SchoolsWebscrape'}
s = requests.Session()
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
iframe_src = soup.select_one("swGoogleDrive").attrs["src"]
r = s.get(f"https:{iframe_src}")
print(r)
error: 'NoneType' object has no attribute 'attrs'
(2) This also throws an error.
response = requests.get(url, headers=headers)
t = re.search(b'(?<=artist":")(.*?)(?=")', response.content).group(0).decode("utf-8")
print(t)
error: 'NoneType' object has no attribute 'group'
Earlier threads I've referenced:
Python BeautifulSoup - Scrape Web Content Inside Iframes,
extract iFrame content using BeautifulSoup
To get all links to the PDFs, you can use this example:
import requests
from bs4 import BeautifulSoup
url = 'https://www.d88a.org/domain/102'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
soup = BeautifulSoup(requests.get(soup.iframe['src']).content, 'html.parser')
for a in soup.select('a'):
    print(a['href'])
Prints:
https://drive.google.com/file/d/1bCXyoE7FWWI9RIcDWosHrohYQY7Ryb13/view?usp=drive_web
https://drive.google.com/file/d/1SlR-71M-jCMF-AO4ChdSbywolIF9yL1h/view?usp=drive_web
https://drive.google.com/file/d/1zbrt5Mnt0fZxjeD7DRYvfP6cskYKig27/view?usp=drive_web
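If you also want to save the files, the Drive view links can usually be rewritten into direct-download URLs; a hedged sketch (the uc?export=download endpoint is a common Google Drive convention, not something confirmed by this page):
import requests
from bs4 import BeautifulSoup

url = 'https://www.d88a.org/domain/102'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
soup = BeautifulSoup(requests.get(soup.iframe['src']).content, 'html.parser')
for a in soup.select('a'):
    # .../file/d/<ID>/view?... -> extract the file ID
    file_id = a['href'].split('/d/')[1].split('/')[0]
    r = requests.get(f'https://drive.google.com/uc?export=download&id={file_id}')
    with open(f'{file_id}.pdf', 'wb') as f:
        f.write(r.content)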
I wrote this code to scrape score details from livescore.com, but I have some problems. Maybe my code is incorrect. Please help me.
Code run output:
Traceback (most recent call last):
File "web.py", line 15, in <module>
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
AttributeError: 'NoneType' object has no attribute 'find_all'
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'class':'container'})
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
for data in box2:
    test = data.find('div', {'class': 'sco'}).text.replace('\n', '')
    print(test)
Try this:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
box = soup.find('div',{'class':'container'})
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
for data in box2:
    test = data.find('div', {'class': 'sco'}).text.replace('\n', '')
    print(test)
Use the following CSS selector. Note that container is not a class attribute value; it is the value of the data-type='container' attribute.
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
soup = BeautifulSoup(res, 'lxml')
for item in soup.select("div[data-type='container'] .match-row.scorelink>.sco"):
    test = item.text.replace('\n', '')
    print(test)
Give this a go. I have skipped 'box2' as it's not really needed for getting the scores. Also, judging by the data I fetched, .replace('\n', '') is not needed either, but feel free to use it if you think you will get a score containing a "\n" character.
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'data-type':'container'})
scores=box.find_all('div',{'class': 'sco'})
for score in scores:
    print(score.text)
Thanks for the answers. Problem solved:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'data-type':'container'})
box2 = box.find_all('a',{'class' : 'match-row'})
for data in box2:
    test1 = data.find('div', {'class': 'sco'}).text.replace('\n', '')
    test2 = data.find('div', {'class': 'ply tright name'}).text.replace('\n', '')
    test3 = data.find('div', {'class': 'ply name'}).text.replace('\n', '')
    print(test2, test1, test3)
I am just starting out with web scraping, and I am having trouble with Beautiful Soup. I have tried changing the div class to other classes as well, but it always returns []. Here is my code.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome(executable_path="C:/Users/MuhIsmail/Downloads/cd79/chromedriver.exe")
url = "https://www.cricbuzz.com/cricket-match/live-scores"
driver.get(url)
driver.maximize_window()
time.sleep(4)
content = driver.page_source
soup = BeautifulSoup(content, "html.parser")
scores = soup.find_all('div', class_='col-xs-9 col-lg-9 dis-inline')
print(scores)
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.cricbuzz.com/cricket-match/live-scores")
soup = BeautifulSoup(r.text, 'html.parser')
for item in soup.select("a.cb-mat-mnu-itm:nth-child(5)"):
    print(item.text)
Output:
MLR vs SYS - SYS Won
It is returning [] because there are no elements on the page with that class.
If you open your browser console and do a simple
document.getElementsByClassName('col-xs-9 col-lg-9 dis-inline')
it will return no results.
I tried this as well:
import requests
from bs4 import BeautifulSoup
url = "https://www.cricbuzz.com/cricket-match/live-scores"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
scores = soup.find_all('div', {'class':'col-xs-9 col-lg-9 dis-inline'})
print(scores)
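As a quick diagnostic, you can dump every class name that actually appears in the HTML requests receives, then search that list for a usable hook (a small sketch, assuming the same fetch as above):
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.cricbuzz.com/cricket-match/live-scores")
soup = BeautifulSoup(r.text, 'html.parser')
# Collect the distinct class names present in the fetched (pre-JavaScript) HTML
classes = {c for tag in soup.find_all(class_=True) for c in tag['class']}
print(sorted(classes))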
I am trying to get the href of the anchor tag of the very first video search result on YouTube using Beautiful Soup. I am searching by the "a" tag and class_="yt-simple-endpoint style-scope ytd-video-renderer".
But I am getting None as output:
from bs4 import BeautifulSoup
import requests
source = requests.get("https://www.youtube.com/results?search_query=MP+election+results+2018%3A+BJP+minister+blames+conspiracy+as+reason+while+losing").text
soup = BeautifulSoup(source,'lxml')
# print(soup.prettify())
a = soup.findAll("a", class_="yt-simple-endpoint style-scope ytd-video-renderer")
a_fin = soup.find("a", class_="compact-media-item-image")
print(a)
from bs4 import BeautifulSoup
import requests
source = requests.get("https://www.youtube.com/results?search_query=MP+election+results+2018%3A+BJP+minister+blames+conspiracy+as+reason+while+losing").text
soup = BeautifulSoup(source,'lxml')
first_search_result_link = soup.findAll('a', attrs={'class': 'yt-uix-tile-link'})[0]['href']
Heavily inspired by this answer.
Another option is to render the page first with Selenium.
import bs4
from selenium import webdriver
url = 'https://www.youtube.com/results?search_query=MP+election+results+2018%3A+BJP+minister+blames+conspiracy+as+reason+while+losing'
browser = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
browser.get(url)
source = browser.page_source
soup = bs4.BeautifulSoup(source,'html.parser')
hrefs = soup.find_all("a", class_="yt-simple-endpoint style-scope ytd-video-renderer")
for a in hrefs:
    print(a['href'])
Output:
/watch?v=Jor09n2IF44
/watch?v=ym14AyqJDTg
/watch?v=g-2V1XJL0kg
/watch?v=eeVYaDLC5ik
/watch?v=StI92Bic3UI
/watch?v=2W_4LIAhbdQ
/watch?v=PH1WZPT5IKw
/watch?v=Au2EH3GsM7k
/watch?v=q-j1HEnDn7w
/watch?v=Usjg7IuUhvU
/watch?v=YizmwHibomQ
/watch?v=i2q6Fm0E3VE
/watch?v=OXNAMyEvcH4
/watch?v=vdcBtAeZsCk
/watch?v=E4v2StDdYqs
/watch?v=x7kCuRB0f7E
/watch?v=KERtHNoZrF0
/watch?v=TenbA4wWIJA
/watch?v=Ey9HfjUyUvY
/watch?v=hqsuOT0URJU
It's dynamic HTML, so you can use Selenium; or, to get the static HTML, use the Googlebot user-agent:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
source = requests.get("https://.......", headers=headers).text
soup = BeautifulSoup(source, 'lxml')
links = soup.findAll("a", class_="yt-uix-tile-link")
for link in links:
    print(link['href'])
Try looping over the matches:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html_data = urlopen("some_url").read()
soup = BeautifulSoup(html_data, 'html.parser')
for a in soup.findAll('a', href=True):
    print(a['href'])
The class you're searching for does not exist in the scraped HTML. You can verify this by printing the soup variable.
For example:
a = soup.findAll("a", class_="sign-in-link")
gives this output:
[<a class="sign-in-link" href="https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26app%3Ddesktop%26feature%3Dplaylist%26hl%3Den%26next%3D%252Fresults%253Fsearch_query%253DMP%252Belection%252Bresults%252B2018%25253A%252BBJP%252Bminister%252Bblames%252Bconspiracy%252Bas%252Breason%252Bwhile%252Blosing&uilel=3&hl=en&service=youtube">Sign in</a>]