Python3 beautifulsoup4 and selenium

Python3 beautifulsoup4 and selenium - python-3.x

I wrote this code to scrape score details from livescore.com, but it fails with the error shown below. I may have written something incorrectly; please help.
Code run output:
Traceback (most recent call last):
File "web.py", line 15, in <module>
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
AttributeError: 'NoneType' object has no attribute 'find_all'
# Asker's original script, kept as posted: it reproduces the AttributeError in the traceback above.
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(res, 'lxml')
# The traceback shows this lookup yields None: no div matched class "container".
box = soup.find('div',{'class':'container'})
# This is the failing line from the traceback: box is None, so it has no find_all.
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
for data in box2:
test = data.find('div',{'class': 'sco'}).text.replace('\n', '')
print (test)

Try This:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

# Print the score cell of every match row on the livescore.com front page.
driver = webdriver.Chrome()
try:
    driver.get('https://livescore.com')
    #page = requests.get('https://livescore.com')
    # page_source must be read while the session is still alive.
    soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # Release the browser even if loading the page fails.
    driver.quit()

# The wrapper div is identified by data-type="container"; looking it up by
# class returns None, which is what caused the original AttributeError.
box = soup.find('div', {'data-type': 'container'})
if box is None:
    raise SystemExit("Could not find the div[data-type='container'] wrapper in the page")

box2 = box.find_all('a', {'class': 'match-row scorelink'})
for data in box2:
    score_cell = data.find('div', {'class': 'sco'})
    if score_cell is None:
        # Row without a score cell: nothing to print for it.
        continue
    test = score_cell.text.replace('\n', '')
    print(test)

Use the following CSS selector. Note that `container` is not a class attribute value; it is the value of the `data-type` attribute (data-type='container').
from bs4 import BeautifulSoup
from selenium import webdriver

# Print every score on the livescore.com front page using one CSS selector.
driver = webdriver.Chrome()
try:
    driver.get('https://livescore.com')
    res = driver.execute_script("return document.documentElement.outerHTML")
finally:
    # Release the browser even if loading the page fails.
    driver.quit()

soup = BeautifulSoup(res, 'lxml')
# select() returns an empty list when nothing matches, so no None check is needed.
for item in soup.select("div[data-type='container'] .match-row.scorelink>.sco"):
    test = item.text.replace('\n', '')
    print(test)

Give this a go. I have skipped 'box2' as it's not really needed for getting the scores. Also, judging by the data I fetched, .replace('\n', '') is not needed either, but feel free to use it if you think you will get score containing "\n" character.
from bs4 import BeautifulSoup
from selenium import webdriver

# Print the text of every score div inside the data-type="container" wrapper.
driver = webdriver.Chrome()
try:
    driver.get('https://livescore.com')
    res = driver.execute_script("return document.documentElement.outerHTML")
finally:
    # Release the browser even if loading the page fails.
    driver.quit()

soup = BeautifulSoup(res, 'lxml')
box = soup.find('div', {'data-type': 'container'})
if box is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise SystemExit("Could not find the div[data-type='container'] wrapper in the page")

scores = box.find_all('div', {'class': 'sco'})
for score in scores:
    print(score.text)

Thanks for answers. Solved problem
from bs4 import BeautifulSoup
import requests
from selenium import webdriver


def _cell_text(row, css_class):
    """Return the newline-stripped text of the first matching div in row, or '' if absent."""
    cell = row.find('div', {'class': css_class})
    return cell.text.replace('\n', '') if cell is not None else ''


# Print "<home> <score> <away>" for every match row on the livescore.com front page.
driver = webdriver.Chrome()
try:
    driver.get('https://livescore.com')
    res = driver.execute_script("return document.documentElement.outerHTML")
finally:
    # Release the browser even if loading the page fails.
    driver.quit()
#page = requests.get('https://livescore.com')

soup = BeautifulSoup(res, 'lxml')
box = soup.find('div', {'data-type': 'container'})
if box is None:
    raise SystemExit("Could not find the div[data-type='container'] wrapper in the page")

box2 = box.find_all('a', {'class': 'match-row'})
for data in box2:
    test1 = _cell_text(data, 'sco')
    test2 = _cell_text(data, 'ply tright name')
    test3 = _cell_text(data, 'ply name')
    print(test2, test1, test3)

Related

How to get post links from the whole page using BeautifulSoup Selenium

I'm having trouble scraping a site with BeautifulSoup and Selenium. I want to pull data from pages 1-20, but the data is only pulled successfully up to page 10. The last page I need may well be beyond 20, yet my code only manages to pull 10 pages. Does anyone understand the problem well enough to help me pull the data without a page limit?
# Asker's original script as posted (block indentation was lost in this listing;
# the imports for webdriver, BeautifulSoup and time are not shown).
options = webdriver.ChromeOptions()
options.add_argument('-headless')
options.add_argument('-no-sandbox')
options.add_argument('-disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options=options)
apartment_urls = []
try:
# range(1,20) yields 1..19, so page 20 is never requested.
for page in range(1,20):
print(f"Extraction Page# {page}")
# Rebinds the loop variable from the page number to the full URL string.
page="https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan&hlmn=" + str(page)
driver.get(page)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
for link in apart_info_list:
get_url = '{0}{1}'.format('https://www.99.co', link['href'])
print(get_url)
apartment_urls.append(get_url)
# NOTE(review): the bare except hides any error raised above, which may be masking
# why later pages yield nothing - confirm by logging the exception.
except:
print("Good Bye!")
This is the output of the code. From page 10 onward (pages 10, 11, 12, and so on) I can't get any data.

Now, pagination is working fine without page limit.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Print every listing URL, following the "next" pagination link until it disappears.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
    time.sleep(5)
    driver.maximize_window()
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Require an href so link['href'] cannot raise KeyError.
        apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
        for link in apart_info_list:
            get_url = '{0}{1}'.format('https://www.99.co', link['href'])
            print(get_url)
        # find_element raises NoSuchElementException when nothing matches, so an
        # if/else on its result never reaches the else branch. find_elements
        # returns an empty list instead, which gives a clean stop on the last page.
        next_buttons = driver.find_elements(By.CSS_SELECTOR, 'li.next > a')
        if not next_buttons:
            break
        next_buttons[0].click()
        # Give the next page time to render before re-reading page_source.
        time.sleep(3)
finally:
    # Release the browser whether the crawl finished or failed.
    driver.quit()
If you would prefer to use: webdriverManager
Alternative solution: since the next-page URL is static (not generated dynamically), you can also read it from the page and navigate to it directly, which works fine too.
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

BASE_URL = 'https://www.99.co'

# Print every listing URL, navigating to the href of the "next" pagination link
# until no such link exists.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
    time.sleep(5)
    driver.maximize_window()
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Require an href so link['href'] cannot raise KeyError.
        apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
        for link in apart_info_list:
            get_url = '{0}{1}'.format(BASE_URL, link['href'])
            print(get_url)
        next_link = soup.select_one('li.next > a')
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            break
        # Use the link's href (not the Tag object itself) and actually load it;
        # without the navigation the loop would re-scrape the same page forever.
        # urljoin accepts both relative and absolute hrefs.
        driver.get(urljoin(BASE_URL, next_href))
        time.sleep(3)
finally:
    # Release the browser whether the crawl finished or failed.
    driver.quit()

Scraping infomation from a div tag inside of a div tag

I have been trying to web scrape from this website: https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players
I want to get specific ratings, but I get nothing when I execute this code:
# Asker's original attempt: fetch the page with requests and look up the rating div.
from bs4 import BeautifulSoup
import requests
result = requests.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
src = result.content
soup = BeautifulSoup(src, 'lxml')
# Per the reported output, this lookup returns None for the HTML that requests receives.
match = soup.find('div', class_='css-gm45eu')
print(match)
output: None
How can I scrape what is in that class?

Try to use Selenium:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Collect the text of every rating cell from the browser-rendered page.
# NOTE(review): the positional driver path is a placeholder from the answer;
# recent Selenium releases expect a Service object instead - confirm for your version.
driver = webdriver.Chrome('PATH_TO --> chromedriver.exe')
try:
    driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
    # XPath attribute tests use '@' ("@class"); '#class' is not valid XPath.
    ratings = driver.find_elements(By.XPATH, '//div[@class="css-gm45eu"]')
    # Read .text while the session is alive; elements are unusable after quit().
    ratings_list = [rating.text for rating in ratings]
finally:
    driver.quit()
print(ratings_list)
Output:
['1.189', '1.109', '1.098', '1.031', '1.028', '1.005', '0.990', '0.981', '0.967', '0.936', '0.904', '0.846', '0.841', '0.840', '0.836', '0.809', '0.759', '0.726']
Download chromedriver.exe:
https://chromedriver.chromium.org/downloads
if you don't want a chrome window to open while running, use this code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Same scrape as the visible-browser version, but without opening a Chrome window.
options = Options()
options.add_argument('--headless')
# NOTE(review): the positional driver path is a placeholder from the answer;
# recent Selenium releases expect a Service object instead - confirm for your version.
driver = webdriver.Chrome('PATH_TO --> chromedriver.exe', options=options)
try:
    driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
    # XPath attribute tests use '@' ("@class"); '#class' is not valid XPath.
    ratings = driver.find_elements(By.XPATH, '//div[@class="css-gm45eu"]')
    # Read .text while the session is alive; elements are unusable after quit().
    ratings_list = [rating.text for rating in ratings]
finally:
    driver.quit()
print(ratings_list)

How to extract name and links from a given website - python

For the website mentioned below, I am trying to get each event's name and its corresponding link, but I am not able to get any data at all.
Using BeautifulSoup
# Asker's first attempt (requests + BeautifulSoup), reported as returning no data.
from bs4 import BeautifulSoup
import requests
source = requests.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
soup = BeautifulSoup(source.text, 'html.parser')
mains = soup.find_all("div", {"class": "list-container-wrapper"})
name = []
lnks = []
# Takes only the first <a> inside each matched wrapper div.
for main in mains:
name.append(main.find("a").text)
lnks.append(main.find("a").get('href'))
Using Selenium webdriver
# Asker's second attempt (Selenium), also reported as not producing the expected data.
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"chromedriver_win32\chromedriver.exe")
driver.get("https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0")
lnks = []
name = []
# Iterates over every element with class "ng-star-inserted", whatever its tag.
for a in driver.find_elements_by_class_name('ng-star-inserted'):
link = a.get_attribute('href')
lnks.append(link)
# The selector is hard-coded to #list-item-0, so it always targets that one element regardless of `a`.
nm = driver.find_element_by_css_selector("#list-item-0 > div > h2 > a").text
name.append(nm)
I have tried with both 2 above methods.
Example:
name = ['Friday Night Flicks Drive-In at the Roadium', 'Open: Butterfly Pavilion and Nature Gardens']
lnks = ['https://mommypoppins.com/los-angeles-kids/event/in-person/friday-night-flicks-drive-in-at-the-roadium','https://mommypoppins.com/los-angeles-kids/event/in-person/open-butterfly-pavilion-and-nature-gardens']

Here's solution for webdriver:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Build a list of {event name: event URL} dicts from the browser-rendered page.
driver = webdriver.Chrome()
try:
    driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
    # Fixed pause to let the client-side rendering finish.
    time.sleep(3)
    # XPath attribute tests use '@'; '#angularticsaction' is not valid XPath.
    elements = driver.find_elements(By.XPATH, "//a[@angularticsaction='expanded-detail']")
    # Read text/href while the session is alive; elements are unusable after quit().
    attributes = [{el.text: el.get_attribute('href')} for el in elements]
finally:
    # Release the browser even if loading or querying the page fails.
    driver.quit()
print(attributes)
print(len(attributes))
Here's solution with webdriver and bs4:
import time
from selenium import webdriver
from bs4 import BeautifulSoup

# Render the events page in Chrome, then parse the resulting HTML with
# BeautifulSoup and print one {link text: href} dict per event anchor.
EVENTS_URL = 'https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0'

driver = webdriver.Chrome()
driver.get(EVENTS_URL)
time.sleep(3)  # fixed pause so the page can finish rendering
rendered_html = driver.page_source
soup = BeautifulSoup(rendered_html, 'html.parser')
mains = soup.find_all("a", {"angularticsaction": "expanded-detail"})
attributes = []
for anchor in mains:
    attributes.append({anchor.text: anchor.get('href')})
print(attributes)
print(len(attributes))
driver.quit()
Here's solution with requests:
import requests

# Query the site's JSON endpoint directly and print one
# {node_title: absolute node URL} dict per result.
url = "https://mommypoppins.com"
endpoint = f"{url}/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all"
response = requests.get(endpoint).json()
attributes = []
for r in response['results']:
    # Each result's node details are keyed by that result's own nid.
    node = r['node'][r['nid']]
    attributes.append({r.get('node_title'): f"{url}{node['node_url']}"})
print(attributes)
print(len(attributes))
cheers!

The website is loaded dynamically, so the HTML that requests receives does not contain the event data. However, the data is available in JSON format by sending a GET request to:
https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all.
There's no need for BeautifulSoup or Selenium, using merely requests would work, which will make your code much faster.
import requests

URL = "https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all"
BASE_URL = "https://mommypoppins.com"

# Fetch the events JSON and collect parallel lists of titles and absolute links.
http_response = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
http_response.raise_for_status()
response = http_response.json()

names = []
links = []
for json_data in response["results"]:
    # Each result's node details are keyed by that result's own nid.
    data = json_data["node"][json_data["nid"]]
    names.append(data["title"])
    links.append(BASE_URL + data["node_url"])

How get the text with BeautifulSoup in this html code: <span id="pass_0" class="text-success">c#</span>

I'm writing a program that cracks hashes using Selenium and BeautifulSoup with this website: https://hashkiller.co.uk/Cracker
# Asker's original script, kept as posted: it prints the "$pass" placeholder instead of the result.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
# soup is built here from a separate requests download, before the form is
# submitted in the browser; it is never refreshed from the driver afterwards.
target = requests.get("https://hashkiller.co.uk/Cracker")
soup = BeautifulSoup(target.content, 'html.parser')
driver = webdriver.Chrome(executable_path=r"D:\Download\chromedriver.exe")
#driver.set_window_position(-10000,0)
#240aa2cec4b29c56f3bee520a8dcee7e
driver.get("https://hashkiller.co.uk/Cracker")
# Note: this name shadows Python's built-in hash() function.
hash = input("Hash: ")
hash_box = driver.find_element_by_id("txtHashList").send_keys(hash)
hash_submit = driver.find_element_by_id("btnCrack").click()
time.sleep(5)
# Searches the requests-based soup, not the page the browser just updated.
hash_table = soup.find('span', {'class': 'text-success'})
a = hash_table.text
print(hash_table)
I expect the output to be c# [Image: https://imgur.com/kEegEgY ] HTML code: [html <span id="pass_0" class="text-success">c#</span>]
but it returns: html<span class="text-success">$pass</span>
Instead of $pass there should be c#.

You're actually not parsing the rendered html. You're parsing the html response from your requests.
Secondly, you want to grab the second element, as the first element is the $pass placeholder. Also, rename hash to a different variable name, since hash is a built-in function in Python:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time

# Submit a hash through the browser, then parse the *rendered* page. The page
# is deliberately not downloaded with requests: the result only appears in the
# DOM after the form is submitted in the browser.
#240aa2cec4b29c56f3bee520a8dcee7e
driver = webdriver.Chrome("C:/chromedriver.exe")
try:
    #driver.set_window_position(-10000,0)
    driver.get("https://hashkiller.co.uk/Cracker")
    hash_input = input("Hash: ")
    # find_element(By.ID, ...) replaces find_element_by_id, which newer
    # Selenium releases no longer provide.
    driver.find_element(By.ID, "txtHashList").send_keys(hash_input)
    driver.find_element(By.ID, "btnCrack").click()
    # Fixed pause to let the site fill in the result.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Release the browser even if a lookup or the page load fails.
    driver.quit()

hash_table = soup.find_all('span', {'class': 'text-success'})
print(hash_table)
# The first match is the "$pass" placeholder; the cracked value is the second.
if len(hash_table) > 1:
    a = hash_table[1].text
    print(a)
else:
    print("No cracked value was found in the rendered page")
Output:
[<span class="text-success">$pass</span>, <span class="text-success" id="pass_0">c#</span>]
c#

Can't print tag 'content' anymore

I had a perfectly well-working scraper for TripAdvisor that met all my needs. When I tried to use it again after a four-day break, something went wrong. I quickly realized that TA had changed some of the tags; I made the appropriate changes, but I still couldn't get it working as before. I want to grab the value of the 'content' attribute of an element.
This is the element:
<div class="prw_rup prw_common_bubble_rating bubble_rating" data-prwidget-init="" data-prwidget-name="common_bubble_rating"><span alt="5 of 5 bubbles" class="ui_bubble_rating bubble_50" content="5" property="ratingValue" style="font-size:18px;"></span></div>
and here is the code:
# Selects <img> descendants that have a content attribute; in the element shown
# above, the content attribute sits on a <span>, so this list comes back empty.
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
print([img["content"] for img in bubs.select("img[content]")])
but now it only gives me an empty '[]' instead of the content which is '5'.
Anybody know what may have changed?
here is the rest of my code
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Download the hotel listing page and collect every /Hotel_Review link as an absolute URL.
theurl = "https://www.tripadvisor.com/Hotels-g147364-c3-Cayman_Islands-Hotels.html"
# Redundant: immediately overwritten by the next line.
thepage = urllib
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
base_url = "https://www.tripadvisor.com"
urls = []
init_info = []
# NOTE(review): this file handle is not written to or closed anywhere in the code shown.
init_data = open('/Users/paribaker/Desktop/scrapping/TripAdvisor/Inv/speccaydata.txt', 'w')
for link in soup.findAll('a',href=re.compile('/Hotel_Review')):
# str.strip("#REVIEWS") removes any of those characters from both ends of the
# href, not the literal "#REVIEWS" suffix.
urls.append(base_url + (link.get('href')).strip("#REVIEWS"))
def remove_duplicates(urls):
    """Return the items of *urls* with duplicates removed, keeping first-seen order.

    Items must be hashable. The input is not modified; a new list is returned.
    """
    # dict keys are unique and preserve insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(urls))
# Visit each de-duplicated hotel URL in its own Chrome session and append
# heading/address/review text to init_info.
urls2 = remove_duplicates(urls)
for url in urls2:
try:
driver = webdriver.Chrome()
driver.get(url)
element = driver.find_element_by_id("taplc_prodp13n_hr_sur_review_filter_controls_0_filterLang_ALL").click()
print("succesfull")
moreinfo = driver.page_source
moresoup = BeautifulSoup(moreinfo,"html.parser")
driver.close()
#moreinfo = urllib
#moreinfo = urllib.request.urlopen(url)
#moresoup = BeautifulSoup(moreinfo,"html.parser")
# NOTE(review): if anything in the try fails, driver.close() is skipped and
# moresoup keeps its value from the previous URL (or is undefined on the first).
except:
print("none")
for data in moresoup.findAll('div', {"class":"heading_2014 hr_heading"}):
try:
for title in data.findAll('h1',{'id':"HEADING"}):
init_info.append(title.text.strip("\n")+ ",\t")
for add_data in data.findAll('span',{'class':'format_address'}):
print((add_data.find('span',{'class':'street-address'}).text +",\t"))
init_info.append(add_data.find('span',{'class':'street-address'}).text +",\t")
init_info.append(add_data.find('span',{'class':'locality'}).text + ",\t")
init_info.append(add_data.find('span',{'class':'country-name'}).text + ",\t")
for reviews in data.findAll('a',{'class':'more taLnk'}):
# list.append returns None, so the chained .strip() raises AttributeError here
# (after the item was already appended); the bare except below catches it.
init_info.append(reviews.text).strip("\n")
init_info.append(", \t")
#init_info.append([img["alt"] for img in stars.select("img[alt]")])
#init_info.append([img["content"] for img in stars.select("img[content]")])
except :
# "/t" is a literal slash followed by t, not a tab character.
init_info.append("N/A" + ", /t")

The element with the content="5" attribute is a span, not an img.
Does this get what you want?
# Print the rating value(s) from each bubble-rating widget. The content
# attribute lives on a <span>, so the selector targets span[content].
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
    print([elem["content"] for elem in bubs.select("span[content]")])

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Python3 beautifulsoup4 and selenium - python-3.x

Related

How to get post links from the whole page using BeautifulSoup Selenium

Scraping infomation from a div tag inside of a div tag

How to extract name and links from a given website - python

How get the text with BeautifulSoup in this html code: <span id="pass_0" class="text-success">c#</span>

Can't print tag 'content' anymore

Categories

Resources