How to get post links from the whole page using BeautifulSoup and Selenium - python-3.x

I'm having trouble web scraping with BeautifulSoup and Selenium. I want to pull data from pages 1-20, but only the first 10 pages come back successfully. The actual last page may even be beyond 20, yet my code stops after 10 pages. Does anyone know how to pull all the data without hitting this page limit?
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=options)

apartment_urls = []
try:
    for page in range(1, 21):  # pages 1-20; range(1, 20) would stop at page 19
        print(f"Extraction Page# {page}")
        url = "https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan&hlmn=" + str(page)
        driver.get(url)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
        for link in apart_info_list:
            get_url = '{0}{1}'.format('https://www.99.co', link['href'])
            print(get_url)
            apartment_urls.append(get_url)
except:
    print("Good Bye!")
This is the output of the code. From page 10 onward (10, 11, 12, and so on) I can't get the data.
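One observation before the answers: the bare except: in the code above swallows whatever error actually occurs around page 10, so the loop simply ends with "Good Bye!" and no explanation. A minimal diagnostic sketch to surface the real error (assuming the same driver setup as above; the scraping body is elided):

import traceback

try:
    for page in range(1, 21):
        driver.get("https://www.99.co/id/sewa/apartemen/jakarta?tipe_sewa=bulanan&hlmn=" + str(page))
        # ... same scraping as above ...
except Exception:
    traceback.print_exc()  # prints the real error instead of "Good Bye!"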

With the approach below, pagination works fine without any page limit.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)

    next_button = driver.find_elements(By.CSS_SELECTOR, 'li.next > a')  # [] on the last page
    if next_button:
        next_button[0].click()
        time.sleep(3)
    else:
        break
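A note on the pagination check: find_element raises NoSuchElementException when nothing matches, so an if/else wrapped around it can never reach the else branch. find_elements returns an empty list on the last results page instead, which is what lets the loop break cleanly.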
(The snippet above uses webdriver-manager, if you prefer not to manage chromedriver manually.)
Alternative solution: since the next-page URL isn't dynamic, building it from the soup also works fine.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)

    next_page = soup.select_one('li.next > a')
    if next_page:
        # build the absolute next-page url from the link's href and load it
        driver.get(f"https://www.99.co{next_page['href']}")
        time.sleep(3)
    else:
        break

Related

Scraping information from a div tag inside of a div tag

I have been trying to web scrape from this website: https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players
I want to get specific ratings, but I get nothing when I execute this code:
from bs4 import BeautifulSoup
import requests
result = requests.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
src = result.content
soup = BeautifulSoup(src, 'lxml')
match = soup.find('div', class_='css-gm45eu')
print(match)
output: None
How can I scrape what is in that class?
Try to use Selenium:
from selenium import webdriver
driver = webdriver.Chrome('PATH_TO --> chromedriver.exe')
driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
ratings = driver.find_elements_by_xpath('//div[@class="css-gm45eu"]')
ratings_list = []
for p in range(len(ratings)):
    ratings_list.append(ratings[p].text)
print(ratings_list)
Output:
['1.189', '1.109', '1.098', '1.031', '1.028', '1.005', '0.990', '0.981', '0.967', '0.936', '0.904', '0.846', '0.841', '0.840', '0.836', '0.809', '0.759', '0.726']
Download chromedriver.exe:
https://chromedriver.chromium.org/downloads
If you don't want a Chrome window to open while running, use this code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome('PATH_TO --> chromedriver.exe', options=options)
driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
ratings = driver.find_elements_by_xpath('//div[@class="css-gm45eu"]')
ratings_list = []
for p in range(len(ratings)):
    ratings_list.append(ratings[p].text)
print(ratings_list)
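Side note: find_elements_by_xpath was deprecated and later removed in Selenium 4. If you're on a current Selenium, the equivalent call uses a By locator; a minimal sketch with the same XPath (recent Selenium, 4.6+, can resolve chromedriver by itself via Selenium Manager):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # Selenium Manager locates the driver automatically
driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
ratings = driver.find_elements(By.XPATH, '//div[@class="css-gm45eu"]')
print([r.text for r in ratings])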

How to extract names and links from a given website - python

For the website mentioned below, I am trying to extract each name and its corresponding link, but I am not able to get the data at all.
Using BeautifulSoup
from bs4 import BeautifulSoup
import requests
source = requests.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
soup = BeautifulSoup(source.text, 'html.parser')
mains = soup.find_all("div", {"class": "list-container-wrapper"})
name = []
lnks = []
for main in mains:
    name.append(main.find("a").text)
    lnks.append(main.find("a").get('href'))
Using Selenium webdriver
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"chromedriver_win32\chromedriver.exe")
driver.get("https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0")
lnks = []
name = []
for a in driver.find_elements_by_class_name('ng-star-inserted'):
    link = a.get_attribute('href')
    lnks.append(link)
    nm = driver.find_element_by_css_selector("#list-item-0 > div > h2 > a").text
    name.append(nm)
I have tried both of the above methods.
Example:
name = ['Friday Night Flicks Drive-In at the Roadium', 'Open: Butterfly Pavilion and Nature Gardens']
lnks = ['https://mommypoppins.com/los-angeles-kids/event/in-person/friday-night-flicks-drive-in-at-the-roadium','https://mommypoppins.com/los-angeles-kids/event/in-person/open-butterfly-pavilion-and-nature-gardens']
Here's a solution with webdriver:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)
elements = driver.find_elements(By.XPATH, "//a[@angularticsaction='expanded-detail']")
attributes = [{el.text: el.get_attribute('href')} for el in elements]
print(attributes)
print(len(attributes))
driver.quit()
Here's a solution with webdriver and bs4:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)
soup = BeautifulSoup(driver.page_source, 'html.parser')
mains = soup.find_all("a", {"angularticsaction": "expanded-detail"})
attributes = [{el.text: el.get('href')} for el in mains]
print(attributes)
print(len(attributes))
driver.quit()
Here's a solution with requests:
import requests
url = "https://mommypoppins.com"
response = requests.get(f"{url}/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all").json()
attributes = [{r.get('node_title'): f"{url}{r['node'][r['nid']]['node_url']}"} for r in response['results']]
print(attributes)
print(len(attributes))
cheers!
The website is loaded dynamically, so plain requests can't see the rendered content. However, the data is available in JSON format via a GET request to:
https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all
There's no need for BeautifulSoup or Selenium here; requests alone will do, which also makes your code much faster.
import requests
URL = "https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all"
BASE_URL = "https://mommypoppins.com"
response = requests.get(URL).json()
names = []
links = []
for json_data in response["results"]:
    data = json_data["node"][json_data["nid"]]
    names.append(data["title"])
    links.append(BASE_URL + data["node_url"])
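For example, to print each name with its link once the lists are filled (a small usage sketch):

for name, link in zip(names, links):
    print(f"{name}: {link}")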

Python3 beautifulsoup4 and selenium

I wrote this code to scrape score details from livescore.com, but I have some problems; maybe my code is incorrect. Please help me.
Code run output:
Traceback (most recent call last):
  File "web.py", line 15, in <module>
    box2 = box.find_all('a',{'class' : 'match-row scorelink'})
AttributeError: 'NoneType' object has no attribute 'find_all'
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'class':'container'})
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
for data in box2:
    test = data.find('div', {'class': 'sco'}).text.replace('\n', '')
    print(test)
Try This:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
box = soup.find('div',{'class':'container'})
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
for data in box2:
    test = data.find('div', {'class': 'sco'}).text.replace('\n', '')
    print(test)
Use the following CSS selector. Note that container is not a class attribute value; it's the value of the data-type='container' attribute.
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
soup = BeautifulSoup(res, 'lxml')
for item in soup.select("div[data-type='container'] .match-row.scorelink>.sco"):
    test = item.text.replace('\n', '')
    print(test)
Give this a go. I have skipped 'box2' as it's not really needed for getting the scores. Also, judging by the data I fetched, .replace('\n', '') is not needed either, but feel free to use it if you think you will get scores containing the "\n" character.
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'data-type':'container'})
scores=box.find_all('div',{'class': 'sco'})
for score in scores:
    print(score.text)
Thanks for the answers. Problem solved:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'data-type':'container'})
box2 = box.find_all('a',{'class' : 'match-row'})
for data in box2:
    test1 = data.find('div', {'class': 'sco'}).text.replace('\n', '')
    test2 = data.find('div', {'class': 'ply tright name'}).text.replace('\n', '')
    test3 = data.find('div', {'class': 'ply name'}).text.replace('\n', '')
    print(test2, test1, test3)

How to scrape price from booking.com using beautifulsoup?

I am trying to scrape prices from booking.com but have not been successful. Any suggestions?
My code is as follows:
#Importing necessary library
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager
price = []
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-1506909%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3Bss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufis%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&city=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_month=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1')
time.sleep(5)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
for item in soup.findAll('label', {'class': "tpi_price_label tpi_price_label__orange"}):
    price.append(item.get_text(strip=True))
print(price)
The above code is not showing any output. It gives an empty list.
You need to properly wait for the page to load.
This is done with WebDriverWait, which throws an exception if the page isn't loaded within the specified timeout.
Try running my sample code below:
# test_scrape.py
import atexit
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

URL = ("https://www.booking.com/searchresults.en-gb.html?"
       "label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ"
       "&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb"
       "&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-"
       "1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff"
       "3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-150690"
       "9%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3"
       "Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcar"
       "d%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bs"
       "lp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3B"
       "ss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufi"
       "s%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&ci"
       "ty=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_m"
       "onth=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1")

class page_loaded:
    def __call__(self, driver):
        document_ready = driver.execute_script("return document.readyState;") == "complete"
        jquery_ready = driver.execute_script("return jQuery.active == 0;")
        print(f"document ready: [({type(document_ready).__name__}){document_ready}]")
        print(f"jquery ready: [({type(jquery_ready).__name__}){jquery_ready}]")
        return document_ready and jquery_ready

def wait_for_page_to_load(driver, timeout_seconds=20):
    WebDriverWait(driver, timeout_seconds, 0.2).until(page_loaded(), f"Page could not load in {timeout_seconds} s.!")

def go_to_url(driver, url):
    driver.get(url)
    wait_for_page_to_load(driver)

def get_orange_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("label.tpi_price_label.tpi_price_label__orange")]

def get_normal_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("div[class*=bui-price-display__value]")]

def start_driver():
    driver = webdriver.Chrome()
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver

def main():
    driver = start_driver()
    go_to_url(driver, URL)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    orange_prices = get_orange_prices(soup)
    print(orange_prices)
    normal_prices = get_normal_prices(soup)
    print(normal_prices)

if __name__ == '__main__':
    main()
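One caveat with the readiness check above: the jQuery.active probe assumes the page actually loads jQuery. On a page without it, execute_script would raise a JavascriptException, so you may want to guard it, e.g. with return typeof jQuery !== 'undefined' && jQuery.active == 0;.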
If you're having issues with chromedriver not being discovered, try specifying the exact path to it, like this:
def start_driver():
    driver = webdriver.Chrome(executable_path="/path/to/chromedriver")
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver

Selenium is not working properly

from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time
import logging
driver=webdriver.Chrome()
driver.get('https://www.linkedin.com/')
time.sleep(5)
driver.get('https://www.linkedin.com/in/ankurkhandelwal1/')
time.sleep(20)
soup= BeautifulSoup(driver.page_source, 'lxml')
elem=driver.find_element_by_css_selector(".contact-see-more-less.link-without-visited-state")
elem.click()
for div in soup.select('.pv-contact-info__ci-container'):
    for link in soup.find_all('a', {'class': 'pv-contact-info__contact-link Sans-15px-black-55%'}):
        old = link.get('href')
        mobile = old.replace("tel:", " ")
        print(mobile)
elem.click() works, but after this line the program doesn't go any further and mobile prints blank. When I remove this line and click manually, it works.
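A likely cause: soup is built from driver.page_source before the click, so the contact details revealed by the click never make it into the parsed HTML. A minimal sketch of a fix, re-parsing after the click (the explicit wait condition is an assumption about the page's markup):

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

elem = driver.find_element_by_css_selector(".contact-see-more-less.link-without-visited-state")
elem.click()

# wait until the expanded contact section is actually in the DOM
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".pv-contact-info__ci-container"))
)
soup = BeautifulSoup(driver.page_source, 'lxml')  # parse *after* the click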
