Python Selenium for getting the whole content of a table in ReactJS - python-3.x

I tried to scrape the content of a table from wyscout.com, which seems to be built with ReactJS.
After logging in, the script selects the country (e.g. England), league (e.g. Premier League), and team (e.g. Arsenal), then opens the Stats tab.
That shows the table whose data I want to scrape. Even though there is a button to export an Excel file, I want to scrape the content manually using Selenium or BeautifulSoup.
However, the script gets only 18 rows even though the table has more than 100.
Please let me know the solution.
Thanks.
Here is my code.
from re import search
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import string
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
# Load Chrome Browser
show_browser = True
options = Options()
# options.add_argument('--headless')
scraped_data = []

def bot_driver(url, user_name, user_password):
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.get(url)
    driver.maximize_window()
    time.sleep(2)
    # Log in
    # login = driver.find_element_by_xpath("//ul[@id='avia-menu']/li[5]/a")
    # login.click()
    time.sleep(10)
    idd = driver.find_element_by_xpath("//input[@id='login_username']")
    idd.send_keys(user_name)
    passW = driver.find_element_by_xpath("//input[@id='login_password']")
    passW.send_keys(user_password)
    time.sleep(2)
    submit = driver.find_element_by_xpath("//button[@id='login_button']")
    submit.click()
    time.sleep(10)
    try:
        force_login = driver.find_element_by_xpath("//button[@class='btn2_zFM sc-jDwBTQ cUKaFo -block3u2Qh -primary1dLZk']")
        force_login.click()
        print('force login')
    except:
        print('force login error')
    return driver
def select_country(driver, country_name):
    # Specific Country
    # country = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@commandsource='list#area_list#30']")))
    # country = driver.find_element_by_xpath("//div[@commandsource='list#area_list#30']")
    # All the countries
    list_country = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@id='detail_0_home_navy']/div[1]/div/div")))
    time.sleep(3)
    for entry in list_country:
        if country_name == entry.text:
            print('country click here')
            entry.click()
            return driver, 1
    return driver, 0
def select_league(driver, league_name):
    # Specific League
    # league = driver.find_element_by_xpath("//div[@commandsource='list#competition_list#0']")
    # All the leagues
    list_league = driver.find_elements_by_xpath("//div[@id='detail_0_area_navy_0']/div[1]/div/div")
    for entry in list_league:
        if league_name == entry.text:
            entry.click()
            return driver, 1
    return driver, 0
def select_team(driver, team_names):
    # Specific Team
    # team = driver.find_element_by_xpath("//div[@commandsource='list#team_list#0']")
    flag_team = 0
    list_team = driver.find_elements_by_xpath("//div[@id='detail_0_competition_navy_0']/div[1]/div/div")
    for entry in list_team:
        if entry.text in team_names:
            flag_team = 1
            print('selected team = ', entry.text)
            entry.click()
            time.sleep(2)
            # Stats
            stats = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Stats')))
            stats.click()
            time.sleep(3)
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@id='detail_0_team_stats']/div/div/div/main/div[3]/div[2]/div/table")))
            content_stats = driver.page_source
            soup_stats = BeautifulSoup(content_stats, "html.parser")
            table_stats = soup_stats.find('table', attrs={'class': 'teamstats__Index-module__table___1K93L teamstats__Index-module__with-opp___16Rp5'})
            # print(table_stats)
            tbody_stats = table_stats.find('tbody')
            tr_stats = tbody_stats.find_all('tr')
            print('number of tr = ', len(tr_stats))
            # Return to team selection
            back_team = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@id='detail_0_team_back']")))
            back_team.click()
            time.sleep(5)
    if flag_team == 1:
        return driver, 1
    else:
        return driver, 0
if __name__ == "__main__":
    # User input
    # Login - wyscout_url = 'https://wyscout.com/'
    wyscout_url = 'https://platform.wyscout.com/app/?/'
    wyscout_user_name = ''  # username
    wyscout_user_password = ''  # password
    wyscout_driver = bot_driver(wyscout_url, wyscout_user_name, wyscout_user_password)
    time.sleep(10)
    # Select a Country
    country = 'England'  # .upper()
    wyscout_driver, succeed = select_country(wyscout_driver, country)
    if succeed == 0:
        print('NO country!')
    time.sleep(7)
    # Select a league
    league = 'Premier League'  # .upper()
    wyscout_driver, succeed = select_league(wyscout_driver, league)
    if succeed == 0:
        print('NO League!')
    time.sleep(7)
    # Select team
    team_options = ['Arsenal']
    wyscout_driver, succeed = select_team(wyscout_driver, team_options)
    time.sleep(7)
    if succeed == 0:
        print('NO Team!')
    time.sleep(7)
    print('!!!Wyscout END!!!')
    # wyscout_driver.quit()

Finally I figured it out by myself. The table only renders the rows that are currently visible, and scrolling the table container makes the remaining rows load, so the fix is to scroll the table element until its height stops growing.
Here is my solution.
# Scroll down
print('scroll down')
# note: table_stats here must be the Selenium WebElement for the table, not the BeautifulSoup tag
last_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
time.sleep(3)
while True:
    driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight)", table_stats)
    time.sleep(5)
    new_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
    if new_height == last_height:
        break
    last_height = new_height
print('scroll end')
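For reference, here is a minimal sketch (not from the original post) of how the scroll loop could be combined with row collection, in case the virtualized table removes rows that have scrolled out of view. It assumes driver, time, and BeautifulSoup from the code above; the table XPath is derived from the class name in the question and the de-duplication key is simply the tuple of cell texts:

# Hedged sketch: collect rows while scrolling a virtualized table.
table_el = driver.find_element_by_xpath("//table[contains(@class, 'teamstats__Index-module__table')]")
collected = {}
last_height = driver.execute_script("return arguments[0].scrollHeight;", table_el)
while True:
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.find('table', attrs={'class': 'teamstats__Index-module__table___1K93L teamstats__Index-module__with-opp___16Rp5'})
    if table and table.find('tbody'):
        for tr in table.find('tbody').find_all('tr'):
            cells = tuple(td.get_text(strip=True) for td in tr.find_all('td'))
            collected[cells] = cells  # keyed on cell texts to avoid duplicate rows
    driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight)", table_el)
    time.sleep(2)
    new_height = driver.execute_script("return arguments[0].scrollHeight;", table_el)
    if new_height == last_height:
        break
    last_height = new_height
print('rows collected = ', len(collected))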

Related

Selenium fails to scroll down

I am using Selenium to scrape data from here. The website uses an animation to show the sections after you scroll down. I am trying to scroll down to the footer and wait for the animation so I can get the data from the page.
I am not sure that's the only approach that gets me the data, because I can see that the animation only adds the class aos-animate to the main class, and if that class is not on the HTML element, I won't get the text!
In the get_service_data function, I am trying to scroll down to the end of the page. I tried scrolling down before I start the loop.
I tried:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
html = driver.find_element(By.CLASS_NAME, 'html')
html.send_keys(Keys.END)
html.send_keys(Keys.PAGE_DOWN)
copyright = driver.find_element(By.CLASS_NAME, 'copyright')
driver.execute_script("arguments[0].scrollIntoView();", copyright)
Here is my full script:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys

language = "en"  # to take this from the user
main_link = f"https://www.atlp.ae/{language}"
driver_path = os.path.join(os.getcwd(), "chromedriver")
# options = webdriver.ChromeOptions()
# options.headless = True
driver = webdriver.Chrome(driver_path)  # options=options
driver.maximize_window()

def get_services_links():
    links = []
    driver.get(main_link)
    services_header_xpath = '//*[@id="fixed-header"]/div/div[2]/div/nav/ul/li[5]/button'
    driver.find_element(By.XPATH, services_header_xpath).click()
    services_menu_xpath = '//*[@id="serviceInfotitle"]/nav/ul'
    services_menu = driver.find_element(By.XPATH, services_menu_xpath)
    options = services_menu.find_elements(By.TAG_NAME, "li")
    for option in options:
        a_tag = option.find_element(By.TAG_NAME, "a")
        links.append(a_tag.get_attribute("href"))
    return links[:-1] if len(links) > 0 else []

def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)
    # row serviceSubsetRow ng-star-inserted
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    for service in service_sections:
        textual_div = service.find_element(By.CLASS_NAME, 'textCol')
        something = textual_div.find_element(By.CLASS_NAME, 'serviceSubsetTitle')
        print("Text: ", something.text)

if __name__ == '__main__':
    # try:
    links = get_services_links()
    for link in links:
        get_service_data(link)
        break
    driver.quit()
What you need is this:
something.get_attribute('innerText'), because, perhaps due to the added animation, the regular .text is not working.
Also, I have removed a few lines as I thought they were not needed (at least for this exercise), and I have directly added a loop to make it work with serviceSubsetTitle:
def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)
    # ---- removed these lines --------
    # row serviceSubsetRow ng-star-inserted
    # wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    # services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    #
    # container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    # service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    # ----- End of lines removal ----------
    # Clicking out the cookie acceptance button
    try:
        driver.find_element(By.XPATH, "//*[@class='cc-btn cc-allow']").click()
    except:
        print("nothing there")
    # --- removed these lines
    # for service in service_sections:
    #     textual_div = service.find_element(By.CLASS_NAME, 'textCol')
    #     time.sleep(3)
    # --- end of lines removal ---------
    # These are my lines here from below:
    somethings = driver.find_elements(By.XPATH, "//*[contains(@class, 'serviceSubsetTitle')]")
    print(len(somethings))
    for something in somethings:
        # time.sleep(2)
        title_txt = something.get_attribute('innerText')
        print(title_txt)
Here is the output:
Service Name: Sea Services
5
Vessel Management and Marine Services
Passenger Handling and Cargo Operations
Issuance of Certificates and Approvals in Ports
Ports Licensing
Property Leasing Services - Ports
Process finished with exit code 0
This is one way of scrolling that page down:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.atlp.ae/en'
browser.get(url)
browser.execute_script('window.scrollBy(0, 100);')
cookie_b = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='deny cookies']")))
cookie_b.click()
body = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "main-scrollbar")))
body.click()
body.send_keys(Keys.END)
print('scrolled down')
The setup is chrome/chromedriver on Linux; however, it can be adapted to your system. Just observe the imports and the code after the browser/driver is defined. Selenium docs: https://www.selenium.dev/documentation/

How to have better structure in my webscraping output

Here's my script:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

PATH = "chromedriver.exe"
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(executable_path=PATH, options=options)
url = 'https://www.google.com/maps/place/Lazeo+Paris+11e/@48.8532051,2.3859464,17z/data=!3m1!4b1!4m5!3m4!1s0x47e6738875606957:0xdfb3822564e33888!8m2!3d48.8532051!4d2.3881404'
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span/span[2]/span[1]/button').click()
# to make sure content is fully loaded we can use time.sleep() after navigating to each page
time.sleep(3)
SCROLL_PAUSE_TIME = 5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
number = 0
while True:
    number = number + 1
    # Scroll down to bottom
    ele = driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
    driver.execute_script('arguments[0].scrollBy(0, 5000);', ele)
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    print(f'last height: {last_height}')
    ele = driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
    new_height = driver.execute_script("return arguments[0].scrollHeight", ele)
    print(f'new height: {new_height}')
    if number == 2:
        break
    #if new_height == last_height:
    #    break
    print('cont')
    last_height = new_height
item = driver.find_elements_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[10]')
time.sleep(3)
name_list = []
stars_list = []
review_list = []
duration_list = []
for i in item:
    button = i.find_elements_by_tag_name('button')
    for m in button:
        if m.text == "More":
            m.click()
            time.sleep(5)
    name = i.find_elements_by_class_name("d4r55")
    stars = i.find_elements_by_class_name("kvMYJc")
    review = i.find_elements_by_xpath("//span[@class='wiI7pd']")
    duration = i.find_elements_by_class_name("rsqaWe")
    for j, k, l, p in zip(name, stars, review, duration):
        name_list.append(j.text)
        stars_list.append(k.text)
        review_list.append(l.text.strip())
        duration_list.append(p.text)
review = pd.DataFrame(
    {'name': name_list,
     'rating': stars_list,
     'review': review_list,
     'duration': duration_list})
review.to_csv('google_review.csv', index=False, encoding='utf-8-sig')
print(review)
driver.quit()
But sometimes the format of the review is not convenient, like this (sorry, it's in French):
That's this one on Google Maps:
How can I avoid that? Because it counts as several rows and I cannot work with a structure like that.
I hope the fact that it's in French doesn't make the structure of the sentences hard to understand.
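One way to keep the fields aligned is to iterate per review container and read each field inside that container, instead of zipping four independently collected lists; a review with a missing field then still produces exactly one row. A minimal sketch, assuming the driver from the script above and that each review card can be located by the class 'jftiEf' (that container class is a guess and may change); the field classes are the ones from the script:

import pandas as pd

def text_or_empty(parent, class_name):
    # text of the first matching child element, or '' if the field is missing
    found = parent.find_elements_by_class_name(class_name)
    return found[0].text.strip() if found else ''

rows = []
for card in driver.find_elements_by_class_name('jftiEf'):  # hypothetical review-card class
    rows.append({
        'name': text_or_empty(card, 'd4r55'),
        'rating': text_or_empty(card, 'kvMYJc'),
        'review': text_or_empty(card, 'wiI7pd'),
        'duration': text_or_empty(card, 'rsqaWe'),
    })
review = pd.DataFrame(rows)
review.to_csv('google_review.csv', index=False, encoding='utf-8-sig')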

Get access to data in dynamic table using Selenium

The code below works and fills out the two forms that are required to get to the table on this page: https://forsikringsguiden.dk/#!/bilforsikring/resultatside
Once the forms are filled out, the table shows an overview of different insurance firms and how much you will pay yearly for car insurance, i.e. a comparison service. I need to scrape this overview once a week.
I am unsure how to do this. I know how to use BS4 for scraping, but I need the firm names as well, not only the information, and this is not available by inspecting the website in Chrome. If I dive into the network tab and look at XHR, I find this link:
https://forsikringsguiden.dk/signalr/poll?transport=longPolling&messageId=d-D7589F50-A%2C0%7C9%2C0%7C_%2C1%7C%3A%2C0&clientProtocol=1.4&connectionToken=fUYa3MT52oKf77Y6yU1sLnXiVzPw2CD4XgA8x50EfifJlz8XTPjBeP0klHUKt2uXmnisqO0KLk3fCb5bjOZ8k%2FeJl8zaXAgtRIALW9rzMF%2F8L7Pk3MOYwPRY4md1sDk5&connectionData=%5B%7B%22name%22%3A%22insuranceofferrequesthub%22%7D%5D&tid=9&_=1572505813840
This shows me all the data I need, but I cannot navigate to this page in Selenium.
How do I tackle this problem?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
# enable browser logging
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = { 'browser':'ALL' }
driver = webdriver.Chrome(desired_capabilities = d, options=chrome_options)
driver.fullscreen_window()
wait = WebDriverWait(driver,1)
driver.get("https://forsikringsguiden.dk/#!/bilforsikring/manuel")
#time.sleep(5)
#remove cookie bar
driver.find_element_by_id('cookieBarAccept').click()
maerke = driver.find_element_by_xpath('//*[@id="s2id_carSelectedMake"]/a').click()
driver.find_element_by_xpath('//*[@id="s2id_autogen1_search"]').send_keys("Hyundai")
driver.minimize_window()
driver.maximize_window()
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
model = driver.find_element_by_xpath('//*[@id="s2id_autogen2"]').click()
driver.find_element_by_xpath('//*[@id="s2id_autogen3_search"]').send_keys("i30")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
driver.execute_script("scrollBy(0,250)")
aargang = driver.find_element_by_xpath('//*[@id="s2id_autogen4"]/a').click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="s2id_autogen5_search"]').send_keys("2009")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
driver.execute_script("scrollBy(0,250)")
motor_str = driver.find_element_by_xpath('//*[@id="s2id_autogen6"]/a').click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="s2id_autogen7_search"]').send_keys("1,6")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
variant = driver.find_element_by_xpath('//*[@id="s2id_autogen8"]').click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="s2id_autogen9_search"]').send_keys("1,6 CRDi 116HK 5d")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
driver.execute_script("scrollBy(0,250)")
godkend_oplysninger = driver.find_element_by_xpath('//*[@id="content"]/div[4]/form/div[6]/div/button').click()
#Om dig siden
driver.get("https://forsikringsguiden.dk/#!/bilforsikring/omdig")
alder = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="content"]/div/div[2]/div[2]/form/div[1]/div[1]/div/input')))
alder.send_keys("50")
adresse = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="adresse-autocomplete"]')))
adresse.send_keys("Havevang 8, 3. th, 4300 Holbæk", Keys.ENTER)
aar = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="content"]/div/div[2]/div[2]/form/div[2]/div/div/input')))
aar.send_keys("10")
driver.execute_script("scrollBy(0,250)")
#Antal skader
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div/div[2]/div[2]/form/div[3]/div/div/div[2]').click()
wait
driver.find_element_by_xpath('/html/body/div[11]/ul/li[3]').click()
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
time.sleep(1)
#skade 1
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div/div[2]/div[2]/form/div[4]/div/div[1]/div/div[2]').click()
wait
driver.find_element_by_xpath('/html/body/div[12]/ul/li[5]').click()
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
time.sleep(1)
#skade 2
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div/div[2]/div[2]/form/div[4]/div/div[2]/div/div[2]').click()
wait
driver.find_element_by_xpath('/html/body/div[13]/ul/li[3]').click()
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
time.sleep(1)
find_daekning = driver.find_element_by_xpath('//*[@id="content"]/div/div[2]/div[2]/form/div[5]/div/button').click()
EDIT:
I added this to my code using Selenium:
##### Get Data #####
driver.get("https://forsikringsguiden.dk/#!/bilforsikring/resultatside")
wait = WebDriverWait(driver,10)
wait
res_element = driver.find_elements_by_xpath('/html/body/div[7]/div/div[2]/div[1]/div[2]/div[2]')
res = [x.text for x in res_element]
print(res, "\n")
But it doesn't get me the numbers, just some of the text.
Here is the result:
['Sortér efter: Forklaring\nGå til selskab\nDin dækning\nkr./år -\nMed
samlerabat kr./år\nSelvrisiko\nSe detaljer\nSammenlign\nGå til selskab\nDin dækning\nkr./år -\nMed samlerabat kr./år\nSelvrisiko
\nSe detaljer\nSammenlign\nGå til selskab\nDin dækning\nkr./år -
\nMed samlerabat kr./år\nSelvrisiko\nSe detaljer\nSammenlign\n
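A possible next step (a sketch, not a verified selector) is to wait for the offer rows to finish rendering and read each offer element separately, pulling the firm name from the logo's alt attribute since it is usually not part of the visible text. The 'resultrow' class below is an assumption; inspect the result list to get the real container selector:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver.get("https://forsikringsguiden.dk/#!/bilforsikring/resultatside")
offer_xpath = "//div[contains(@class, 'resultrow')]"  # hypothetical: one element per insurance offer
WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, offer_xpath)))
for offer in driver.find_elements_by_xpath(offer_xpath):
    imgs = offer.find_elements_by_tag_name('img')
    firm = imgs[0].get_attribute('alt') if imgs else ''  # firm name is often only in the logo's alt text
    print(firm, '|', offer.text.replace('\n', ' | '))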

Selenium Web Scraping Id missing

I am trying to gather the data from the http://maharain.gov.in/ site. I have written the script below.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
import csv,sys,os
from bs4 import BeautifulSoup
import time
def check_exists_by_xpath(xpath, driver):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True
chromepath= '/home/swapnil/Downloads/chromedriver'
driver = webdriver.Chrome(chromepath)
start_time =time.time()
base_url = "http://maharain.gov.in/"
driver.get(base_url)
driver.switch_to.frame('MenuFrame')
driver.find_element_by_name("QueriesCirclewise3").click()
time.sleep(3)
print("Done")
driver.find_element(By.XPATH, '//*[@id="menu"]/input[10]').click()
time.sleep(3)
print("Done")
# driver.find_element_by_name("PastQueriesCirclewise6").click()
time.sleep(3)
print("Done")
# driver.implicitly_wait(3)
driver.switch_to.default_content()
time.sleep(3)
print("Done")
driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))
# dropdown_menu_year = Select(driver.find_element_by_id("selyear"))
# select_year = [option.text for option in dropdown_menu_year.options]
# select_year = [t for t in select_year if t !='Select']
# select_year = [ '2015', '2016', '2017']
time.sleep(3)
print("Done All ")
dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
select_state = [option.text for option in dropdown_menu_state.options]
select_state = [t for t in select_state if t !='Select']
dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
select_dist = [option.text for option in dropdown_menu_dist.options]
select_dist = [t for t in select_dist if t !='Select']
dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
select_mon = [option.text for option in dropdown_menu_month.options]
select_mon = [t for t in select_mon if t !='Select']
i = 0
year = str(2018)
# for year in select_year:
if not os.path.exists(year):
    os.makedirs(year)
for state in select_state:
    for dist in select_dist:
        if not os.path.exists(year+'/'+dist):
            os.makedirs(year+'/'+dist)
        for month in select_mon:
            print(i)
            # dropdown_menu_year = Select(driver.find_element_by_id("selyear"))
            # dropdown_menu_year.select_by_visible_text(year)
            dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
            dropdown_menu_state.select_by_visible_text(state)
            time.sleep(1)
            dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
            dropdown_menu_dist.select_by_visible_text(dist)
            if (dist == 'Wardha' or dist == 'Washim' or dist == 'Yavatmal'):
                # time.sleep(2)
                dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
                dropdown_menu_month.select_by_visible_text(month)
                time.sleep(2)
                driver.find_element_by_name("btnshow").click()
                time.sleep(2)
                print("Done")
                driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, 'body > embed'))
                if (check_exists_by_xpath('//*[@id="tableID"]', driver)):
                    tab = driver.find_element(By.XPATH, '//*[@id="tableID"]')
                    soup = BeautifulSoup(driver.page_source)
                    table = soup.select_one('table')
                    data = [[td.text for td in row.find_all("td")] for row in table.find_all("tr")]
                    file_name = year+'/'+dist+'/'+year+'_'+dist+'_'+month+'.csv'
                    print(file_name)
                    f = open(file_name, 'w', newline='')
                    writer = csv.writer(f)
                    writer.writerows(data)
                    f.close()
                i += 1
                driver.switch_to.default_content()
                driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))
print(time.time() - start_time)
print(i)
But each time I run the code it gets stuck at a different location, with errors like the selector id "selstate" missing or "body > embed" not present; it may then run correctly on the next run without any changes to the code, only to get stuck at a different location.
I have tried adding an implicit wait and thread sleeps with values of 5 seconds and less. Please point out the correct approach to make it run in one go, where the waits or sleeps should be added, and any other changes if required.
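A minimal sketch of what explicit waits could look like around the fragile points (the frame switches and the "selstate" dropdown); the element names and IDs come from the script above, and the 30-second timeout is an arbitrary choice:

from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 30)

# wait for the content frame to exist and switch into it
wait.until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "ContentFrame")))

# wait until the state dropdown is actually in the DOM before wrapping it in Select
dropdown_menu_state = Select(wait.until(EC.presence_of_element_located((By.ID, "selstate"))))

# after clicking "btnshow", wait for the embedded result document before switching into it
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "body > embed")))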

Selenium3 Python3 how to get the url from a style attribute "background-image: url(...)"

How to get the url itself out of the style attribute? For example: style="width: 433px; height: 510px; background-image: url(https://cs7056.vk.me/c635104/v635104607/1c316/ADzy-2WY8pw.jpg)". Selenium3, Python3; easy for you!
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import re
import time
url = 'https://vk.com/uporols_you'
driver = webdriver.Firefox(executable_path='C:/Users/PANDEMIC/AppData/Local/Mozilla/geckodriver.exe')
def login(driver):
    log_page = driver.get('https://login.vk.com/?act=login')
    find_login_input = driver.find_element_by_id('login_form').find_element_by_id('email').send_keys('+77782303865')
    find_password_input = driver.find_element_by_id('login_form').find_element_by_id('pass').send_keys('pass')
    find_button = driver.find_element_by_xpath('//button[@id="login_button"]').click()
    time.sleep(5)

def get_photo_from_page(driver):
    driver.get(url)
    try:
        driver.find_element_by_class_name('popup_box_container').find_element_by_class_name('box_title_wrap').find_element_by_class_name('box_x_button').click()
    except:
        print('nope nothing')
    for i in range(2):
        scrol_down = driver.find_element_by_id('public_wall').find_element_by_id('wall_more_link').click()
        time.sleep(2)
    tut = []
    #t = (a[@class="page_post_thumb_wrap image_cover page_post_thumb_last_column page_post_thumb_last_row"])
    for ii in driver.find_elements_by_xpath('//a[@style]'):
        o = ii.get_attribute('style')
        print(o)
    #soup = BeautifulSoup(htlm, 'lxml')
    #im = soup.find_all('a', class_="'page_post_thumb_wrap image_cover page_post_thumb_last_column page_post_thumb_last_row'")
    #print(htlm)
    #for a in im:
    #    s = a.get('data-src_big').split('|')[0]
    #    tut.append(s)
    #print(tut)
    #for num, link in enumerate(tut, start=1):
    #    p = requests.get(link)
    #    out = open("img%s.jpg" % (num), 'wb')
    #    out.write(p.content)
    #    out.close()

def main():
    login(driver)
    get_photo_from_page(driver)

if __name__ == '__main__':
    main()
In that particular case, you could just parse the style string that you were already able to gather with your script.
Just add this function to your code:
def parse_style_attribute(style_string):
    if 'background-image' in style_string:
        style_string = style_string.split(' url("')[1].replace('");', '')
        return style_string
    return None
This is simple string parsing that extracts the url if "background-image" is in the string, or returns None if there is no image.
You can then use it in your code:
links = list()
for ii in driver.find_elements_by_xpath('//a[@style]'):
    o = ii.get_attribute('style')
    links.append(parse_style_attribute(o))
links = [link for link in links if link is not None]
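For completeness, a small usage sketch that downloads the parsed links with requests, mirroring the commented-out download loop in the question:

import requests

for num, link in enumerate(links, start=1):
    p = requests.get(link)
    with open("img%s.jpg" % num, 'wb') as out:
        out.write(p.content)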
