Selenium Web Scraping Id missing - python-3.x

I am trying to gather the data from the http://maharain.gov.in/ site. I have written the script below.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
import csv,sys,os
from bs4 import BeautifulSoup
import time
def check_exists_by_xpath(xpath, driver):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True

chromepath = '/home/swapnil/Downloads/chromedriver'
driver = webdriver.Chrome(chromepath)
start_time = time.time()

base_url = "http://maharain.gov.in/"
driver.get(base_url)
driver.switch_to.frame('MenuFrame')
driver.find_element_by_name("QueriesCirclewise3").click()
time.sleep(3)
print("Done")
driver.find_element(By.XPATH, '//*[@id="menu"]/input[10]').click()
time.sleep(3)
print("Done")
# driver.find_element_by_name("PastQueriesCirclewise6").click()
time.sleep(3)
print("Done")
# driver.implicitly_wait(3)
driver.switch_to.default_content()
time.sleep(3)
print("Done")
driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))
# dropdown_menu_year = Select(driver.find_element_by_id("selyear"))
# select_year = [option.text for option in dropdown_menu_year.options]
# select_year = [t for t in select_year if t != 'Select']
# select_year = ['2015', '2016', '2017']
time.sleep(3)
print("Done All")

dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
select_state = [option.text for option in dropdown_menu_state.options]
select_state = [t for t in select_state if t != 'Select']

dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
select_dist = [option.text for option in dropdown_menu_dist.options]
select_dist = [t for t in select_dist if t != 'Select']

dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
select_mon = [option.text for option in dropdown_menu_month.options]
select_mon = [t for t in select_mon if t != 'Select']

i = 0
year = str(2018)
# for year in select_year:
if not os.path.exists(year):
    os.makedirs(year)

for state in select_state:
    for dist in select_dist:
        if not os.path.exists(year + '/' + dist):
            os.makedirs(year + '/' + dist)
        for month in select_mon:
            print(i)
            # dropdown_menu_year = Select(driver.find_element_by_id("selyear"))
            # dropdown_menu_year.select_by_visible_text(year)
            dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
            dropdown_menu_state.select_by_visible_text(state)
            time.sleep(1)
            dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
            dropdown_menu_dist.select_by_visible_text(dist)
            if dist == 'Wardha' or dist == 'Washim' or dist == 'Yavatmal':
                # time.sleep(2)
                dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
                dropdown_menu_month.select_by_visible_text(month)
                time.sleep(2)
            driver.find_element_by_name("btnshow").click()
            time.sleep(2)
            print("Done")
            driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, 'body > embed'))
            if check_exists_by_xpath('//*[@id="tableID"]', driver):
                tab = driver.find_element(By.XPATH, '//*[@id="tableID"]')
                soup = BeautifulSoup(driver.page_source)
                table = soup.select_one('table')
                data = [[td.text for td in row.find_all("td")] for row in table.find_all("tr")]
                file_name = year + '/' + dist + '/' + year + '_' + dist + '_' + month + '.csv'
                print(file_name)
                f = open(file_name, 'w', newline='')
                writer = csv.writer(f)
                writer.writerows(data)
                f.close()
            i += 1
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))

print(time.time() - start_time)
print(i)
But each time I run the code it gets stuck at a different place, with errors like the selector id "selstate" missing or "body > embed" not being present. The same code may run correctly on the next run without any changes and then get stuck somewhere else.
I have tried adding an implicit wait on the driver and thread sleeps with values of 5 seconds and less. Please point out the correct way to make it run in one go, where the waits or sleeps should be added, or any other changes that are required.
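For example, one thing I am considering (an untested sketch, not part of my current script) is replacing the fixed sleeps with explicit waits, so each step only proceeds once its element is actually present:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 20)

# wait for the state dropdown instead of sleeping a fixed time
wait.until(EC.presence_of_element_located((By.ID, "selstate")))
dropdown_menu_state = Select(driver.find_element_by_id("selstate"))

# wait for the embedded result document before switching into it
embed = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > embed")))
driver.switch_to.frame(embed)

I am not sure whether this is the right placement for the waits, which is essentially my question.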

Related

Selenium fails to scroll down

I am using Selenium to scrape data from here. The website uses an animation to reveal the sections after you scroll down. I am trying to scroll down to the footer and wait for the animation so I can get the data from the page.
I am not sure that this is the only approach that gets me the data, because I can see that the animation only adds the class aos-animate to the main class, and if that class is not on the HTML element, it won't return the text!
In the get_service_data function, I am trying to scroll down to the end of the page. I also tried to scroll down before I start the loop.
I tried:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
html = driver.find_element(By.CLASS_NAME, 'html')
html.send_keys(Keys.END)
html.send_keys(Keys.PAGE_DOWN)
copyright = driver.find_element(By.CLASS_NAME, 'copyright')
driver.execute_script("arguments[0].scrollIntoView();", copyright)
Here is my full script:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
language = "en" # to take this from the user
main_link = f"https://www.atlp.ae/{language}"
driver_path = os.path.join(os.getcwd(), "chromedriver")
# options = webdriver.ChromeOptions()
# options.headless = True
driver = webdriver.Chrome(driver_path) # options=options
driver.maximize_window()
def get_services_links():
    links = []
    driver.get(main_link)
    services_header_xpath = '//*[@id="fixed-header"]/div/div[2]/div/nav/ul/li[5]/button'
    driver.find_element(By.XPATH, services_header_xpath).click()
    services_menu_xpath = '//*[@id="serviceInfotitle"]/nav/ul'
    services_menu = driver.find_element(By.XPATH, services_menu_xpath)
    options = services_menu.find_elements(By.TAG_NAME, "li")
    for option in options:
        a_tag = option.find_element(By.TAG_NAME, "a")
        links.append(a_tag.get_attribute("href"))
    return links[:-1] if len(links) > 0 else []

def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)
    # row serviceSubsetRow ng-star-inserted
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    for service in service_sections:
        textual_div = service.find_element(By.CLASS_NAME, 'textCol')
        something = textual_div.find_element(By.CLASS_NAME, 'serviceSubsetTitle')
        print("Text: ", something.text)

if __name__ == '__main__':
    # try:
    links = get_services_links()
    for link in links:
        get_service_data(link)
        break
    driver.quit()
What you need is this:
something.get_attribute('innerText'), because, perhaps due to the added animation, the regular .text is not returning anything.
I have also removed a few lines that I thought were not needed (at least for this exercise), and added a loop directly over serviceSubsetTitle to make it work:
def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)
    # ---- removed these lines --------
    # row serviceSubsetRow ng-star-inserted
    # wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    # services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    #
    # container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    # service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    # ----- End of lines removal ----------
    # Clicking out the cookie acceptance button
    try:
        driver.find_element(By.XPATH, "//*[@class='cc-btn cc-allow']").click()
    except:
        print("nothing there")
    # --- removed these lines
    # for service in service_sections:
    #     textual_div = service.find_element(By.CLASS_NAME, 'textCol')
    #     time.sleep(3)
    # --- end of lines removal ---------
    # These are my lines here from below:
    somethings = driver.find_elements(By.XPATH, "//*[contains(@class, 'serviceSubsetTitle')]")
    print(len(somethings))
    for something in somethings:
        # time.sleep(2)
        title_txt = something.get_attribute('innerText')
        print(title_txt)
Here is the output:
Service Name: Sea Services
5
Vessel Management and Marine Services
Passenger Handling and Cargo Operations
Issuance of Certificates and Approvals in Ports
Ports Licensing
Property Leasing Services - Ports
Process finished with exit code 0
This is one way of scrolling that page down:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.atlp.ae/en'
browser.get(url)
browser.execute_script('window.scrollBy(0, 100);')
cookie_b = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='deny cookies']")))
cookie_b.click()
body = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "main-scrollbar")))
body.click()
body.send_keys(Keys.END)
print('scrolled down')
The setup is chrome/chromedriver on Linux; however, it can be adapted to your system. Just observe the imports and the code after defining the browser/driver. Selenium docs: https://www.selenium.dev/documentation/
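If you then need to confirm that the lazily revealed sections are there before reading them, a small follow-up (continuing directly from the snippet above, and assuming the footer carries the copyright class the question itself uses) could wait for that element explicitly:

# continuing from the snippet above: wait for the lazily revealed footer,
# using the "copyright" class mentioned in the question's own attempt
footer = WebDriverWait(browser, 20).until(
    EC.visibility_of_element_located((By.CLASS_NAME, "copyright"))
)
print(footer.text)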

How to have better structure in my webscraping output

Here's my script:
from selenium import webdriver
import pandas as pd
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

PATH = "chromedriver.exe"
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(executable_path=PATH, options=options)

url = 'https://www.google.com/maps/place/Lazeo+Paris+11e/@48.8532051,2.3859464,17z/data=!3m1!4b1!4m5!3m4!1s0x47e6738875606957:0xdfb3822564e33888!8m2!3d48.8532051!4d2.3881404'
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span/span[2]/span[1]/button').click()

# to make sure content is fully loaded we can use time.sleep() after navigating to each page
import time
time.sleep(3)

SCROLL_PAUSE_TIME = 5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

number = 0
while True:
    number = number + 1
    # Scroll down to bottom
    ele = driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
    driver.execute_script('arguments[0].scrollBy(0, 5000);', ele)
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    print(f'last height: {last_height}')
    ele = driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
    new_height = driver.execute_script("return arguments[0].scrollHeight", ele)
    print(f'new height: {new_height}')
    if number == 2:
        break
    # if new_height == last_height:
    #     break
    print('cont')
    last_height = new_height

item = driver.find_elements_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[10]')
time.sleep(3)

name_list = []
stars_list = []
review_list = []
duration_list = []

for i in item:
    button = i.find_elements_by_tag_name('button')
    for m in button:
        if m.text == "More":
            m.click()
            time.sleep(5)
    name = i.find_elements_by_class_name("d4r55")
    stars = i.find_elements_by_class_name("kvMYJc")
    review = i.find_elements_by_xpath("//span[@class='wiI7pd']")
    duration = i.find_elements_by_class_name("rsqaWe")
    # j = name, k = stars, l = review, p = duration
    for j, k, l, p in zip(name, stars, review, duration):
        name_list.append(j.text)
        stars_list.append(k.text)
        review_list.append(l.text.strip())
        duration_list.append(p.text)

review = pd.DataFrame(
    {'name': name_list,
     'rating': stars_list,
     'review': review_list,
     'duration': duration_list})

review.to_csv('google_review.csv', index=False, encoding='utf-8-sig')
print(review)
driver.quit()
But sometimes the format of the review is not convenient, like this (sorry, it's in French):
That's this one on Google Maps:
How can I avoid that? Because it counts as several rows, and I cannot work with a structure like that.
I hope the fact that it's in French isn't a problem for understanding the structure of the sentences.
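One possible direction (an untested sketch that would slot into the script above) is to iterate over one review container at a time and pull name, rating, text and duration from inside it, so a multi-line review can never shift the columns. The inner class names are the ones already used above; the container class 'jftiEf' is a placeholder guess and must be checked against the live page:

# sketch only: 'jftiEf' is a placeholder class for one review block and must be
# verified on the live page; the inner class names come from the script above
rows = []
for block in driver.find_elements_by_class_name('jftiEf'):
    name = block.find_element_by_class_name('d4r55').text
    rating = block.find_element_by_class_name('kvMYJc').text
    text = block.find_element_by_class_name('wiI7pd').text.replace('\n', ' ').strip()
    duration = block.find_element_by_class_name('rsqaWe').text
    rows.append({'name': name, 'rating': rating, 'review': text, 'duration': duration})

review = pd.DataFrame(rows)
print(review)

Reviews without a text body would raise NoSuchElementException here, so a try/except around the text lookup may be needed.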

How to get post links from the whole page using BeautifulSoup Selenium

I'm having trouble with web scraping using BeautifulSoup and Selenium. The problem is that I want to pull data from pages 1-20, but somehow the data is only pulled successfully up to page 10. The number of the last page could well be more than 20, but the code I wrote only pulls 10 pages. Does anyone understand the problem, so that I can pull all the data without a page limit?
options = webdriver.ChromeOptions()
options.add_argument('-headless')
options.add_argument('-no-sandbox')
options.add_argument('-disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=options)

apartment_urls = []
try:
    for page in range(1, 20):
        print(f"Extraction Page# {page}")
        page = "https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan&hlmn=" + str(page)
        driver.get(page)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
        for link in apart_info_list:
            get_url = '{0}{1}'.format('https://www.99.co', link['href'])
            print(get_url)
            apartment_urls.append(get_url)
except:
    print("Good Bye!")
This is the output of the code. From page 10, 11, 12 and so on, I can't get the data.
Now, pagination is working fine without page limit.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)
    # find_element raises NoSuchElementException when there is no next page,
    # so stop the loop there instead of crashing
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'li.next > a')
    except NoSuchElementException:
        break
    next_button.click()
    time.sleep(3)
The snippet above uses webdriver-manager, if you would prefer that.
Alternative solution: since the next-page url isn't dynamic, building it directly also works fine.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)
    # next_button = driver.find_element(By.CSS_SELECTOR, 'li.next > a')
    # if next_button:
    #     next_button.click()
    #     time.sleep(3)
    next_page = soup.select_one('li.next > a')
    if next_page:
        # select_one returns a Tag, so build the absolute url from its href and load it
        driver.get(f"https://www.99.co{next_page['href']}")
        time.sleep(3)
    else:
        break

Python Selenium for getting the whole content of table in Reactjs

I tried to scrape the content of a table from wyscout.com, which seems to be built with React.
After logging in, the script selects the country (e.g. England), the league (e.g. Premier League) and the team (e.g. Arsenal), and then opens the Stats tab.
The page then shows the table whose data I want to scrape. Even though there is a button to export an Excel file, I want to scrape the content manually using Selenium or BeautifulSoup.
However, the script only gets 18 rows even though the table has more than 100 rows.
Please let me know the solution.
Thanks.
Here is my code.
from re import search
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import string
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Load Chrome Browser
show_browser = True
options = Options()
# options.add_argument('--headless')
scraped_data = []
def bot_driver(url, user_name, user_password):
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.get(url)
    driver.maximize_window()
    time.sleep(2)
    # Log in
    # login = driver.find_element_by_xpath("//ul[@id='avia-menu']/li[5]/a")
    # login.click()
    time.sleep(10)
    idd = driver.find_element_by_xpath("//input[@id='login_username']")
    idd.send_keys(user_name)
    passW = driver.find_element_by_xpath("//input[@id='login_password']")
    passW.send_keys(user_password)
    time.sleep(2)
    submit = driver.find_element_by_xpath("//button[@id='login_button']")
    submit.click()
    time.sleep(10)
    try:
        force_login = driver.find_element_by_xpath("//button[@class='btn2_zFM sc-jDwBTQ cUKaFo -block3u2Qh -primary1dLZk']")
        force_login.click()
        print('force loging')
    except:
        print('force login error')
    return driver

def select_country(driver, country_name):
    # Specific Country
    # country = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@commandsource='list#area_list#30']")))
    # country = driver.find_element_by_xpath("//div[@commandsource='list#area_list#30']")
    # All the countries
    list_country = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@id='detail_0_home_navy']/div[1]/div/div")))
    time.sleep(3)
    for entry in list_country:
        if country_name == entry.text:
            print('country click here')
            entry.click()
            return driver, 1
    return driver, 0

def select_league(driver, league_name):
    # Specific League
    # league = driver.find_element_by_xpath("//div[@commandsource='list#competition_list#0']")
    # All the leagues
    list_league = driver.find_elements_by_xpath("//div[@id='detail_0_area_navy_0']/div[1]/div/div")
    for entry in list_league:
        if league_name == entry.text:
            entry.click()
            return driver, 1
    return driver, 0

def select_team(driver, team_names):
    # Specific Team
    # team = driver.find_element_by_xpath("//div[@commandsource='list#team_list#0']")
    flag_team = 0
    list_team = driver.find_elements_by_xpath("//div[@id='detail_0_competition_navy_0']/div[1]/div/div")
    for entry in list_team:
        if entry.text in team_names:
            flag_team = 1
            print('selected team = ', entry.text)
            entry.click()
            time.sleep(2)
            # Stats
            stats = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Stats')))
            stats.click()
            time.sleep(3)
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@id='detail_0_team_stats']/div/div/div/main/div[3]/div[2]/div/table")))
            content_stats = driver.page_source
            soup_stats = BeautifulSoup(content_stats, "html.parser")
            table_stats = soup_stats.find('table', attrs={'class': 'teamstats__Index-module__table___1K93L teamstats__Index-module__with-opp___16Rp5'})
            # print(table_stats)
            tbody_stats = table_stats.find('tbody')
            tr_stats = tbody_stats.find_all('tr')
            print('number of tr = ', len(tr_stats))
            # Return to team selection
            back_team = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@id='detail_0_team_back']")))
            back_team.click()
            time.sleep(5)
    if flag_team == 1:
        return driver, 1
    else:
        return driver, 0

if __name__ == "__main__":
    # User input
    # Login - wyscout_url = 'https://wyscout.com/'
    wyscout_url = 'https://platform.wyscout.com/app/?/'
    wyscout_user_name = ''  # username
    wyscout_user_password = ''  # password
    wyscout_driver = bot_driver(wyscout_url, wyscout_user_name, wyscout_user_password)
    time.sleep(10)
    # Select a Country
    country = 'England'  # .upper()
    wyscout_driver, succeed = select_country(wyscout_driver, country)
    if succeed == 0:
        print('NO country!')
    time.sleep(7)
    # Select a league
    league = 'Premier League'  # .upper()
    wyscout_driver, succeed = select_league(wyscout_driver, league)
    if succeed == 0:
        print('NO League!')
    time.sleep(7)
    # Select team
    team_options = ['Arsenal']
    wyscout_driver, succeed = select_team(wyscout_driver, team_options)
    time.sleep(7)
    if succeed == 0:
        print('NO Team!')
    time.sleep(7)
    print('!!!Wyscout END!!!')
    # wyscout_driver.quit()
Finally I figured it out by myself.
Here is my solution.
# Scroll down
# note: table_stats here needs to be the Selenium element for the table
# (e.g. driver.find_element(By.XPATH, "...")), not the BeautifulSoup tag
print('scroll down')
last_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
time.sleep(3)
while True:
    driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight)", table_stats)
    time.sleep(5)
    new_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
    if new_height == last_height:
        break
    last_height = new_height
print('scroll end')
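Once the loop finishes, re-parsing the page should pick up the rows that were rendered while scrolling. This is a small follow-up sketch that reuses the imports and the table class from the question's code and assumes the table keeps earlier rows in the DOM while scrolling:

# re-read the now fully rendered table (class names taken from the question's code)
soup_stats = BeautifulSoup(driver.page_source, "html.parser")
table_stats_tag = soup_stats.find('table', attrs={'class': 'teamstats__Index-module__table___1K93L teamstats__Index-module__with-opp___16Rp5'})
tr_stats = table_stats_tag.find('tbody').find_all('tr')
print('number of tr after scrolling = ', len(tr_stats))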

Selenium3 Python3: how to get the url from a style="background-image: url(...)" attribute

How do I get the url from the style attribute, I mean the url itself inside the style? For example: style="width: 433px; height: 510px; background-image: url(https://cs7056.vk.me/c635104/v635104607/1c316/ADzy-2WY8pw.jpg)". Selenium3, Python3, easy for you!
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import re
import time
url = 'https://vk.com/uporols_you'
driver = webdriver.Firefox(executable_path='C:/Users/PANDEMIC/AppData/Local/Mozilla/geckodriver.exe')
def login(driver):
    log_page = driver.get('https://login.vk.com/?act=login')
    find_login_input = driver.find_element_by_id('login_form').find_element_by_id('email').send_keys('+77782303865')
    find_password_input = driver.find_element_by_id('login_form').find_element_by_id('pass').send_keys('pass')
    find_button = driver.find_element_by_xpath('//button[@id="login_button"]').click()
    time.sleep(5)

def get_photo_from_page(driver):
    driver.get(url)
    try:
        driver.find_element_by_class_name('popup_box_container').find_element_by_class_name('box_title_wrap').find_element_by_class_name('box_x_button').click()
    except:
        print('nope nothing')
    for i in range(2):
        scrol_down = driver.find_element_by_id('public_wall').find_element_by_id('wall_more_link').click()
        time.sleep(2)
    tut = []
    # t = (a[@class="page_post_thumb_wrap image_cover page_post_thumb_last_column page_post_thumb_last_row"])
    for ii in driver.find_elements_by_xpath('//a[@style]'):
        o = ii.get_attribute('style')
        print(o)
    # soup = BeautifulSoup(htlm, 'lxml')
    # im = soup.find_all('a', class_="'page_post_thumb_wrap image_cover page_post_thumb_last_column page_post_thumb_last_row'")
    # print(htlm)
    # for a in im:
    #     s = a.get('data-src_big').split('|')[0]
    #     tut.append(s)
    # print(tut)
    # for num, link in enumerate(tut, start=1):
    #     p = requests.get(link)
    #     out = open("img%s.jpg" % (num), 'wb')
    #     out.write(p.content)
    #     out.close()

def main():
    login(driver)
    get_photo_from_page(driver)

if __name__ == '__main__':
    main()
In that particular case, you could just parse the style string that you were already able to gather with your script.
Just add this function to your code:
def parse_style_attribute(style_string):
    if 'background-image' in style_string:
        style_string = style_string.split(' url("')[1].replace('");', '')
        return style_string
    return None
This is simple string parsing that extracts the url if "background-image" is in the string, or returns None if there is no image.
You can then use it in your code:
links = list()
for ii in driver.find_elements_by_xpath('//a[@style]'):
    o = ii.get_attribute('style')
    links.append(parse_style_attribute(o))
links = [link for link in links if link is not None]
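If the style string comes back without quotes around the url (as in the example in the question), a regex-based variant may be more robust. This is only a sketch of the same idea, not a drop-in requirement:

import re

def parse_style_attribute(style_string):
    # matches url("..."), url('...') or url(...) and captures just the url
    match = re.search(r'background-image:\s*url\(["\']?(.*?)["\']?\)', style_string)
    return match.group(1) if match else None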
