How to have better structure in my webscraping output - python-3.x

Here's my script :
from selenium import webdriver
import pandas as pd
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
PATH = "chromedriver.exe"
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(executable_path=PATH,options=options)
url = 'https://www.google.com/maps/place/Lazeo+Paris+11e/#48.8532051,2.3859464,17z/data=!3m1!4b1!4m5!3m4!1s0x47e6738875606957:0xdfb3822564e33888!8m2!3d48.8532051!4d2.3881404'
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath('//*[#id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span/span[2]/span[1]/button').click()
#to make sure content is fully loaded we can use time.sleep() after navigating to each page
import time
time.sleep(3)
SCROLL_PAUSE_TIME = 5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
number = 0
while True:
number = number+1
# Scroll down to bottom
ele = driver.find_element_by_xpath('//*[#id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
driver.execute_script('arguments[0].scrollBy(0, 5000);', ele)
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
print(f'last height: {last_height}')
ele = driver.find_element_by_xpath('//*[#id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
new_height = driver.execute_script("return arguments[0].scrollHeight", ele)
print(f'new height: {new_height}')
if number == 2:
break
#if new_height == last_height:
# break
print('cont')
last_height = new_height
item = driver.find_elements_by_xpath('//*[#id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[10]')
time.sleep(3)
name_list = []
stars_list = []
review_list = []
duration_list = []
for i in item:
button = i.find_elements_by_tag_name('button')
for m in button:
if m.text == "More":
m.click()
time.sleep(5)
name = i.find_elements_by_class_name("d4r55")
stars = i.find_elements_by_class_name("kvMYJc")
review = i.find_elements_by_xpath("//span[#class='wiI7pd']")
duration = i.find_elements_by_class_name("rsqaWe")
for j,k,l,p in zip(name,stars, review, duration):
name_list.append(j.text)
stars_list.append(p.text)
review_list.append(k.text.strip())
duration_list.append(l.text)
review = pd.DataFrame(
{'name': name_list,
'rating': stars_list,
'review': review_list,
'duration': duration_list})
review.to_csv('google_review.csv',index=False, encoding='utf-8-sig')
print(review)
driver.quit()
But sometimes the format of the review is not convenient, like that (sorry it's in french) :
That's this one on google map :
How to avoid that ? Because it count as several rows and I cannot work with a structure like that.
I hope the fact that's in french isn't problematic to understand the structure of the sentences.

Related

Selenium fails to scroll down

I am using Selenium to scrape data from here. The website is using some animation to show the sections after your scroll down. I am trying to scroll down to the footer and wait for the animation to get the data from the page.
Although I am not sure if that's the only approach that get me the data, cause I can see that the animation is only adding class aos-animate to the main class, and if that class is not in the HTML element, it wont get the text!
In the get_service_data function, I am trying to scroll down to the end of the page. I tried to scroll down before I start the loop.
I tried:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
html = driver.find_element(By.CLASS_NAME, 'html')
html.send_keys(Keys.END)
html.send_keys(Keys. PAGE_DOWN)
copyright = driver.find_element(By.CLASS_NAME, 'copyright')
driver.execute_script("arguments[0].scrollIntoView();", copyright)
Here is my full script:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
language = "en" # to take this from the user
main_link = f"https://www.atlp.ae/{language}"
driver_path = os.path.join(os.getcwd(), "chromedriver")
# options = webdriver.ChromeOptions()
# options.headless = True
driver = webdriver.Chrome(driver_path) # options=options
driver.maximize_window()
def get_services_links():
links = []
driver.get(main_link)
services_header_xpath = '//*[#id="fixed-header"]/div/div[2]/div/nav/ul/li[5]/button'
driver.find_element(By.XPATH, services_header_xpath).click()
services_menu_xpath = '//*[#id="serviceInfotitle"]/nav/ul'
services_menu = driver.find_element(By.XPATH, services_menu_xpath)
options = services_menu.find_elements(By.TAG_NAME ,"li")
for option in options:
a_tag = option.find_element(By.TAG_NAME ,"a")
links.append(a_tag.get_attribute("href"))
return links[:-1] if len(links) > 0 else []
def get_service_data(link):
driver.get(link)
wait = WebDriverWait(driver, 10)
service_name_xpath = '//*[#id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
wait.until(EC.visibility_of_element_located((By.XPATH,service_name_xpath)))
service_name = driver.find_element(By.XPATH, service_name_xpath).text
print("Service Name: ", service_name)
# row serviceSubsetRow ng-star-inserted
wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
container = services_wrapper.find_element(By.CLASS_NAME, 'container')
service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
for service in service_sections:
textual_div = service.find_element(By.CLASS_NAME, 'textCol')
something = textual_div.find_element(By.CLASS_NAME, 'serviceSubsetTitle')
print("Text: ", something.text)
if __name__ == '__main__':
# try:
links = get_services_links()
for link in links:
get_service_data(link)
break
driver.quit()
What you need is this:
something.get_attribute('innerText') because, perhaps, due to the added animation, the regular text is not working.
Also, I have removed a few lines as I thought they were not needed (at least for this exercise). I have directly added a loop to make it work with serviceSubsetTitle
def get_service_data(link):
driver.get(link)
wait = WebDriverWait(driver, 10)
service_name_xpath = '//*[#id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
service_name = driver.find_element(By.XPATH, service_name_xpath).text
print("Service Name: ", service_name)
# ---- removed these lines --------
# row serviceSubsetRow ng-star-inserted
# wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
# services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
#
# container = services_wrapper.find_element(By.CLASS_NAME, 'container')
# service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
# ----- End of lines removal ----------
# Clicking out the cookie acceptance button
try:
driver.find_element(By.XPATH, "//*[#class='cc-btn cc-allow']").click()
except:
print("nothing there")
# --- removed these lines
# for service in service_sections:
# textual_div = service.find_element(By.CLASS_NAME, 'textCol')
# time.sleep(3)
# --- end of lines removal ---------
# These are my lines here from below:
somethings = driver.find_elements(By.XPATH, "//*[contains(#class, 'serviceSubsetTitle')]")
print(len(somethings))
for something in somethings:
# time.sleep(2)
title_txt = something.get_attribute('innerText')
print(title_txt)
here is the output:
Service Name: Sea Services
5
Vessel Management and Marine Services
Passenger Handling and Cargo Operations
Issuance of Certificates and Approvals in Ports
Ports Licensing
Property Leasing Services - Ports
Process finished with exit code 0
This is one way of scrolling that page down:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.atlp.ae/en'
browser.get(url)
browser.execute_script('window.scrollBy(0, 100);')
cookie_b = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[#aria-label='deny cookies']")))
cookie_b.click()
body = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "main-scrollbar")))
body.click()
body.send_keys(Keys.END)
print('scrolled down')
Setup is chrome/chromedriver on linux, however it can be adapted to your system, just observe the imports, and the code after defining the browser/driver. Selenium docs: https://www.selenium.dev/documentation/

Python Selenium for getting the whole content of table in Reactjs

I tried to scrape the content of table from wyscout.com, which seems that is built by Reacjs.
After log in, script selects the country(e.g. England), League(e.g. Premier League), Team(e.g. Arsenal). Here choose Stats tab.
Then, it shows the table to scrape the data. Even if there is a button to export a excel file, I want to scrape the content manually using selenium or beautifulsoup.
However, script gets only 18 rows even though the number of rows on the table is more than 100.
Please let me know the solution.
Thanks.
Here is my code.
from re import search
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import string
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Load Chrome Browser
show_browser = True
options = Options()
# options.add_argument('--headless')
scraped_data = []
def bot_driver(url, user_name, user_password):
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.get(url)
driver.maximize_window()
time.sleep(2)
# Log in
# login = driver.find_element_by_xpath("//ul[#id='avia-menu']/li[5]/a")
# login.click()
time.sleep(10)
idd = driver.find_element_by_xpath("//input[#id='login_username']")
idd.send_keys(user_name)
passW = driver.find_element_by_xpath("//input[#id='login_password']")
passW.send_keys(user_password)
time.sleep(2)
submit = driver.find_element_by_xpath("//button[#id='login_button']")
submit.click()
time.sleep(10)
try:
force_login = driver.find_element_by_xpath("//button[#class='btn2_zFM sc-jDwBTQ cUKaFo -block3u2Qh -primary1dLZk']")
force_login.click()
print('force loging')
except :
print('force login error')
return driver
def select_country(driver, country_name):
# Specific Country
# country = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[#commandsource='list#area_list#30']")))
# country = driver.find_element_by_xpath("//div[#commandsource='list#area_list#30']")
# All the countries
list_country = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//div[#id='detail_0_home_navy']/div[1]/div/div")))
time.sleep(3)
for entry in list_country:
if country_name == entry.text:
print('country click here')
entry.click()
return driver, 1
return driver, 0
def select_league(driver, league_name):
# Specific League
# league = driver.find_element_by_xpath("//div[#commandsource='list#competition_list#0']")
# All the leagues
list_league = driver.find_elements_by_xpath("//div[#id='detail_0_area_navy_0']/div[1]/div/div")
for entry in list_league:
if league_name == entry.text:
entry.click()
return driver, 1
return driver, 0
def select_team(driver, team_names):
# Specific Team
# team = driver.find_element_by_xpath("//div[#commandsource='list#team_list#0']")
flag_team = 0
list_team = driver.find_elements_by_xpath("//div[#id='detail_0_competition_navy_0']/div[1]/div/div")
for entry in list_team:
if entry.text in team_names:
flag_team = 1
print('selected team = ', entry.text)
entry.click()
time.sleep(2)
# Stats
stats = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Stats')))
stats.click()
time.sleep(3)
WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.XPATH, "//div[#id='detail_0_team_stats']/div/div/div/main/div[3]/div[2]/div/table")))
content_stats = driver.page_source
soup_stats = BeautifulSoup(content_stats, "html.parser")
table_stats = soup_stats.find('table', attrs={'class': 'teamstats__Index-module__table___1K93L teamstats__Index-module__with-opp___16Rp5'})
# print(table_stats)
tbody_stats = table_stats.find('tbody')
tr_stats = tbody_stats.find_all('tr')
print('number of tr = ', len(tr_stats))
# Return to team selection
back_team = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[#id='detail_0_team_back']")))
back_team.click()
time.sleep(5)
if flag_team == 1:
return driver, 1
else:
return driver, 0
if __name__ == "__main__":
# User input
# Login - wyscout_url = 'https://wyscout.com/'
wyscout_url = 'https://platform.wyscout.com/app/?/'
wyscout_user_name = '' # username
wyscout_user_password = '' # password
wyscout_driver = bot_driver(wyscout_url, wyscout_user_name, wyscout_user_password)
time.sleep(10)
# Select a Country
country = 'England' # .upper()
wyscout_driver, succeed = select_country(wyscout_driver, country)
if succeed == 0:
print('NO country!')
time.sleep(7)
# Select a league
league = 'Premier League' # .upper()
wyscout_driver, succeed = select_league(wyscout_driver, league)
if succeed == 0:
print('NO League!')
time.sleep(7)
# Select team
team_options = ['Arsenal']
wyscout_driver, succeed = select_team(wyscout_driver, team_options)
time.sleep(7)
if succeed == 0:
print('NO Team!')
time.sleep(7)
print('!!!Wyscout END!!!')
# wyscout_driver.quit()
Finally I figured it out by myself.
Here is my solution.
# Scroll down
print('scroll down')
last_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
time.sleep(3)
while True:
driver.execute_script("arguments[0].scrollBy(0,arguments[0].scrollHeight)", table_stats)
time.sleep(5)
new_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
if new_height == last_height:
break
last_height = new_height
print('scroll end')

How to open and access multiple (nearly 50) tabs in Chrome using ChromeDriver and Selenium through Python

I'm trying to gather some information from certain webpages using selenium and python.I have a working code for a single tab. But now i have a situation where i need to open 50 tabs in chrome at once and process each page data.
1) So open 50 tabs at once - The code i got already
2) Change the control between tabs and process the information from the page and close the tab and move to next tab and do the same.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import psycopg2
import os
import datetime
final_results=[]
positions=[]
saerched_url=[]
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
#options.add_argument('--headless')
options.add_argument("—-incognito")
browser = webdriver.Chrome(executable_path='/users/user_123/downloads/chrome_driver/chromedriver', chrome_options=options)
browser.implicitly_wait(20)
#def db_connect():
try:
DSN = "dbname='postgres' user='postgres' host='localhost' password='postgres' port='5432'"
TABLE_NAME = 'staging.search_url'
conn = psycopg2.connect(DSN)
print("Database connected...")
cur = conn.cursor()
cur.execute("SET datestyle='German'")
except (Exception, psycopg2.Error) as error:
print('database connection failed')
quit()
def get_products(url):
browser.get(url)
names = browser.find_elements_by_xpath("//span[#class='pymv4e']")
upd_product_name_list=list(filter(None, names))
product_name = [x.text for x in upd_product_name_list]
product = [x for x in product_name if len(x.strip()) > 2]
upd_product_name_list.clear()
product_name.clear()
return product
links = ['https://www.google.com/search?q=Vitamin+D',
'https://www.google.com/search?q=Vitamin+D3',
'https://www.google.com/search?q=Vitamin+D+K2',
'https://www.google.com/search?q=D3',
'https://www.google.com/search?q=Vitamin+D+1000']
for link in links:
# optional: we can wait for the new tab to open by comparing window handles count before & after
tabs_count_before = len(browser.window_handles)
# open a link
control_string = "window.open('{0}')".format(link)
browser.execute_script(control_string)
# optional: wait for windows count to increment to ensure new tab is opened
WebDriverWait(browser, 1).until(lambda browser: tabs_count_before != len(browser.window_handles))
# get list of currently opened tabs
tabs_list = browser.window_handles
print(tabs_list)
# switch control to newly opened tab (the last one in the list)
last_tab_opened = tabs_list[len(tabs_list)-1]
browser.switch_to_window(last_tab_opened)
# now you can process data on the newly opened tab
print(browser.title)
for lists in tabs_list:
last_tab_opened = tabs_list[len(tabs_list)-1]
browser.switch_to_window(last_tab_opened)
filtered=[]
filtered.clear()
filtered = get_products(link)
saerched_url.clear()
if not filtered:
new_url=link+'+kaufen'
get_products(link)
print('Modified URL :'+link)
if filtered:
print(filtered)
positions.clear()
for x in range(1, len(filtered)+1):
positions.append(str(x))
saerched_url.append(link)
gobal_position=0
gobal_position=len(positions)
print('global postion first: '+str(gobal_position))
print("\n")
company_name_list = browser.find_elements_by_xpath("//div[#class='LbUacb']")
company = []
company.clear()
company = [x.text for x in company_name_list]
print('Company Name:')
print(company, '\n')
price_list = browser.find_elements_by_xpath("//div[#class='e10twf T4OwTb']")
price = []
price.clear()
price = [x.text for x in price_list]
print('Price:')
print(price)
print("\n")
urls=[]
urls.clear()
find_href = browser.find_elements_by_xpath("//a[#class='plantl pla-unit-single-clickable-target clickable-card']")
for my_href in find_href:
url_list=my_href.get_attribute("href")
urls.append(url_list)
print('Final Result: ')
result = zip(positions,filtered, urls, company,price,saerched_url)
final_results.clear()
final_results.append(tuple(result))
print(final_results)
print("\n")
print('global postion end :'+str(gobal_position))
i=0
try:
for d in final_results:
while i <= gobal_position:
print( d[i])
cur.execute("""INSERT into staging.pla_crawler_results(position, product_name, url,company,price,searched_url) VALUES (%s, %s, %s,%s, %s,%s)""", d[i])
print('Inserted succesfully')
conn.commit()
i=i+1
except (Exception, psycopg2.Error) as error:
print (error)
pass
browser.close()
Ideally you shouldn't attempt to open 50 tabs at once as:
Handling 50 concurrent TABs through Selenium will invite complicated logic/algorithm to maintain.
Additionally, you may run into CPU and memory usage issues as:
Chrome maintains many processes.
Where as at times Firefox uses too much RAM
Solution
If you are having a List of the urls as follows:
['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
You can iterate over the list to open them one by one in the adjacent tab for scraping using the following Locator Strategy:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.keys import Keys
links = ['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
for link in links:
driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get(link)
print(driver.title)
print("Perform webscraping here")
driver.quit()
print("End of program")
Console Output:
Downloads
Perform webscraping here
The Selenium Browser Automation Project :: Documentation for Selenium
Perform webscraping here
End of program
Reference
You can find a relevant detailed discussion in:
WebScraping JavaScript-Rendered Content using Selenium in Python

Selenium Web Scraping Id missing

I am trying to gather the data from the http://maharain.gov.in/ site. I have written the below script.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
import csv,sys,os
from bs4 import BeautifulSoup
import time
def check_exists_by_xpath(xpath,driver):
try:
driver.find_element_by_xpath(xpath)
except NoSuchElementException:
return False
return True
chromepath= '/home/swapnil/Downloads/chromedriver'
driver = webdriver.Chrome(chromepath)
start_time =time.time()
base_url = "http://maharain.gov.in/"
driver.get(base_url)
driver.switch_to.frame('MenuFrame')
driver.find_element_by_name("QueriesCirclewise3").click()
time.sleep(3)
print("Done")
driver.find_element(By.XPATH, '//*[#id="menu"]/input[10]').click()
time.sleep(3)
print("Done")
# driver.find_element_by_name("PastQueriesCirclewise6").click()
time.sleep(3)
print("Done")
# driver.implicitly_wait(3)
driver.switch_to.default_content()
time.sleep(3)
print("Done")
driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))
# dropdown_menu_year = Select(driver.find_element_by_id("selyear"))
# select_year = [option.text for option in dropdown_menu_year.options]
# select_year = [t for t in select_year if t !='Select']
# select_year = [ '2015', '2016', '2017']
time.sleep(3)
print("Done All ")
dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
select_state = [option.text for option in dropdown_menu_state.options]
select_state = [t for t in select_state if t !='Select']
dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
select_dist = [option.text for option in dropdown_menu_dist.options]
select_dist = [t for t in select_dist if t !='Select']
dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
select_mon = [option.text for option in dropdown_menu_month.options]
select_mon = [t for t in select_mon if t !='Select']
i = 0
year=str(2018)
# for year in select_year:
if not os.path.exists(year):
os.makedirs(year)
for state in select_state:
for dist in select_dist:
if not os.path.exists(year+'/'+dist):
os.makedirs(year+'/'+dist)
for month in select_mon:
print (i)
# dropdown_menu_year = Select(driver.find_element_by_id("selyear"))
# dropdown_menu_year.select_by_visible_text(year)
dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
dropdown_menu_state.select_by_visible_text(state)
time.sleep(1)
dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
dropdown_menu_dist.select_by_visible_text(dist)
if(dist=='Wardha' or dist=='Washim' or dist=='Yavatmal'):
# time.sleep(2)
dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
dropdown_menu_month.select_by_visible_text(month)
time.sleep(2)
driver.find_element_by_name("btnshow").click()
time.sleep(2)
print("Done")
driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, 'body > embed'))
if (check_exists_by_xpath('//*[#id="tableID"]',driver)):
tab = driver.find_element(By.XPATH,'//*[#id="tableID"]')
soup = BeautifulSoup (driver.page_source)
table = soup.select_one('table')
data = [[td.text for td in row.find_all("td")] for row in table.find_all("tr")]
file_name = year+'/'+dist+'/'+year+'_'+dist+'_'+month+'.csv'
print(file_name)
f = open(file_name,'w', newline='')
writer =csv.writer(f)
writer.writerows(data)
f.close()
i+=1
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))
print ( time.time() - start_time)
print(i)
But each time I run the code it gets stuck at different locations with errors like missing selector id "selstate" or "body > embed" not present, which may run correctly in the next run without any changes to the code and may get stuck at the different location.
I have tried adding driver implicit wait and thread sleep with value set to 5 and less.Please point out what should be the correct measure to make it run in one go and where should be the wait or sleep statements to be added or any other changes if required.

Save to excel file using openpyxl instead of csv

The code below is working and is currently saving to a csv file, however I want to save to an excel file instead using openpyxl. I attempted it further below but had no success. I'd eventually like to save this to an existing sheet and be able to overwrite the existing data. Can anyone help? Thanks
Working Code:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import csv
import urllib
def get_elements_by_xpath(driver, xpath):
return [entry.text for entry in driver.find_elements_by_xpath(xpath)]
url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)
try:
selector = '.js-field-total.tv-screener-table__field-value--total'
condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
matches = WebDriverWait(driver, 10).until(condition)
matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
print ('Problem finding matches, setting default...')
matches = 4895 # Set default
# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)
for _ in range(num_loops):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(1) # Pause briefly to allow loading time
search_entries = [
("tickers", "//tbody/tr/td[1]/div/a"),
("rev annual", "//tbody/tr/td[10]"),
("income", "//tbody/tr/td[11]")]
with open('textfile.csv', 'w+', newline= '' ) as f_output:
csv_output = csv.writer(f_output)
# Write header
csv_output.writerow([name for name, xpath in search_entries])
entries = []
for name, xpath in search_entries:
entries.append(get_elements_by_xpath(driver, xpath))
csv_output.writerows(zip(*entries))
Tried this:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook
import urllib
wb = Workbook(write_only=True)
ws = wb.create_sheet()
def get_elements_by_xpath(driver, xpath):
return [entry.text for entry in driver.find_elements_by_xpath(xpath)]
url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)
try:
selector = '.js-field-total.tv-screener-table__field-value--total'
condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
matches = WebDriverWait(driver, 10).until(condition)
matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
print ('Problem finding matches, setting default...')
matches = 4895 # Set default
# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)
for _ in range(num_loops):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(1) # Pause briefly to allow loading time
search_entries = [
("tickers", "//tbody/tr/td[1]/div/a"),
("rev annual", "//tbody/tr/td[10]"),
("income", "//tbody/tr/td[11]")]
entries = []
for name, xpath in search_entries:
entries.append(get_elements_by_xpath(driver, xpath))
wb.save('new_big_file.xlsx')

Resources