Save to excel file using openpyxl instead of csv - python-3.x

The code below works and currently saves to a csv file; however, I want to save to an Excel file instead using openpyxl. I attempted it further below but had no success. I'd eventually like to save this to an existing sheet and be able to overwrite the existing data. Can anyone help? Thanks
Working Code:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import csv
import urllib
def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)

try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default

# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)

for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time

search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]

with open('textfile.csv', 'w+', newline='') as f_output:
    csv_output = csv.writer(f_output)
    # Write header
    csv_output.writerow([name for name, xpath in search_entries])
    entries = []
    for name, xpath in search_entries:
        entries.append(get_elements_by_xpath(driver, xpath))
    csv_output.writerows(zip(*entries))
Tried this:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook
import urllib
wb = Workbook(write_only=True)
ws = wb.create_sheet()

def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)

try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default

# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)

for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time

search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]

entries = []
for name, xpath in search_entries:
    entries.append(get_elements_by_xpath(driver, xpath))

wb.save('new_big_file.xlsx')
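For context, what I'm ultimately aiming for is roughly the following (an untested sketch; 'existing_file.xlsx' and 'Sheet1' are placeholders for my real workbook and sheet name), writing the same header and rows that the csv version writes:
from openpyxl import load_workbook

wb = load_workbook('existing_file.xlsx')  # placeholder file name
ws = wb['Sheet1']                         # placeholder sheet name

# Clear the old rows before overwriting with fresh data
ws.delete_rows(1, ws.max_row)

# Header row, then the transposed data rows, same as the csv version
ws.append([name for name, xpath in search_entries])
for row in zip(*entries):
    ws.append(list(row))

wb.save('existing_file.xlsx')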

Related

Unable to scrape texts from URLs

I have been struggling to scrape the contents/text of news articles from each URL. The extraction of the URLs works fine, but scraping the text from each URL has been challenging. Below is my code:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []

# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, ".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)

    for j in article_link:
        news_response = requests.get(j)
        news_data = news_response.content
        news_soup = BeautifulSoup(news_data, 'html.parser')
        art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
        full_text.append(art_cont.text)

print(article_link)
print(full_text)
I tried to use beautifulsoup, but it doesn't seem to work. I will be grateful for any help.
First off, you should probably unindent the second for loop; it shouldn't run inside the first loop (otherwise you re-fetch the same information countless extra times).
Second, the requests you send return a webpage whose content is blocked (I could not figure out a way around this by inserting headers into the request). What you could do instead is use the driver to load each of the links and grab the text from there. Here is how you could do that:
for link in article_link:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
The full script would look like this:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []

# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, ".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)

for link in article_link[:5]:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))

print(article_link)
print(full_text)
The best course of action is to use Selenium throughout, as the site's content is protected by Cloudflare. Although @Andrew Ryan has already addressed the issue, I thought I'd post a shorter version of it, since this answer was already halfway written at the time of his posting.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
link = 'https://www.iol.co.za/news/south-africa/eastern-cape'
def get_links_and_texts(driver, url):
    driver.get(url)
    for article_link in [i.get_attribute('href') for i in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//article/a[starts-with(@class,'Link__StyledLink')]")))]:
        driver.get(article_link)
        art_content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".article-content"))).text
        yield {"Link": article_link, "article_content": art_content}

if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        for item in get_links_and_texts(driver, link):
            print(item)

Selenium fails to scroll down

I am using Selenium to scrape data from here. The website uses an animation to reveal the sections after you scroll down. I am trying to scroll down to the footer and wait for the animation so I can get the data from the page.
I am not sure that's the only approach that will get me the data, though, because I can see that the animation only adds the class aos-animate to the main class, and if that class is not on the HTML element, the text is not returned.
In the get_service_data function, I am trying to scroll down to the end of the page. I tried scrolling down before starting the loop.
I tried:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
html = driver.find_element(By.CLASS_NAME, 'html')
html.send_keys(Keys.END)
html.send_keys(Keys.PAGE_DOWN)
copyright = driver.find_element(By.CLASS_NAME, 'copyright')
driver.execute_script("arguments[0].scrollIntoView();", copyright)
Here is my full script:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
language = "en" # to take this from the user
main_link = f"https://www.atlp.ae/{language}"
driver_path = os.path.join(os.getcwd(), "chromedriver")
# options = webdriver.ChromeOptions()
# options.headless = True
driver = webdriver.Chrome(driver_path) # options=options
driver.maximize_window()
def get_services_links():
    links = []
    driver.get(main_link)
    services_header_xpath = '//*[@id="fixed-header"]/div/div[2]/div/nav/ul/li[5]/button'
    driver.find_element(By.XPATH, services_header_xpath).click()
    services_menu_xpath = '//*[@id="serviceInfotitle"]/nav/ul'
    services_menu = driver.find_element(By.XPATH, services_menu_xpath)
    options = services_menu.find_elements(By.TAG_NAME, "li")
    for option in options:
        a_tag = option.find_element(By.TAG_NAME, "a")
        links.append(a_tag.get_attribute("href"))
    return links[:-1] if len(links) > 0 else []

def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)
    # row serviceSubsetRow ng-star-inserted
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    for service in service_sections:
        textual_div = service.find_element(By.CLASS_NAME, 'textCol')
        something = textual_div.find_element(By.CLASS_NAME, 'serviceSubsetTitle')
        print("Text: ", something.text)

if __name__ == '__main__':
    # try:
    links = get_services_links()
    for link in links:
        get_service_data(link)
        break
    driver.quit()
What you need is this:
something.get_attribute('innerText'), because, perhaps due to the added animation, the regular .text is not working.
Also, I have removed a few lines I thought were not needed (at least for this exercise) and added a loop directly over the serviceSubsetTitle elements:
def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)

    # ---- removed these lines --------
    # row serviceSubsetRow ng-star-inserted
    # wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    # services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    #
    # container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    # service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    # ----- End of lines removal ----------

    # Clicking out the cookie acceptance button
    try:
        driver.find_element(By.XPATH, "//*[@class='cc-btn cc-allow']").click()
    except:
        print("nothing there")

    # --- removed these lines ---------
    # for service in service_sections:
    #     textual_div = service.find_element(By.CLASS_NAME, 'textCol')
    #     time.sleep(3)
    # --- end of lines removal ---------

    # These are my lines here from below:
    somethings = driver.find_elements(By.XPATH, "//*[contains(@class, 'serviceSubsetTitle')]")
    print(len(somethings))
    for something in somethings:
        # time.sleep(2)
        title_txt = something.get_attribute('innerText')
        print(title_txt)
here is the output:
Service Name: Sea Services
5
Vessel Management and Marine Services
Passenger Handling and Cargo Operations
Issuance of Certificates and Approvals in Ports
Ports Licensing
Property Leasing Services - Ports
Process finished with exit code 0
This is one way of scrolling that page down:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.atlp.ae/en'
browser.get(url)
browser.execute_script('window.scrollBy(0, 100);')
cookie_b = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='deny cookies']")))
cookie_b.click()
body = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "main-scrollbar")))
body.click()
body.send_keys(Keys.END)
print('scrolled down')
The setup is Chrome/chromedriver on Linux, but it can be adapted to your system: just mind the imports and the code after the browser/driver is defined. Selenium docs: https://www.selenium.dev/documentation/

How to click the next span value which has same class name

import urllib3
import certifi
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
import time
import ssl
http = urllib3.PoolManager(ca_certs=certifi.where())
chrome_options = Options()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
URL= "https://physicians.wustl.edu/"
driver.get(URL)
time.sleep(5)
driver.find_element_by_link_text("Find a Doctor").click()
find_doc = driver.current_url
print(find_doc)
driver.get(find_doc)
# content = driver.page_source
# print(content)
response = http.request('GET', find_doc)
url_text = response.data #text
time.sleep(10)
count = len(driver.find_elements_by_xpath("//span[@class='entry-title-link']"))
print(count)
s = driver.find_element_by_css_selector("span[class='entry-title-link']") #firstpage click
s.click()
urls = []
provider = []
print(driver.current_url)
urls.append(driver.current_url)
name = driver.find_element_by_css_selector("h1[class='washu-ppi-name entry-title']").text
print(name)
provider.append(name)
specialization = driver.find_element_by_css_selector("ul[class='wuphys-specialties']").text
print(specialization)
location= driver.find_element_by_css_selector("a[class='wuphys-addr name']").text
print(location)
time.sleep(5)
driver.find_element_by_css_selector("a[href='https://physicians.wustl.edu/find-a-doctor/']").click()
time.sleep(10)
I have spans with the same class name that I need to loop over, but each one sits in a different div. On the page there is a doctor's name with details; after clicking it I get the details, and then I need to move on to the next doctor, which has the same class name.
I think you are looking for something of this kind (looping through all the doctor links and getting the info from each). Here I have written a basic version which you can extend to collect more data for each doctor.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
import time

chrome_options = Options()  # reuse your --incognito option here if you want
driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
driver.maximize_window()
driver.get("https://physicians.wustl.edu/")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.LINK_TEXT, "Find a Doctor"))).click()
print(driver.current_url)
doc_cnt = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[@class='entry-title-link']")))
print(len(doc_cnt))
doc_list = []  # to append all the doctor urls into the list for further processing, if required.
for doc in doc_cnt:
    # Ctrl+click opens the doctor link in a new tab
    ActionChains(driver).key_down(Keys.CONTROL).click(doc).key_up(Keys.CONTROL).perform()
    driver.switch_to.window(driver.window_handles[1])
    doc_list.append(driver.current_url)
    # ... you could include any code of yours related to each doctor here...
    # After this the tab is closed and the next doctor link will be opened
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    time.sleep(1)
print(doc_list)
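For instance (untested, simply reusing the selectors from your own script), the per-doctor part of the loop could look like this:
name = driver.find_element(By.CSS_SELECTOR, "h1[class='washu-ppi-name entry-title']").text
specialization = driver.find_element(By.CSS_SELECTOR, "ul[class='wuphys-specialties']").text
print(name, specialization)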

How to open and access multiple (nearly 50) tabs in Chrome using ChromeDriver and Selenium through Python

I'm trying to gather some information from certain webpages using Selenium and Python. I have working code for a single tab, but now I have a situation where I need to open 50 tabs in Chrome at once and process the data on each page.
1) Open 50 tabs at once - I already have the code for this.
2) Switch control between tabs, process the information on the page, close the tab, move on to the next tab, and do the same.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import psycopg2
import os
import datetime
final_results = []
positions = []
saerched_url = []

options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
#options.add_argument('--headless')
options.add_argument("--incognito")
browser = webdriver.Chrome(executable_path='/users/user_123/downloads/chrome_driver/chromedriver', chrome_options=options)
browser.implicitly_wait(20)

#def db_connect():
try:
    DSN = "dbname='postgres' user='postgres' host='localhost' password='postgres' port='5432'"
    TABLE_NAME = 'staging.search_url'
    conn = psycopg2.connect(DSN)
    print("Database connected...")
    cur = conn.cursor()
    cur.execute("SET datestyle='German'")
except (Exception, psycopg2.Error) as error:
    print('database connection failed')
    quit()

def get_products(url):
    browser.get(url)
    names = browser.find_elements_by_xpath("//span[@class='pymv4e']")
    upd_product_name_list = list(filter(None, names))
    product_name = [x.text for x in upd_product_name_list]
    product = [x for x in product_name if len(x.strip()) > 2]
    upd_product_name_list.clear()
    product_name.clear()
    return product

links = ['https://www.google.com/search?q=Vitamin+D',
         'https://www.google.com/search?q=Vitamin+D3',
         'https://www.google.com/search?q=Vitamin+D+K2',
         'https://www.google.com/search?q=D3',
         'https://www.google.com/search?q=Vitamin+D+1000']

for link in links:
    # optional: we can wait for the new tab to open by comparing window handles count before & after
    tabs_count_before = len(browser.window_handles)
    # open a link
    control_string = "window.open('{0}')".format(link)
    browser.execute_script(control_string)
    # optional: wait for windows count to increment to ensure new tab is opened
    WebDriverWait(browser, 1).until(lambda browser: tabs_count_before != len(browser.window_handles))
    # get list of currently opened tabs
    tabs_list = browser.window_handles
    print(tabs_list)
    # switch control to newly opened tab (the last one in the list)
    last_tab_opened = tabs_list[len(tabs_list) - 1]
    browser.switch_to_window(last_tab_opened)
    # now you can process data on the newly opened tab
    print(browser.title)

    for lists in tabs_list:
        last_tab_opened = tabs_list[len(tabs_list) - 1]
        browser.switch_to_window(last_tab_opened)
        filtered = []
        filtered.clear()
        filtered = get_products(link)
        saerched_url.clear()
        if not filtered:
            new_url = link + '+kaufen'
            get_products(link)
            print('Modified URL :' + link)
        if filtered:
            print(filtered)
            positions.clear()
            for x in range(1, len(filtered) + 1):
                positions.append(str(x))
                saerched_url.append(link)
            gobal_position = 0
            gobal_position = len(positions)
            print('global postion first: ' + str(gobal_position))
            print("\n")
            company_name_list = browser.find_elements_by_xpath("//div[@class='LbUacb']")
            company = []
            company.clear()
            company = [x.text for x in company_name_list]
            print('Company Name:')
            print(company, '\n')
            price_list = browser.find_elements_by_xpath("//div[@class='e10twf T4OwTb']")
            price = []
            price.clear()
            price = [x.text for x in price_list]
            print('Price:')
            print(price)
            print("\n")
            urls = []
            urls.clear()
            find_href = browser.find_elements_by_xpath("//a[@class='plantl pla-unit-single-clickable-target clickable-card']")
            for my_href in find_href:
                url_list = my_href.get_attribute("href")
                urls.append(url_list)
            print('Final Result: ')
            result = zip(positions, filtered, urls, company, price, saerched_url)
            final_results.clear()
            final_results.append(tuple(result))
            print(final_results)
            print("\n")
            print('global postion end :' + str(gobal_position))
            i = 0
            try:
                for d in final_results:
                    while i <= gobal_position:
                        print(d[i])
                        cur.execute("""INSERT into staging.pla_crawler_results(position, product_name, url,company,price,searched_url) VALUES (%s, %s, %s,%s, %s,%s)""", d[i])
                        print('Inserted succesfully')
                        conn.commit()
                        i = i + 1
            except (Exception, psycopg2.Error) as error:
                print(error)
                pass
        browser.close()
Ideally you shouldn't attempt to open 50 tabs at once, as handling 50 concurrent tabs through Selenium invites complicated logic that is hard to maintain. Additionally, you may run into CPU and memory usage issues: Chrome maintains many processes, whereas Firefox at times uses too much RAM.
Solution
If you have a list of the urls as follows:
['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
You can iterate over the list and open the urls one by one for scraping, as follows:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.keys import Keys
links = ['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
for link in links:
    driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
    driver.get(link)
    print(driver.title)
    print("Perform webscraping here")
    driver.quit()
print("End of program")
Console Output:
Downloads
Perform webscraping here
The Selenium Browser Automation Project :: Documentation for Selenium
Perform webscraping here
End of program
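If starting a fresh browser for every url turns out to be too heavy, a possible variation (an untested sketch, reusing the same links, options and chromedriver path as above) is to keep a single driver and simply navigate it from link to link:
driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
for link in links:
    driver.get(link)
    print(driver.title)
    print("Perform webscraping here")
driver.quit()
print("End of program")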
Reference
You can find a relevant detailed discussion in:
WebScraping JavaScript-Rendered Content using Selenium in Python

How do I add waits to my Selenium scraping program?

I am trying to scrape a website and save the information using Python and Selenium. The scrape is simple, and only requires choosing the state and district in two dropdown menus, clicking a submit button, and reading and writing a table to a csv.
I am confident my packages are installed correctly and my program even works, but only some of the time. My guess is that without the proper Selenium driver 'waits', my program crashes because it can't find the correct css_selector. I'll post the program below, and if anyone has any suggestions on how to correctly incorporate Selenium driver 'waits', I would very much appreciate the help.
Thanks so much, and here's the program:
import time
import re
import string
import urllib.parse
import pandas
import numpy
import os
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
url = "https://myhpgas.in/myHPGas/HPGas/LocateDistributor.aspx"
driver.set_window_size(1120, 550)
driver.get(url);
time.sleep(5)
stateList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState")
options = stateList.find_elements_by_tag_name("option")
optionsList = []
for option in options:
    optionsList.append(option.get_attribute("value"))
optionsList[1:len(optionsList)]
for optionValue in optionsList:
    select = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState"))
    select.select_by_value(optionValue)
    districtList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict")
    distOptions = districtList.find_elements_by_tag_name("option")
    distOptionsList = []
    for distOption in distOptions:  # iterate over the options, place attribute value in list
        distOptionsList.append(distOption.get_attribute("value"))
    for distOptionValue in distOptionsList[1:len(distOptionsList)]:
        distSelect = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict"))
        distSelect.select_by_value(distOptionValue)
        driver.find_element_by_css_selector('#ContentPlaceHolder1_btnShowList').click()
        data = []
        for tr in driver.find_elements_by_css_selector('#ContentPlaceHolder1_gvDistributor'):
            tds = tr.find_elements_by_tag_name('td')
            if tds:
                data.append([td.text for td in tds])
        print(data)
        dataRows = int(numpy.array(data).size / 7)
        rowsTimesColumns = (dataRows * 7) - 1
        newArray = numpy.array(data)
        outArray = newArray[0:rowsTimesColumns]
        test = pandas.DataFrame(outArray.reshape(dataRows, 7), columns=['no', 'distributor', 'address', 'contact1', 'contact2', 'contact3', 'map'])
        file_path = 'Users/outpath' + '_' + optionValue + '_' + distOptionValue + '.csv'
        test.to_csv(file_path, sep=',')
        driver.back()
    driver.back()
Can you tell me which line returns the error? Also, how about using XPaths?
I couldn't see any statement implementing explicit waits in your program, something like:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "your css selector")))
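For example (untested; the selectors are taken from your own script, and By needs to be imported), the fixed sleep and the table read could be guarded like this:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 30)

# instead of time.sleep(5) after driver.get(url)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#ContentPlaceHolder1_ddlState")))

# after clicking the Show List button, wait for the results table before reading it
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#ContentPlaceHolder1_gvDistributor")))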
