I have a web page I have been developing, and I am now stuck on a syntax error that I cannot find. Can someone please help me spot it? Here is the database code that is giving me the error:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from models import *
import datetime
import os  # File I/O
import time
import shutil
import glob
import configparser
config_parser = configparser.ConfigParser()
config_parser.read("config.ini")
pdownload_dir = os.path.abspath('./prism_downloads/')
dt = str(datetime.datetime.now())
filelist = glob.glob(pdownload_dir + '/*.html')  # was download_dir, which is never defined
dbpath = './db'
def db_Prism():
    database.connect()
    database.create_tables([Prism], safe=False)
    database.close()

for root, dir, files in os.walk(pdownload_dir):
    for file in files:
        print(file)
        file_markup = ''
        with open(os.path.abspath(os.path.join(pdownload_dir, file)), 'r') as html:
            file_markup = html.read()
        if file_markup == '':
            print('ERROR: File was not read')
        print('Reading {0} into BS4'.format(file))
        soup = BeautifulSoup(file_markup, 'html.parser')
        print('File parsed')
        data = []
        table = soup.find('table')
        rows = table.find_all('tr')  # 18th row is header row
        cols = rows[0].find_all('td')
        cols = [ele.text.strip() for ele in cols]
        database.connect()
        for row in rows[0:]:
            d = row.find_all('td')
            d = [ele.text.strip() for ele in d]
            data.append([ele for ele in d if ele])  # Get rid of empty values
            Prism.create(pmt_id=(d[1]),
                         old_status=d[3],
                         new_status=(d[4]),
                         last_updated=float(d[5])
        database.close()  # line 96 in the full file; this is where the error points
Now here is the error message from my console:
C:\Users\Documents\NetBeansProjects\BudgetHome>python prism.py
File "prism.py", line 96
database.close()
^
SyntaxError: invalid syntax
C:\Users\Documents\NetBeansProjects\BudgetHome>
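Python raises its SyntaxError on the first statement after an unterminated construct, so the complaint about line 96 (database.close()) almost certainly means the Prism.create(...) call just above it is missing its closing parenthesis. A minimal sketch of the corrected call, using the same arguments as in the question:

            Prism.create(pmt_id=d[1],
                         old_status=d[3],
                         new_status=d[4],
                         last_updated=float(d[5]))  # the closing parenthesis was missing
        database.close()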
New to programming here, so forgive the silly questions. I've been trying to work out how to use Python for web scraping, and a lot of the YouTube videos and other questions get me part of the way there, but I'm having a hard time relating the answers to my actual code.
My code so far is:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1"
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup_level1 = soup(driver.page_source, 'lxml')
race_soup = soup_level1.find("tbody", class_="f_fs13")
print(race_soup.text.strip())
results_soup = soup_level1.find("tbody", class_="f_fs12")
print(results_soup.text.strip())
datalist = []  # empty list
x = 0  # counter
print('good')
driver.close()
This generates the parsed data, but now I am stuck on how to move it from text into a pandas DataFrame. I'm sure it is simple, but none of the instructional material I've seen has made it click for me.
Also, the code so far is just chunks copied and pasted from different websites that I got working through trial and error. I'm not sure if any of it is redundant, so if there is a neater way to go about it, I would appreciate that feedback as well!
Thanks in advance,
Spencer
Give this a try:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1"
driver = webdriver.Chrome()
driver.get(url)
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))  # wait until the tables have rendered
htmlStr = driver.page_source
soup_level1 = soup(htmlStr, 'html.parser')
# pd.read_html() expects a <table>, so climb from each <tbody> to its parent table
race_soup = soup_level1.find('tbody', {'class': 'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody', {'class': 'f_fs12'}).find_parent('table')
df1 = pd.read_html(str(race_soup))[0]
print(df1)
df2 = pd.read_html(str(results_soup))[0]
print(df2)
datalist = [] #empty list
x = 0 #counter
print('good')
driver.close()
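pd.read_html() returns a list of DataFrames, one per table found in the markup, which is why the [0] index is needed above. If you then want the data on disk, the usual pattern is something like this (the filenames are just examples):

df1.to_csv('race.csv', index=False)
df2.to_csv('results.csv', index=False)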
I have extracted data from a table on a website using a for-loop with the Selenium webdriver. How do I convert that data into a DataFrame and export it to a CSV file? I tried to assign the 'value' within a pandas DataFrame, but it throws an error.
from selenium import webdriver

url = "https://www.jambalakadi.info/status/"
driver = webdriver.Chrome(executable_path="chromedriver.exe")
driver.get(url)
row_count = len(driver.find_elements_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr"))
col_count = len(driver.find_elements_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[1]/td"))
print('Number of row counts:', row_count)
print("Number of column counts:", col_count)
for r in range(2, row_count+1):
    for c in range(1, col_count+1):
        value = driver.find_element_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[" + str(r) + "]/td[" + str(c) + "]").text
        print(value, end=" ")
    print(" ")
When I run the for-loop, the 'value' variable prints the data, but I'm not able to create a DataFrame and export it to a CSV file using pandas. I updated the code; is it correctly formatted?
my_data = []
for r in range(2, row_count+1):
    for c in range(1, col_count+1):
        value = driver.find_element_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[" + str(r) + "]/td[" + str(c) + "]").text
        print(value, end=" ")
        for line in value:
            my_data.append(line[0],line[1],line[2])
        pd.DataFrame.from_records(my_data, columns=column).to_csv('output.csv')
    print(" ")
Here is the code using pandas to get the data into a DataFrame and then export it to CSV.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup

driver = webdriver.Chrome(executable_path="chromedriver.exe")
driver.get("https://yourwebsitename.com")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#main_table_countries_today")))
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', attrs={"id": "main_table_countries_today"})
df = pd.read_html(str(table))
print(df[0])
df[0].to_csv('output.csv', index=False)
Updated:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

driver = webdriver.Chrome(executable_path="chromedriver.exe")
driver.get("https://yourwebsitename.com")
element = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#main_table_countries_today")))
table = driver.execute_script("return arguments[0].outerHTML;", element)
df = pd.read_html(str(table))
print(df[0])
df[0].to_csv('output.csv', index=False)
You need to use the function pd.DataFrame.from_records()
Use case:
import pandas as pd

# Reading the data
my_data = []
for line in my_database:
    # preprocess the line (say you get 3 columns: date, customer, price)
    # e.g. line = line.split(" "); now the line is a list of values
    my_data.append([line[0], line[1], line[2]])  # the indices correspond to date, customer and price respectively
pd.DataFrame.from_records(my_data, columns=['date', 'customer', 'price']).to_csv('output.csv')
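Applied to the table from the question, here is a minimal sketch that collects one record per row instead of printing cell by cell; it reuses row_count, col_count and driver from above, keeps the thread's old find_element_by_xpath API, and uses placeholder column names:

import pandas as pd

my_data = []
for r in range(2, row_count + 1):
    row_values = []
    for c in range(1, col_count + 1):
        cell = driver.find_element_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[" + str(r) + "]/td[" + str(c) + "]")
        row_values.append(cell.text)
    my_data.append(row_values)  # one record per table row
columns = ['col' + str(i) for i in range(1, col_count + 1)]  # placeholder header names
pd.DataFrame.from_records(my_data, columns=columns).to_csv('output.csv', index=False)

Building each record as a list also avoids the my_data.append(line[0], line[1], line[2]) call in the update, which fails because list.append() takes a single argument.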
I have the following code:
from selenium import webdriver
import sys
import time
import os
import pyautogui
import webbrowser
import openpyxl
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from openpyxl import Workbook
from openpyxl import load_workbook
chrome_path = r"C:\Users\Desktop\webdriver\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("url string")

def login(driver):
    elem = driver.find_element_by_xpath("""//*[@id="usernameField"]""")
    elem.send_keys("username")
    elem2 = driver.find_element_by_xpath("""//*[@id="passwordField"]""")
    elem2.send_keys("password")
    driver.find_element_by_xpath("""//*[@id="loginForm"]/table/tbody/tr[4]/td[2]/input""").click()
    driver.find_element_by_xpath("""//*[@id="nav"]/ul/li[2]/a""").click()
    driver.find_element_by_xpath("""//*[@id="check1"]""").click()

def sendvalues(driver):
    wb = load_workbook('prueba.xlsx')
    coma = ","
    ws = wb.active
    buscar = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/div[2]/input[1]""")
    rut = driver.find_element_by_xpath("""//*[@id="rut"]""")
    dv = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[3]/td[2]/input""")
    nombre = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[4]/td[2]/input""")
    rutvalue = ws.cell(1, 1).value
    dvvalue = ws.cell(1, 2).value
    nombrevalue = ws.cell(1, 3).value
    rut.send_keys(rutvalue)
    dv.send_keys(dvvalue)
    nombre.send_keys(nombrevalue)
    buscar.click()
    table_elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//table[@class = 'grilla']")))
    for table_element in table_elements:
        for row in table_element.find_elements_by_xpath(".//tr"):
            text_file = open("Output2.txt", "a")
            text_file.write(str(rutvalue) + str(coma) + str(row.text) + '\n')
            text_file.close()
    clear(driver)

def clear(driver):
    rut = driver.find_element_by_xpath("""//*[@id="rut"]""")
    dv = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[3]/td[2]/input""")
    nombre = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[4]/td[2]/input""")
    rut.clear()
    dv.clear()
    nombre.clear()

login(driver)
sendvalues(driver)
The code sends the values of a single Excel row to a website and extracts the resulting table. I have three columns (rutvalue, dvvalue and nombrevalue) in the .xlsx file, and I need to send the values of each row to the corresponding input fields (rut, dv, nombre) on the web page and collect the table of results for each one. Is there a way to build a loop that iterates over the rows and gets the results?
You can create a class for the scraping code and, in another file, read the xlsx file and call the class in each iteration. Look at this example:
class ScrapingCode():
    def __init__(self, rut, dv, name):
        self.rut = rut
        self.dv = dv
        self.name = name

    def run(self):
        # to use the values here, reference them as self.rut, self.dv, self.name
        rut.send_keys(self.rut)
        # ...scraping code...
# another file
from app.folder.file import ScrapingCode

# read the xlsx rows
for row in rows:
    scrapingCode = ScrapingCode(row[0], row[1], row[2])
    scrapingCode.run()
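A minimal sketch of that second file, assuming the prueba.xlsx layout from the question and the hypothetical app.folder.file module path used above:

from openpyxl import load_workbook
from app.folder.file import ScrapingCode

wb = load_workbook('prueba.xlsx')
ws = wb.active
for row in ws.iter_rows(min_row=1, max_col=3, values_only=True):
    scraper = ScrapingCode(row[0], row[1], row[2])  # rut, dv, nombre
    scraper.run()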
I changed and improved the code, with satisfactory results:
from selenium import webdriver
import sys
import time
import os
import pyautogui
import webbrowser
import openpyxl
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from openpyxl import Workbook
from openpyxl import load_workbook
chrome_path = r"C:\Users\Desktop\webdriver\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("myurl")

def login(driver):
    elem = driver.find_element_by_xpath("""//*[@id="usernameField"]""")
    elem.send_keys("myusername")
    elem2 = driver.find_element_by_xpath("""//*[@id="passwordField"]""")
    elem2.send_keys("mypassword")
    driver.find_element_by_xpath("""//*[@id="loginForm"]/table/tbody/tr[4]/td[2]/input""").click()
    driver.find_element_by_xpath("""//*[@id="nav"]/ul/li[2]/a""").click()
    driver.find_element_by_xpath("""//*[@id="check1"]""").click()

def sendvalues(driver):
    wb = load_workbook('prueba.xlsx')
    ws = wb.active
    buscar = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/div[2]/input[1]""")
    rut = driver.find_element_by_xpath("""//*[@id="rut"]""")
    dv = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[3]/td[2]/input""")
    nombre = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[4]/td[2]/input""")
    rutvalue = ws.cell(row=x, column=1).value
    dvvalue = ws.cell(row=x, column=2).value
    nombrevalue = ws.cell(row=x, column=3).value
    rut.send_keys(rutvalue)
    dv.send_keys(dvvalue)
    nombre.send_keys(nombrevalue)
    buscar.click()
    table(driver)

def table(driver):
    rut = driver.find_element_by_xpath("""//*[@id="rut"]""")
    dv = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[3]/td[2]/input""")
    nombre = driver.find_element_by_xpath("""//*[@id="wrapper"]/form/table[1]/tbody/tr[4]/td[2]/input""")
    coma = ","
    wb = load_workbook('prueba.xlsx')
    ws = wb.active
    rutvalue = ws.cell(row=x, column=1).value
    table_elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//table[@class = 'grilla']")))
    for table_element in table_elements:
        for row in table_element.find_elements_by_xpath(".//tr"):
            text_file = open("Output2.txt", "a")
            text_file.write(str(rutvalue) + str(coma) + str(row.text) + '\n')
            text_file.close()
    rut.clear()
    dv.clear()
    nombre.clear()

login(driver)
for x in range(1, 1000):
    sendvalues(driver)
I put the for loop at the end of the code and used the loop variable x inside the sendvalues function, as seen in this part:
rutvalue = ws.cell(row=x, column=1).value
dvvalue = ws.cell(row=x, column=2).value
nombrevalue = ws.cell(row=x, column=3).value
Thanks for the comments!
When I parse table[3], it contains a subtable inside it, and that subtable gets printed again after the outer table is done. I have tried using the pandas read_html function.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if item.get_attribute('href') is not None]
View = View[0]
driver.get(View)
table = pd.read_html(driver.page_source)[3]
print(table)
with open("MyFile_table.csv", "a") as f:
    table.to_csv(f, sep=',', index=False)  # the with block closes the file; no f.close() needed
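pandas.read_html() parses a nested table twice: once inline as part of its parent and once as a separate entry in the returned list, which is why the subtable shows up again. One way around it is to drop the nested tables from the markup before handing it to pandas; here is a sketch under the question's assumptions (the index 3 comes from the question, and nesting can shift the numbering, so it may need adjusting):

from bs4 import BeautifulSoup
import pandas as pd

soup = BeautifulSoup(driver.page_source, 'html.parser')
target = soup.find_all('table')[3]       # the table the question indexes
for sub in target.find_all('table'):
    sub.decompose()                      # strip nested subtables in place
table = pd.read_html(str(target))[0]
print(table)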
The code below works and currently saves to a CSV file; however, I want to save to an Excel file instead, using openpyxl. I attempted it further below but had no success. I'd eventually like to save this to an existing sheet and be able to overwrite the existing data. Can anyone help? Thanks
Working Code:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import csv
import urllib
def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)

try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default

# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)
for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time

search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]

with open('textfile.csv', 'w+', newline='') as f_output:
    csv_output = csv.writer(f_output)
    # Write header
    csv_output.writerow([name for name, xpath in search_entries])
    entries = []
    for name, xpath in search_entries:
        entries.append(get_elements_by_xpath(driver, xpath))
    csv_output.writerows(zip(*entries))
Tried this:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook
import urllib
wb = Workbook(write_only=True)
ws = wb.create_sheet()

def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)

try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default

# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)
for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time

search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]

entries = []
for name, xpath in search_entries:
    entries.append(get_elements_by_xpath(driver, xpath))
wb.save('new_big_file.xlsx')
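The attempt above collects the columns into entries but never writes them to the worksheet, so the saved workbook comes out empty. A minimal sketch of the missing step, mirroring what the CSV version does with writerow/writerows (a write-only workbook is filled via ws.append()):

ws.append([name for name, xpath in search_entries])  # header row
for row in zip(*entries):                            # transpose column lists into rows
    ws.append(list(row))
wb.save('new_big_file.xlsx')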