How to convert for-loop-generated data into a DataFrame? - python-3.x

I have extracted data from a table on a website with a for loop, using the Selenium webdriver. How do I convert that data into a DataFrame and export it to a CSV file? I tried to assign the 'value' within a pandas DataFrame, but it throws an error.
from selenium import webdriver
url = "https://www.jambalakadi.info/status/"
driver = webdriver.Chrome(executable_path="chromedriver.exe")
driver.get(url)
row_count = len(driver.find_elements_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr"))
col_count = len(driver.find_elements_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[1]/td"))
print('Number of row counts:', row_count)
print("Number of column counts:", col_count)
for r in range(2, row_count+1):
    for c in range(1, col_count+1):
        value = driver.find_element_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[" + str(r) + "]/td[" + str(c) + "]").text
        print(value, end=" ")
    print(" ")
When I run the for loop, the 'value' variable prints the data, but I'm not able to create a DataFrame and export it to a CSV file using pandas.
I updated the code below; is it formatted correctly?
my_data = []
for r in range(2, row_count+1):
    for c in range(1, col_count+1):
        value = driver.find_element_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[" + str(r) + "]/td[" + str(c) + "]").text
        print(value, end=" ")
        for line in value:
            my_data.append(line[0], line[1], line[2])
        pd.DataFrame.from_records(my_data, columns=column).to_csv('output.csv')
    print(" ")

Here is code that uses pandas to get the data into a DataFrame and then export it to CSV.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
driver = webdriver.Chrome(executable_path="chromedriver.exe")
driver.get("https://yourwebsitename.com")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#main_table_countries_today")))
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', attrs={"id": "main_table_countries_today"})
df = pd.read_html(str(table))
print(df[0])
df[0].to_csv('output.csv', index=False)
Updated:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
driver = webdriver.Chrome(executable_path="chromedriver.exe")
driver.get("https://yourwebsitename.com")
element = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#main_table_countries_today")))
table = driver.execute_script("return arguments[0].outerHTML;", element)
df = pd.read_html(table)
print(df[0])
df[0].to_csv('output.csv', index=False)
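In both versions, note that pd.read_html returns a list containing one DataFrame per <table> element found in the HTML it is given, which is why the result is indexed as df[0] before printing and saving. A quick way to confirm what was parsed:
tables = pd.read_html(table)
print(len(tables))        # number of tables parsed from the HTML
print(tables[0].head())   # preview the first (and here only) table
tables[0].to_csv('output.csv', index=False)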

You need to use the function pd.DataFrame.from_records()
Use case:
import pandas as pd

# Reading the data
my_data = []
for line in my_database:
    # Preprocess the line (say you get 3 columns: date, customer, price).
    # If you use line.split(" "), your line is now actually a list of values (line = line.split(" ")).
    my_data.append([line[0], line[1], line[2]])  # each index corresponds to date, customer and price respectively

pd.DataFrame.from_records(my_data, columns=['date', 'customer', 'price']).to_csv('output.csv')
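Applied to the scraping loop from the question, a hedged sketch (assuming row_count and col_count are computed as in the question; the column names here are placeholders) would collect one list per table row and build the DataFrame once, after the loops finish:
my_data = []
for r in range(2, row_count + 1):
    row_values = []
    for c in range(1, col_count + 1):
        cell = driver.find_element_by_xpath("//*[@id='main_table_countries_today']/tbody[1]/tr[" + str(r) + "]/td[" + str(c) + "]").text
        row_values.append(cell)
    my_data.append(row_values)  # one list per table row
columns = ['col' + str(i) for i in range(1, col_count + 1)]  # placeholder column names
pd.DataFrame.from_records(my_data, columns=columns).to_csv('output.csv', index=False)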

Related

Python unable to refresh the execution of script

from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup as BSoup
import requests
import pandas as pd
from requests_html import HTMLSession
import time
import xlsxwriter
import re
import os
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path = 'geckodriver path')
# get web page
driver.get(urlpage)
time.sleep(10)
bs_obj = BSoup(driver.page_source, 'html.parser')
# Scrape table content
table = bs_obj.find('table', {"f_tac table_bd draggable"})
rows = table.find_all('tr')
table_content = []
for row in rows[1:]:
    cell_row = []
    for cell in row.find_all('td'):
        cell_row.append(cell.text.replace(" ", "").replace("\n\n", " ").replace("\n", ""))
    table_content.append(cell_row)
header_content = []
for cell in rows[0].find_all('td'):
    header_content.append(cell.text)
driver.close()
race_writer = pd.ExcelWriter('export path', engine='xlsxwriter')
df = pd.DataFrame(table_content, columns=header_content)
df.to_excel(race_writer, sheet_name='game1')
Hi all, I am trying to scrape the racing results from HKJC. When I executed the code above, one of the following happened: either no Excel file was created, or the DataFrame was not written to the Excel file (an empty Excel file was created).
Also, if I successfully scrape the results of game 1 and then amend the script to scrape game 2, it still gives me the results of game 1.
I'd appreciate it if anyone could help.
I changed your script to the one below. The approach is to click through each of the relevant "Sha Tin" buttons (see range(1, len(shatin)-1)) and collect the race table data. Race tables are added to a list called races. Finally, each race table is written to an individual sheet in Excel (note that you no longer need BeautifulSoup).
Add these to your list of imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Then:
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path='geckodriver path')
# get web page
driver.get(urlpage)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//table[@class='f_fs12 f_fr js_racecard']")))
shatin = driver.find_elements_by_xpath("//table[@class='f_fs12 f_fr js_racecard']/tbody/tr/td")
races = []
for i in range(1, len(shatin)-1):
    shatin = driver.find_elements_by_xpath("//table[@class='f_fs12 f_fr js_racecard']/tbody/tr/td")
    #time.sleep(3)
    #WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@class='performance']")))
    shatin[i].click()
    table = pd.read_html(driver.find_element_by_xpath("//div[@class='performance']").get_attribute('outerHTML'))[0]
    races.append(table)
with pd.ExcelWriter('races.xlsx') as writer:
    for i, race in enumerate(races):
        race.to_excel(writer, sheet_name=f'game{i+1}', index=False)
    writer.save()
driver.quit()
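One design note on the loop above: the shatin elements are re-located on every iteration because each click refreshes part of the page, and elements found before the click would otherwise go stale. If the race table is occasionally not ready when read_html runs, the commented-out wait can be re-enabled in place of time.sleep(3), e.g. (a sketch using the same locator already in the script):
shatin[i].click()
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@class='performance']")))
table = pd.read_html(driver.find_element_by_xpath("//div[@class='performance']").get_attribute('outerHTML'))[0]
races.append(table)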

How to scrape a table nested inside another table using pandas read_html

When I parse table[3], it contains a subtable inside it, and the subtable is printed again once the outer table is done. I have tried using the pandas read_html function.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if item.get_attribute('href') is not None]
View = View[0]
driver.get(View)
table = pd.read_html(driver.page_source)[3]
print(table)
with open("MyFile_table.csv", "a") as f:
    table.to_csv(f, sep=',', index=False)
f.close()
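When a table is nested inside another table, pd.read_html parses both the outer table and the inner one and returns them as separate entries in its result list, so the subtable's content shows up twice. A quick, hedged way to find the entry you actually want (the index 3 above is simply what matched on that particular page) is to print the shape of every parsed table and pick the right index:
tables = pd.read_html(driver.page_source)
for i, t in enumerate(tables):
    print(i, t.shape)            # inspect each parsed table
wanted = tables[3]               # choose the index that holds the data you need
wanted.to_csv("MyFile_table.csv", mode='a', index=False)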

Save to an Excel file using openpyxl instead of CSV

The code below works and currently saves to a CSV file; however, I want to save to an Excel file instead, using openpyxl. I attempted it further below but had no success. I'd eventually like to save to an existing sheet and be able to overwrite the existing data. Can anyone help? Thanks.
Working Code:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import csv
import urllib
def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)
try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default
# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)
for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time
search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]
with open('textfile.csv', 'w+', newline='') as f_output:
    csv_output = csv.writer(f_output)
    # Write header
    csv_output.writerow([name for name, xpath in search_entries])
    entries = []
    for name, xpath in search_entries:
        entries.append(get_elements_by_xpath(driver, xpath))
    csv_output.writerows(zip(*entries))
Tried this:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook
import urllib
wb = Workbook(write_only=True)
ws = wb.create_sheet()

def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)
try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default
# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)
for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time
search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]
entries = []
for name, xpath in search_entries:
    entries.append(get_elements_by_xpath(driver, xpath))
wb.save('new_big_file.xlsx')
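The attempt above collects the scraped columns but never writes anything to the worksheet, which is why the saved workbook stays empty. A minimal sketch of the missing part, assuming the same search_entries and get_elements_by_xpath as above (ws.append is supported for write-only workbooks in openpyxl):
ws.append([name for name, xpath in search_entries])   # header row
entries = []
for name, xpath in search_entries:
    entries.append(get_elements_by_xpath(driver, xpath))
for row in zip(*entries):
    ws.append(list(row))                               # one worksheet row per scraped row
wb.save('new_big_file.xlsx')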

Database for my website keeps giving me an "invalid syntax" error

I have been developing a web page and I am now stuck being told the syntax is wrong. I cannot find the error; can someone please help me find it? Here is the code for the database that is giving me the error:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from models import *
import os # File I/O
import time
import shutil
import glob
import configparser
config_parser = configparser.ConfigParser()
config_parser.read("config.ini")
pdownload_dir = os.path.abspath('./prism_downloads/')
dt = str(datetime.datetime.now())
filelist = glob.glob(download_dir + '/*.html')
dbpath = ('./db')
def db_Prism():
    database.connect()
    database.create_tables([Prism], safe=False)
    database.close()
    for root, dir, files in os.walk(pdownload_dir):
        for file in files:
            print(file)
            file_markup = ''
            with open(os.path.abspath(os.path.join(pdownload_dir, file)), 'r') as html:
                file_markup = html.read()
            if file_markup == '':
                print('ERROR: File was not read')
            print('Reading {0} into BS4'.format(file))
            soup = BeautifulSoup(file_markup, 'html.parser')
            print('File parsed')
            data = []
            table = soup.find('table')
            rows = table.find_all('tr')  # 18th row is header row
            cols = rows[0].find_all('td')
            cols = [ele.text.strip() for ele in cols]
            database.connect()
            for row in rows[0:]:
                d = row.find_all('td')
                d = [ele.text.strip() for ele in d]
                data.append([ele for ele in d if ele])  # Get rid of empty values
                Prism.create(pmt_id=(d[1]),
                             old_status=d[3],
                             new_status=(d[4]),
                             last_updated=float(d[5])
            database.close()  # line 96
Now here is the error message from my console:
C:\Users\Documents\NetBeansProjects\BudgetHome>python prism.py
File "prism.py", line 96
database.close()
^
SyntaxError: invalid syntax
C:\Users\Documents\NetBeansProjects\BudgetHome>
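For what it's worth, the SyntaxError is reported at database.close() only because the parser reaches that line while the Prism.create( call above it is still open; the call is missing its closing parenthesis. A hedged sketch of that block with the parenthesis added (field names taken from the code above):
Prism.create(pmt_id=d[1],
             old_status=d[3],
             new_status=d[4],
             last_updated=float(d[5]))
database.close()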

How do I add waits to my Selenium scraping program?

I am trying to scrape a website and save the information using Python and Selenium. The scrape is simple, and only requires choosing the state and district in two dropdown menus, clicking a submit button, and reading and writing a table to a csv.
I am confident my packages are installed correctly and my program even works, but only some of the time. My guess is that without the proper Selenium driver 'waits', my program crashes because it can't find the correct css_selector. I'll post the program below, and if anyone has any suggestions on how to correctly incorporate Selenium driver 'waits', I would very much appreciate the help.
Thanks so much, and here's the program:
import time
import re
import string
import urllib.parse
import pandas
import numpy
import os
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
url = "https://myhpgas.in/myHPGas/HPGas/LocateDistributor.aspx"
driver.set_window_size(1120, 550)
driver.get(url)
time.sleep(5)
stateList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState")
options = stateList.find_elements_by_tag_name("option")
optionsList = []
for option in options:
    optionsList.append(option.get_attribute("value"))
optionsList[1:len(optionsList)]
for optionValue in optionsList:
    select = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState"))
    select.select_by_value(optionValue)
    districtList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict")
    distOptions = districtList.find_elements_by_tag_name("option")
    distOptionsList = []
    for distOption in distOptions:  # iterate over the options, place attribute value in list
        distOptionsList.append(distOption.get_attribute("value"))
    for distOptionValue in distOptionsList[1:len(distOptionsList)]:
        distSelect = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict"))
        distSelect.select_by_value(distOptionValue)
        driver.find_element_by_css_selector('#ContentPlaceHolder1_btnShowList').click()
        data = []
        for tr in driver.find_elements_by_css_selector('#ContentPlaceHolder1_gvDistributor'):
            tds = tr.find_elements_by_tag_name('td')
            if tds:
                data.append([td.text for td in tds])
        print(data)
        dataRows = int(numpy.array(data).size / 7)
        rowsTimesColumns = (dataRows * 7) - 1
        newArray = numpy.array(data)
        outArray = newArray[0:rowsTimesColumns]
        test = pandas.DataFrame(outArray.reshape(dataRows, 7), columns=['no', 'distributor', 'address', 'contact1', 'contact2', 'contact3', 'map'])
        file_path = 'Users/outpath' + '_' + optionValue + '_' + distOptionValue + '.csv'
        test.to_csv(file_path, sep=',')
        driver.back()
    driver.back()
Can you tell me which line returns the error? Also, how about using XPaths?
I couldn't see any statement implementing explicit waits, e.g.:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "your css selector")))
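As an illustration only (a sketch built from the selectors already used in the question, not a tested script), the time.sleep(5) and the repeated element lookups could be wrapped in explicit waits so each element is only used once it is present or clickable:
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 30)
state_el = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#ContentPlaceHolder1_ddlState")))
Select(state_el).select_by_value(optionValue)
district_el = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#ContentPlaceHolder1_ddlDistrict")))
Select(district_el).select_by_value(distOptionValue)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ContentPlaceHolder1_btnShowList"))).click()
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#ContentPlaceHolder1_gvDistributor")))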
