from selenium import webdriver
from bs4 import BeautifulSoup as BSoup
import pandas as pd
import time
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path = 'geckodriver path')
# get web page
driver.get(urlpage)
time.sleep(10)
bs_obj = BSoup(driver.page_source, 'html.parser')
# Scrape table content
table = bs_obj.find('table', {'class': 'f_tac table_bd draggable'})
rows = table.find_all('tr')
table_content = []
for row in rows[1:]:
    cell_row = []
    for cell in row.find_all('td'):
        cell_row.append(cell.text.replace(" ", "").replace("\n\n", " ").replace("\n", ""))
    table_content.append(cell_row)
header_content = []
for cell in rows[0].find_all('td'):
    header_content.append(cell.text)
driver.close()
race_writer = pd.ExcelWriter('export path', engine='xlsxwriter')
df = pd.DataFrame(table_content, columns=header_content)
df.to_excel(race_writer, sheet_name='game1')
Hi all, I am trying to scrape the racing results from HKJC. When I execute the code above, one of the errors below happens:
No Excel file is created
The DataFrame is not written to the Excel file (an empty Excel file is created)
Also, if I successfully scrape the result of game 1 and then amend the script to scrape game 2, it still gives me the result of game 1.
I'd appreciate it if anyone could help.
I changed your script to the one below. The approach is to click through each of the relevant Sha Tin race buttons (see range(1, len(shatin)-1)) and collect the race table data. Race tables are appended to a list called races. Finally, each race table is written to an individual sheet in Excel (note that you no longer need BeautifulSoup). Your original script never called save() or close() on the ExcelWriter, which is why no Excel file, or only an empty one, was created.
Add these to your list of imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Then:
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path = 'geckodriver path')
# get web page
driver.get(urlpage)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//table[@class='f_fs12 f_fr js_racecard']")))
shatin = driver.find_elements_by_xpath("//table[@class='f_fs12 f_fr js_racecard']/tbody/tr/td")
races=[]
for i in range(1, len(shatin)-1):
    # re-locate the race buttons on every pass to avoid stale element references
    shatin = driver.find_elements_by_xpath("//table[@class='f_fs12 f_fr js_racecard']/tbody/tr/td")
    #time.sleep(3)
    #WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@class='performance']")))
    shatin[i].click()
    table = pd.read_html(driver.find_element_by_xpath("//div[@class='performance']").get_attribute('outerHTML'))[0]
    races.append(table)
with pd.ExcelWriter('races.xlsx') as writer:
    for i, race in enumerate(races):
        race.to_excel(writer, sheet_name=f'game{i+1}', index=False)
    # no explicit writer.save() needed: the with-block saves the file on exit
driver.quit()
Recently, I wrote a Selenium web scraper that is meant to extract all the information from a table containing data on all presidential elections that have been held in the United States. The table is on this Wikipedia page.
The problem is that the code returns all the info I need when I write the result to a .txt file, but anytime I try to print that same result in my text editor, it returns only half of the data I need. I do not understand what the problem is. Can someone help me out?
Here is my code.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas
# using selenium and chromedriver to extract the javascript wikipage
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(r'web scraping master/chromedriver', options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
# waiting for the javascript to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
print(table)
with open("loge.txt", "w") as f:  # only part I added to the code
    f.write(str(table))
I'm not really sure what the problem was, but this works as expected. I've changed loge.txt to loge.html, and the code dumps the entire table.
Mind trying this?
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
with open("loge.html", "w") as f:
    f.write(str(table))
I am using the Selenium web driver to extract a table from a web page; below is my script.
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
import time, sys, os
driver = webdriver.Chrome()
driver.implicitly_wait(20)
URL = 'https://www.ccee.org.br/portal/faces/pages_publico/o-que-fazemos/como_ccee_atua/precos/precos_medios?_adf.ctrl-state=7e1fw5zdn_14&_afrLoop=19197915280379#!%40%40%3F_afrLoop%3D19197915280379%26_adf.ctrl-state%3D7e1fw5zdn_18'
driver.get(URL)
soup = BeautifulSoup(driver.page_source, "html.parser")
time.sleep(10)
table = soup.find('table')
list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll(["td"]):
        text = cell.text
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)
for item in list_of_rows:
    ' '.join(item)
Data = pd.DataFrame(list_of_rows)
Export_Path = os.path.join(PY_Space, 'Source_Data.csv')  # PY_Space and KN are helpers defined elsewhere in my project
KN.PrintTF('\n>> Raw Data Exported to path: ' + Export_Path)
Data.to_csv(Export_Path, index=False)
I am unable to get the table. Please help.
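One thing to note about the snippet above: the page source is handed to BeautifulSoup before the ten-second sleep, so the parser only ever sees the unrendered page. A minimal reordering sketch (the generic 'table' lookup is kept from the original and may still need narrowing for this site):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get(URL)  # URL as defined in the script above
# wait until a table is actually present, then grab the rendered source
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find('table')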
I am trying to scrape food menu data from Zomato. I am using Selenium to do so. While inspecting the elements, I can find the class 'category_heading', but using the same class in the code gives no result and returns an empty list. I am attaching a snippet of the code. Thanks.
I have tried using browser.find_element_by_xpath as well as find_element_by_class_name and tag, but nothing seems to work.
order_now = browser.find_element_by_xpath("//*[@id='orig-search-list']/div[1]/div[2]/a").click()
browser.maximize_window()
browser.implicitly_wait(20)
food_item = browser.find_elements_by_class_name("category_heading")
print('food',food_item)
I need the food menu data so that I can store it in a csv.
The page can be slow to load. Try using a wait condition:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
d = webdriver.Chrome()
d.get('https://www.zomato.com/bangalore/burgers-kingdom-indiranagar-bangalore/order')
rows = WebDriverWait(d, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".row")))
soup = bs(d.page_source, 'lxml')
for category in soup.select('.category-container'):
    title = category.select_one('h3').text
    print(title)
    items = [i.text for i in category.select('.category_heading')]
    if items:
        print(items)
    else:
        print('No sub-headers')
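Since the end goal is a csv, the same loop can collect (category, item) rows and write them out. A minimal sketch building on the soup object above (the file name and column headers are my own choice):

import csv

menu_rows = []
for category in soup.select('.category-container'):
    title = category.select_one('h3').text
    for item in category.select('.category_heading'):
        menu_rows.append([title, item.text])

with open('menu.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['category', 'item'])  # header row
    writer.writerows(menu_rows)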
I'm a finance student and very new to coding. I've been trying to learn Python for a week.
Goal:
Scrape data from the financial statements of listed companies in Vietnam
Source: https://www.vndirect.com.vn/portal/bang-can-doi-ke-toan/vnm.shtml?request_locale=en_GB (sample ticker: VNM)
Situation:
I'm using Python 3 with BS4 and Selenium for scraping, and Pandas for tabling. I've read about Scrapy, but I find it takes more effort to learn.
Problems:
As you can see, the website has drop-down menus for choosing years, number of terms, units, etc.
I've figured out the part about using Selenium to select options and click the View button to load the data.
I have trouble getting the data afterwards. I tried to use 'wait until', but there is no change in the website elements.
Moving to the Income and Cash Flow Statements, the data is there when inspecting but missing from the page source (using Chrome).
Thank you for any help.
import requests as rq
import bs4
import pandas as pd
# ticker = input('Ticker')
ticker = 'vnm'
url = 'https://www.vndirect.com.vn/portal/bang-can-doi-ke-toan/' + ticker + '.shtml'
res = rq.get(url)
web_text = bs4.BeautifulSoup(res.text, 'html.parser')
content = web_text.select('.bordertd')
mod_content = []
for x in content:
    mod_content.append(x.getText().strip())

def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i+n]

table = list(chunks(mod_content, 5))
bsheet = pd.DataFrame(table)
print(bsheet)
I keep the Selenium part separate:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome('path')
browser.get('https://www.vndirect.com.vn/portal/bang-can-doi-ke-toan/VNM.shtml')
dropdown = Select(browser.find_element_by_name("searchObject.fiscalQuarter"))
dropdown.select_by_value('IN_YEAR')
browser.find_element_by_class_name('iButton').click()
# wait = WebDriverWait(browser, 10)
# element = wait.until(EC.element_to_be_clickable((By.NAME,'searchObject.fiscalQuarter')))
content = browser.page_source
browser.quit()
web_text = bs4.BeautifulSoup(content, 'html.parser')  # parse the source rendered by Selenium
content = web_text.select('.bordertd')
mod_content = []
for x in content:
    mod_content.append(x.getText().strip())
mod_content
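One pattern that may help with the data not appearing after clicking View (a sketch only; the .bordertd selector is carried over from above, and using it as the wait target is an assumption) is to wait explicitly for the table cells before reading the page source:

from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import bs4

browser = webdriver.Chrome('path')
browser.get('https://www.vndirect.com.vn/portal/bang-can-doi-ke-toan/VNM.shtml')
Select(browser.find_element_by_name("searchObject.fiscalQuarter")).select_by_value('IN_YEAR')
browser.find_element_by_class_name('iButton').click()
# wait for the table cells before grabbing the source
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.bordertd')))
cells = bs4.BeautifulSoup(browser.page_source, 'html.parser').select('.bordertd')
browser.quit()
print([c.getText().strip() for c in cells])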
I am trying to scrape a website and save the information using Python and Selenium. The scrape is simple and only requires choosing the state and district in two dropdown menus, clicking a submit button, and reading and writing a table to a csv.
I am confident my packages are installed correctly, and my program even works, but only some of the time. My guess is that without the proper Selenium driver 'waits', my program crashes because it can't find the correct css_selector. I'll post the program below, and if anyone has any suggestions on how to correctly incorporate Selenium driver 'waits', I would very much appreciate the help.
Thanks so much, and here's the program:
import time
import re
import string
import urllib.parse
import pandas
import numpy
import os
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
url = "https://myhpgas.in/myHPGas/HPGas/LocateDistributor.aspx"
driver.set_window_size(1120, 550)
driver.get(url);
time.sleep(5)
stateList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState")
options = stateList.find_elements_by_tag_name("option")
optionsList = []
for option in options:
    optionsList.append(option.get_attribute("value"))
optionsList = optionsList[1:]  # skip the placeholder first option
for optionValue in optionsList:
    select = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState"))
    select.select_by_value(optionValue)
    districtList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict")
    distOptions = districtList.find_elements_by_tag_name("option")
    distOptionsList = []
    for distOption in distOptions:  # iterate over the options, place attribute value in list
        distOptionsList.append(distOption.get_attribute("value"))
    for distOptionValue in distOptionsList[1:]:
        distSelect = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict"))
        distSelect.select_by_value(distOptionValue)
        driver.find_element_by_css_selector('#ContentPlaceHolder1_btnShowList').click()
        data = []
        for tr in driver.find_elements_by_css_selector('#ContentPlaceHolder1_gvDistributor'):
            tds = tr.find_elements_by_tag_name('td')
            if tds:
                data.append([td.text for td in tds])
        print(data)
        dataRows = int(numpy.array(data).size / 7)
        rowsTimesColumns = (dataRows * 7) - 1
        newArray = numpy.array(data)
        outArray = newArray[0:rowsTimesColumns]
        test = pandas.DataFrame(outArray.reshape(dataRows, 7), columns=['no', 'distributor', 'address', 'contact1', 'contact2', 'contact3', 'map'])
        file_path = 'Users/outpath' + '_' + optionValue + '_' + distOptionValue + '.csv'
        test.to_csv(file_path, sep=',')
        driver.back()
        driver.back()
Can you tell me which line returns an error? Also, how about using XPaths?
I couldn't see a statement implementing explicit waits, e.g.:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "your css selector")))
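For instance, a sketch of how explicit waits could replace the time.sleep(5) in the program above (selectors are taken from the question; the exact wait targets are my assumptions):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
driver.get('https://myhpgas.in/myHPGas/HPGas/LocateDistributor.aspx')
wait = WebDriverWait(driver, 30)

# wait for the state dropdown instead of sleeping a fixed five seconds
state = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#ContentPlaceHolder1_ddlState')))
Select(state).select_by_index(1)  # index 1 skips the placeholder option

# wait until the district dropdown has been populated before selecting from it
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#ContentPlaceHolder1_ddlDistrict option:nth-child(2)')))
Select(driver.find_element_by_css_selector('#ContentPlaceHolder1_ddlDistrict')).select_by_index(1)

driver.find_element_by_css_selector('#ContentPlaceHolder1_btnShowList').click()
# wait for the results table before reading it
table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#ContentPlaceHolder1_gvDistributor')))
print(table.text)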