How to Scraping Dynamic Website in Python Selenium? - python-3.x

I'm a finance student, very new to coding. I've been trying to learn Python for a week.
Goal:
Scraping data for financial statements of listed companies in Vietnam
Source: https://www.vndirect.com.vn/portal/bang-can-doi-ke-toan/vnm.shtml?request_locale=en_GB (sample ticker: VNM)
Situation:
I'm using Python 3 with BS4, Selenium for scraping and Pandas for tabling. I've read about Scrapy but I find it takes more efforts to learn.
Problems:
As you see, the website has drop-down menus for choosing years, number of terms, units...
I've figured out the part of using Selenium to select options and click View button to load the data.
I have trouble getting the data afterward. I tried to use 'wait until' but there is no change in website elements.
Moving to Income and CF Statement, the data was there when inspecting but missing in page source (using Chrome).
Thank you for any help.
import requests as rq
import bs4
import pandas as pd
# ticker = input('Ticker')
ticker = 'vnm'
url = 'https://www.vndirect.com.vn/portal/bang-can-doi-ke-toan/' + ticker + '.shtml'
res = rq.get(url)
web_text = bs4.BeautifulSoup(res.text)
content = web_text.select('.bordertd')
mod_content =[]
for x in content:
mod_content.append(x.getText().strip())
def chunks(l, n):
for i in range(0, len(l), n):
yield l[i:i+n]
table = list(chunks(mod_content, 5))
bsheet=pd.DataFrame(table)
print(bsheet)
I keep the part with Selenium separated:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome('path')
browser.get('https://www.vndirect.com.vn/portal/bang-can-doi-ke-toan/VNM.shtml')
dropdown = Select(browser.find_element_by_name("searchObject.fiscalQuarter"))
dropdown.select_by_value('IN_YEAR')
browser.find_element_by_class_name('iButton').click()
# wait = WebDriverWait(browser, 10)
# element = wait.until(EC.element_to_be_clickable((By.NAME,'searchObject.fiscalQuarter')))
content = browser.page_source
browser.quit()
content = web_text.select('.bordertd')
mod_content =[]
for x in content:
mod_content.append(x.getText().strip())
mod_content

Related

Python unable to refresh the execution of script

from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup as BSoup
import requests
import pandas as pd
from requests_html import HTMLSession
import time
import xlsxwriter
import re
import os
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path = 'geckodriver path')
# get web page
driver.get(urlpage)
time.sleep(10)
bs_obj = BSoup(driver.page_source, 'html.parser')
# Scrape table content
table = bs_obj.find('table', {"f_tac table_bd draggable"})
rows = table.find_all('tr')
table_content = []
for row in rows[1:]:
cell_row = []
for cell in row.find_all('td'):
cell_row.append(cell.text.replace(" ", "").replace("\n\n", " ").replace("\n", ""))
table_content.append(cell_row)
header_content = []
for cell in rows[0].find_all('td'):
header_content.append(cell.text)
driver.close()
race_writer = pd.ExcelWriter('export path', engine='xlsxwriter')
df = pd.DataFrame(table_content, columns=header_content)
df.to_excel(race_writer, sheet_name='game1')
Hi All, I am trying to scrape the racing result from HKJC. When I was executing the code above, either one of the errors below happened:
No excel file is created
Df is not written to the excel file < an empty excel file is created
Say if I successfully scrape the result of game 1, I then amend the script to continue to scrape that of game 2, but it still gives me the result of game 1.
Appreciate if anyone could help.
I changed your script to the one below. The approach followed is to click through each of the relevant "Sha Tin" buttons (see range(1, len(shatin)-1)) and collect the race table data. Race tables are added to a list called "races". Finally, write each of the race tables to individual sheets in Excel (note you no longer need BeautifulSoup).
Add these to your list of imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Then:
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path = 'geckodriver path')
# get web page
driver.get(urlpage)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"//table[#class='f_fs12 f_fr js_racecard']")))
shatin=driver.find_elements_by_xpath("//table[#class='f_fs12 f_fr js_racecard']/tbody/tr/td")
races=[]
for i in range(1, len(shatin)-1):
shatin = driver.find_elements_by_xpath("//table[#class='f_fs12 f_fr js_racecard']/tbody/tr/td")
#time.sleep(3)
#WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[#class='performance']")))
shatin[i].click()
table = pd.read_html(driver.find_element_by_xpath("//div[#class='performance']").get_attribute('outerHTML'))[0]
races.append(table)
with pd.ExcelWriter('races.xlsx') as writer:
for i,race in enumerate(races):
race.to_excel(writer, sheet_name=f'game{i+1}', index=False)
writer.save()
driver.quit()
Output:

How to find text of a specific cell from a html table (on a url) using python?

from selenium import webdriver
driver = webdriver.Chrome(executable_path="D:\chromedriver.exe")
#url = 'https://www.dcrustedp.in/show_chart.php'
driver.get('https://www.dcrustedp.in/show_chart.php')
rows = 2
cols = 5
for r in range(5,rows+1):
for c in range(6,cols+1):
value = driver.find_element_by_xpath("/html/body/center/table/tbody/tr["+str(r)+"]/td["+str(c)+"]").text
print(value)
`
This is my code. I want to extract result date of B.Tech - Computer Science and Engineering 5th Semester. It is in the first row of table. The date is 24-02-2020. I want to print the date from that particular cell only.
The below code works-:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
webpage = 'https://www.dcrustedp.in/show_chart.php'
driver = webdriver.Chrome(executable_path='Your/path/to/chromedriver.exe')
driver.get(webpage)
time.sleep(15)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
pagehits=driver.find_element_by_xpath("/html/body/center/table/tbody/tr[3]/td[5]")
print(pagehits.text)
driver.quit()
Without Selenium, we can use requests library to fetch the table and then respective element
import requests
import pandas as pd
url = 'https://www.dcrustedp.in/show_chart.php'
html = requests.get(url, verify=False).content
df_list = pd.read_html(html)
df = df_list[-1]
print(df.iat[0,4])
To extract the result date of 5th Semester for any of the Prg. Title, you have to induce WebDriverWait for the visibility_of_element_located() and you can use the following Locator Strategy:
xpath:
driver.get('https://www.dcrustedp.in/show_chart.php')
prg_title = "B.Tech - Computer Science and Engineering"
# prg_title = "B.Tech - Electrical Engineering"
print(WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//td[contains(., '"+prg_title+"')]//following-sibling::td[3]"))).get_attribute("innerHTML"))
Console Output:
24-02-2020

How to open and access multiple (nearly 50) tabs in Chrome using ChromeDriver and Selenium through Python

I'm trying to gather some information from certain webpages using selenium and python.I have a working code for a single tab. But now i have a situation where i need to open 50 tabs in chrome at once and process each page data.
1) So open 50 tabs at once - The code i got already
2) Change the control between tabs and process the information from the page and close the tab and move to next tab and do the same.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import psycopg2
import os
import datetime
final_results=[]
positions=[]
saerched_url=[]
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
#options.add_argument('--headless')
options.add_argument("—-incognito")
browser = webdriver.Chrome(executable_path='/users/user_123/downloads/chrome_driver/chromedriver', chrome_options=options)
browser.implicitly_wait(20)
#def db_connect():
try:
DSN = "dbname='postgres' user='postgres' host='localhost' password='postgres' port='5432'"
TABLE_NAME = 'staging.search_url'
conn = psycopg2.connect(DSN)
print("Database connected...")
cur = conn.cursor()
cur.execute("SET datestyle='German'")
except (Exception, psycopg2.Error) as error:
print('database connection failed')
quit()
def get_products(url):
browser.get(url)
names = browser.find_elements_by_xpath("//span[#class='pymv4e']")
upd_product_name_list=list(filter(None, names))
product_name = [x.text for x in upd_product_name_list]
product = [x for x in product_name if len(x.strip()) > 2]
upd_product_name_list.clear()
product_name.clear()
return product
links = ['https://www.google.com/search?q=Vitamin+D',
'https://www.google.com/search?q=Vitamin+D3',
'https://www.google.com/search?q=Vitamin+D+K2',
'https://www.google.com/search?q=D3',
'https://www.google.com/search?q=Vitamin+D+1000']
for link in links:
# optional: we can wait for the new tab to open by comparing window handles count before & after
tabs_count_before = len(browser.window_handles)
# open a link
control_string = "window.open('{0}')".format(link)
browser.execute_script(control_string)
# optional: wait for windows count to increment to ensure new tab is opened
WebDriverWait(browser, 1).until(lambda browser: tabs_count_before != len(browser.window_handles))
# get list of currently opened tabs
tabs_list = browser.window_handles
print(tabs_list)
# switch control to newly opened tab (the last one in the list)
last_tab_opened = tabs_list[len(tabs_list)-1]
browser.switch_to_window(last_tab_opened)
# now you can process data on the newly opened tab
print(browser.title)
for lists in tabs_list:
last_tab_opened = tabs_list[len(tabs_list)-1]
browser.switch_to_window(last_tab_opened)
filtered=[]
filtered.clear()
filtered = get_products(link)
saerched_url.clear()
if not filtered:
new_url=link+'+kaufen'
get_products(link)
print('Modified URL :'+link)
if filtered:
print(filtered)
positions.clear()
for x in range(1, len(filtered)+1):
positions.append(str(x))
saerched_url.append(link)
gobal_position=0
gobal_position=len(positions)
print('global postion first: '+str(gobal_position))
print("\n")
company_name_list = browser.find_elements_by_xpath("//div[#class='LbUacb']")
company = []
company.clear()
company = [x.text for x in company_name_list]
print('Company Name:')
print(company, '\n')
price_list = browser.find_elements_by_xpath("//div[#class='e10twf T4OwTb']")
price = []
price.clear()
price = [x.text for x in price_list]
print('Price:')
print(price)
print("\n")
urls=[]
urls.clear()
find_href = browser.find_elements_by_xpath("//a[#class='plantl pla-unit-single-clickable-target clickable-card']")
for my_href in find_href:
url_list=my_href.get_attribute("href")
urls.append(url_list)
print('Final Result: ')
result = zip(positions,filtered, urls, company,price,saerched_url)
final_results.clear()
final_results.append(tuple(result))
print(final_results)
print("\n")
print('global postion end :'+str(gobal_position))
i=0
try:
for d in final_results:
while i <= gobal_position:
print( d[i])
cur.execute("""INSERT into staging.pla_crawler_results(position, product_name, url,company,price,searched_url) VALUES (%s, %s, %s,%s, %s,%s)""", d[i])
print('Inserted succesfully')
conn.commit()
i=i+1
except (Exception, psycopg2.Error) as error:
print (error)
pass
browser.close()
Ideally you shouldn't attempt to open 50 tabs at once as:
Handling 50 concurrent TABs through Selenium will invite complicated logic/algorithm to maintain.
Additionally, you may run into CPU and memory usage issues as:
Chrome maintains many processes.
Where as at times Firefox uses too much RAM
Solution
If you are having a List of the urls as follows:
['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
You can iterate over the list to open them one by one in the adjacent tab for scraping using the following Locator Strategy:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.keys import Keys
links = ['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
for link in links:
driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get(link)
print(driver.title)
print("Perform webscraping here")
driver.quit()
print("End of program")
Console Output:
Downloads
Perform webscraping here
The Selenium Browser Automation Project :: Documentation for Selenium
Perform webscraping here
End of program
Reference
You can find a relevant detailed discussion in:
WebScraping JavaScript-Rendered Content using Selenium in Python

How do I scrape food menu from zomato page?

I am trying to scrape food menu data from zomato. I am using selenium to do the same while inspecting the elements, I can find the class 'category_heading', but using the same in the code gives no result and shows empty list. I am attaching the snippet of the code. Thanks.
I have tried using browser.find_element_by_xpath as well find_element_by_class_name and tag, but nothing seems to work.
order_now = browser.find_element_by_xpath("//*[#id='orig-search-list']/div[1]/div[2]/a").click()
browser.maximize_window()
browser.implicitly_wait(20)
food_item = browser.find_elements_by_class_name("category_heading")
print('food',food_item)
I need the food menu data so that I can store it in a csv.
Page can be slow to load. Try using a wait condition
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
d = webdriver.Chrome()
d.get('https://www.zomato.com/bangalore/burgers-kingdom-indiranagar-bangalore/order')
rows = WebDriverWait(d, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".row")))
soup = bs(d.page_source, 'lxml')
for category in soup.select('.category-container'):
title = category.select_one('h3').text
print(title)
items = [i.text for i in category.select('.category_heading')]
if items:
print(items)
else:
print('No sub-headers')

How do I add waits to my Selenium scraping program?

I am trying to scrape a website and save the information using Python and Selenium. The scrape is simple, and only requires choosing the state and district in two dropdown menus, clicking a submit button, and reading and writing a table to a csv.
I am confident my packages are installed correctly and my program even works, but only some of the time. My guess is that without the proper Selenium driver 'waits', my program crashes because it can't find the correct css_selector. I'll post the program below, and if anyone has any suggestions on how to correctly incorporate Selenium driver 'waits', I would very much appreciate the help.
Thanks so much, and here's the program:
import time
import re
import string
import urllib.parse
import pandas
import numpy
import os
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
url = "https://myhpgas.in/myHPGas/HPGas/LocateDistributor.aspx"
driver.set_window_size(1120, 550)
driver.get(url);
time.sleep(5)
stateList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState")
options = stateList.find_elements_by_tag_name("option")
optionsList = []
for option in options:
optionsList.append(option.get_attribute("value"))
optionsList[1:len(optionsList)]
for optionValue in optionsList:
select = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState"))
select.select_by_value(optionValue)
districtList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict")
distOptions = districtList.find_elements_by_tag_name("option")
distOptionsList = []
for distOption in distOptions: #iterate over the options, place attribute value in list
distOptionsList.append(distOption.get_attribute("value"))
for distOptionValue in distOptionsList[1:len(distOptionsList)]:
distSelect = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict"))
distSelect.select_by_value(distOptionValue)
driver.find_element_by_css_selector('#ContentPlaceHolder1_btnShowList').click()
data = []
for tr in driver.find_elements_by_css_selector('#ContentPlaceHolder1_gvDistributor'):
tds = tr.find_elements_by_tag_name('td')
if tds:
data.append([td.text for td in tds])
print(data)
dataRows = int(numpy.array(data).size / 7)
rowsTimesColumns = (dataRows * 7) -1
newArray = numpy.array(data)
outArray = newArray[0:rowsTimesColumns]
test = pandas.DataFrame(outArray.reshape(dataRows,7), columns=['no', 'distributor', 'address','contact1', 'contact2', 'contact3', 'map'])
file_path = 'Users/outpath' + '_' + optionValue + '_' + distOptionValue + '.csv'
test.to_csv(file_path, sep=',')
driver.back()
driver.back()
Can you tell me which line returns an error!? Also how about using xpaths!?
I couldn’t see the statement to implement explicit waits
WebDriverWait(driver, 30).until(EC.presence_of_element_located_by(By.CSS_SELECTOR,*your css selector*))

Resources