Webscraping with Selenium and Pandas doesn't show dataframe values in csv - python-3.x

The code below is a trimmed-down sample of the original, but it illustrates the problem just as well. In my project I use Selenium to collect the data, then click the statistics button and let pandas read the page source. After a few lines of manipulation I have the desired dataframe. The problem is in the export step: although the data taken from the selectors is written to the csv perfectly, the dataframe values are exported as zeros.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from time import sleep
from datetime import datetime, timezone
import pandas as pd
import re
errors = []
season = []
my_url = f'https://www.mackolik.com/mac/bayern-m%C3%BCnchen-vs-augsburg/2mck8cqve7nadhtfff65a1mhg'
option = Options()
option.headless = False
driver = webdriver.Firefox(options=option)
driver.get(my_url)
driver.maximize_window()
sleep(5)
#scraping
try:
    date_elm = driver.find_element(By.XPATH,
        "//p[@class='p0c-soccer-match-details-header__info']//span[@class='p0c-soccer-match-details-header__info-date']").get_attribute('data-utc')
    ts = int(date_elm)
    ts /= 1000
    date = datetime.fromtimestamp(ts).strftime('%d/%m/%Y %H:%M')
    info_bar = driver.find_element(By.CSS_SELECTOR,
        "p[class='p0c-soccer-match-details-header__info']").text
    info = info_bar.split('|')
    day = info[2]
    matchday = re.findall(r"\d+", day)[0]
    crowd = info[3]
    attedance = crowd[crowd.find('(')+1:crowd.find(')')]
    home_team = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__team-name.p0c-soccer-match-details-header__team-name--home").text
    away_team = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__team-name.p0c-soccer-match-details-header__team-name--away").text
    home_score = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__score-home").text
    away_score = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__score-away").text
    ht_scoreA = driver.find_element(By.XPATH,
        "(//div[@class='p0c-soccer-match-details-header__detailed-score'])[1]").text
    ht_scoreB = re.split(r'[(-) ]', ht_scoreA)
    home_htscore = ht_scoreB[2]
    away_htscore = ht_scoreB[4]
    referee = driver.find_element(By.CSS_SELECTOR,
        "li[class='p0c-match-officials__official-list-item p0c-match-officials__official-list-item--main '] span[class='p0c-match-officials__official-name']").text
    elem = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((
        By.LINK_TEXT, "İstatistik")))
    driver.execute_script("arguments[0].click();", elem)
    sleep(10)
    dfs = pd.read_html(driver.page_source)
    gentab = dfs[0]
    gentab = gentab.replace('%', '', regex=True)
    gentab.drop(gentab.columns[1], axis=1, inplace=True)
    general = gentab.iloc[[0, 10]]
    general.columns = ['team1', 'team2']
    pastab = dfs[1]
    pastab = pastab.replace('%', '', regex=True)
    pastab.drop(pastab.columns[1], axis=1, inplace=True)
    passes = pastab.iloc[[6, 8]]
    passes.columns = ['team1', 'team2']
    att_tab = dfs[2]
    att_tab = att_tab.replace('%', '', regex=True)
    att_tab.drop(att_tab.columns[1], axis=1, inplace=True)
    attack = att_tab.iloc[[10, 8, 4]]
    attack.columns = ['team1', 'team2']
    foul_tab = dfs[4]
    foul_tab = foul_tab.replace('%', '', regex=True)
    foul_tab.drop(foul_tab.columns[1], axis=1, inplace=True)
    fouls = foul_tab.iloc[[0, 2, 4]]
    fouls.columns = ['team1', 'team2']
    stats = pd.concat([general, passes, attack, fouls], ignore_index=True)
    stats.reset_index(drop=True, inplace=True)
    stats = stats.assign(sts=['Possesion', 'Corners', 'Attack_Passes', 'Centres',
                              'Short_Shots', 'Long_Shots', 'Target_Shots', 'Fouls',
                              'Yellows', 'Reds'])
    stats.columns = [home_team, away_team, 'sts']
    stats = stats.reindex(columns=[home_team, 'sts', away_team])
    driver.quit()
except:
    driver.quit()
# Handling the stats
home_stats = {}
away_stats = {}
home_series = stats[home_team]
away_series = stats[away_team]
stats_series = stats['sts']
for row in zip(home_series, stats_series, away_series):
    stat = row[1].replace(' ', '_').lower()
    home_stats[stat] = row[0]
    away_stats[stat] = row[2]
stats_check = ['Yellows', 'Reds', 'Short_Shots', 'Long_Shots', 'Target_Shots',
               'Corners', 'Possesion', 'Centres', 'Attack_Passes', 'Fouls']
for stat in stats_check:
    if stat not in home_stats.keys():
        home_stats[stat] = 0
        away_stats[stat] = 0
# Storing the data
match = [date, matchday, home_team, away_team, home_score, away_score, home_htscore, away_htscore,
referee, attedance, home_stats['Yellows'], away_stats['Yellows'],home_stats['Reds'], away_stats['Reds'],
home_stats['Short_Shots'], away_stats['Short_Shots'], home_stats['Long_Shots'], away_stats['Long_Shots'],
home_stats['Target_Shots'], away_stats['Target_Shots'], home_stats['Corners'], away_stats['Corners'],
home_stats['Possesion'], away_stats['Possesion'], home_stats['Centres'], away_stats['Centres'],
home_stats['Attack_Passes'], away_stats['Attack_Passes'], home_stats['Fouls'], away_stats['Fouls']]
season.append(match)
# Exporting the data
columns = ['date', 'matchday', 'home_team', 'away_team', 'home_score', 'away_score',
'home_htscore', 'away_htscore', 'referee', 'attedance']
for stat in stats_check:
    columns.append(f'home_{stat}')
    columns.append(f'away_{stat}')
dataset = pd.DataFrame(season, columns=columns)
dataset.to_csv('Bundesliga_test.csv', index=False)
print('.csv file exported.')
print(f'Number of errors: {len(errors)}')
print('Errors:\n')
print(errors)

As chitown88 suggested, I tried debugging and finally spotted the problem: I had to replace the values with all-lowercase names in the lists below. Fixed and running smoothly.
match = [date, matchday, home_team, away_team, home_score, away_score, home_htscore, away_htscore, referee, attedance,
home_stats['possesion'], away_stats['possesion'], home_stats['corners'], away_stats['corners'],
home_stats['attack_passes'], away_stats['attack_passes'], home_stats['centres'], away_stats['centres'],
home_stats['short_shots'], away_stats['short_shots'], home_stats['long_shots'], away_stats['long_shots'],
home_stats['target_shots'], away_stats['target_shots'], home_stats['fouls'], away_stats['fouls'],
home_stats['yellows'], away_stats['yellows'], home_stats['reds'], away_stats['reds']]
and
stats_check = ['possesion', 'corners', 'attack_passes', 'centres',
'short_shots', 'long_shots', 'target_shots', 'fouls', 'yellows', 'reds']
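For reference, a minimal sketch of the mismatch that caused the zeros (variable names mirror the question, the values are made up): the first loop stores lowercase keys, while the original stats_check compared capitalised names, so every capitalised name was treated as missing, filled with 0, and those zero entries are what ended up in the csv.
home_stats = {'possesion': '55'}   # keys were lowercased inside the zip loop
stats_check = ['Possesion']        # the capitalised name never matches a key
for stat in stats_check:
    if stat not in home_stats:     # always True here
        home_stats[stat] = 0       # so a zero entry is added
print(home_stats['Possesion'])     # 0 -> the value that was written to the csv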

Related

How to remove trailing .0 from a series column

I am trying to remove the trailing .0 from the rows of the CAS/ID NO column using this code:
import requests
import pandas as pd
url = 'https://www.accessdata.fda.gov/scripts/sda/sdNavigation.cfm?sd=edisrev&displayAll=true'
html = requests.get(url).content
df_list2 = pd.read_html(html)
df2 = df_list2[0]
df3=df2.dropna(subset = ['CAS/ID NO'])
df3['CAS'] = df3['CAS/ID NO'].to_string()
df3['CAS'] = df3['CAS/ID NO'].astype(str).replace('\.0', '', regex=False)
df3
It is steadfastly resisting all of my efforts.
You could try converting it to an integer type with .astype('int64'):
df2['CAS'] = df2['CAS/ID NO'].astype('int64')
Example
import requests
import pandas as pd
url = 'https://www.accessdata.fda.gov/scripts/sda/sdNavigation.cfm?sd=edisrev&displayAll=true'
html = requests.get(url).content
df_list2 = pd.read_html(html)
df2 = df_list2[0]
df2 = df2.dropna(subset = ['CAS/ID NO']).copy()
df2['CAS'] = df2['CAS/ID NO'].astype('int64')
df2
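If you would rather keep the column as strings instead of casting to int64 (for example, if some entries turn out not to be purely numeric), the original approach should also work once the replacement is done through the string accessor with regex=True; a small sketch, assuming df2 from the snippet above:
df2['CAS'] = df2['CAS/ID NO'].astype(str).str.replace(r'\.0$', '', regex=True)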

Pass url column's values one by one to web crawler code in Python

Based on the answered code from this link, I'm able to create a new column: df['url'] = 'https://www.cspea.com.cn/list/c01/' + df['projectCode'].
As the next step, I would like to pass the url column's values one by one to the following code and append all the scraped contents as one dataframe.
import urllib3
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186" # url column's values should be passed here one by one
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
index, data = [], []
for th in soup.select(".project-detail-left th"):
    h = th.get_text(strip=True)
    t = th.find_next("td").get_text(strip=True)
    index.append(h)
    data.append(t)
df = pd.DataFrame(data, index=index, columns=["value"])
print(df)
How could I do that in Python? Thanks.
Updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
data = []
urls = df.url.tolist()
for url_link in urls:
    url = url_link
    # url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
    soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
    index, data = [], []
    for th in soup.select(".project-detail-left th"):
        h = th.get_text(strip=True)
        t = th.find_next("td").get_text(strip=True)
        index.append(h)
        data.append(t)
    df = pd.DataFrame(data, index=index, columns=["value"])
    df = df.T
    df.reset_index(drop=True, inplace=True)
    print(df)
df.to_excel('result.xlsx', index=False)
But it only saved one row to the Excel file.
You need to combine the dfs generated in the loop. You could add them to a list and then call pd.concat on that list.
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
# data = []
urls = df.url.tolist()
dfs = []
for url_link in urls:
    url = url_link
    # url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
    soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
    index, data = [], []
    for th in soup.select(".project-detail-left th"):
        h = th.get_text(strip=True)
        t = th.find_next("td").get_text(strip=True)
        index.append(h)
        data.append(t)
    df = pd.DataFrame(data, index=index, columns=["value"])
    df = df.T
    df.reset_index(drop=True, inplace=True)
    print(df)
    dfs.append(df)
df = pd.concat(dfs)
df.to_excel('result.xlsx', index=False)
Use
urls = df.url.tolist()
to create a list of URLs, then iterate through them, using an f-string to insert each one into your base URL.
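A minimal sketch of that suggestion, reusing the base URL and the projectCode column from the question (the loop body is only indicated, not a full implementation):
base_url = 'https://www.cspea.com.cn/list/c01/'
for code in df['projectCode'].tolist():
    url = f'{base_url}{code}'
    # ... request and parse `url` as in the snippet above, collecting each resulting dataframe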

BeautifulSoup, Requests, Dataframe, extracting from <SPAN> and Saving to Excel

Python novice here again! 2 questions:
1) Instead of saving to multiple tabs (currently each year is saved to a tab named after the year), how can I save all this data into one sheet in Excel called "summary"?
2) ('div',class_="sidearm-schedule-game-result") returns the format "W, 1-0". How can I split "W, 1-0" into two columns, one containing "W" and the next containing "1-0"?
Thanks so much
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
import openpyxl
import csv
year_id = ['2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']
lehigh_url = 'https://lehighsports.com/sports/mens-soccer/schedule/'
results = []
with requests.Session() as req:
    for year in range(2003, 2020):
        print(f"Extracting Year# {year}")
        url = req.get(f"{lehigh_url}{year}")
        if url.status_code == 200:
            soup = BeautifulSoup(url.text, 'lxml')
            rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
            sheet = pd.DataFrame()
            for row in rows:
                date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
                name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
                opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
                conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
                try:
                    result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
                except:
                    result = ''
                df = pd.DataFrame([[year, date, name, opp, conf, result]], columns=['year', 'date', 'opponent', 'list', 'conference', 'result'])
                sheet = sheet.append(df, sort=True).reset_index(drop=True)
            results.append(sheet)
def save_xls(list_dfs, xls_path):
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer, '%s' % year_id[n], index=False)
        writer.save()
save_xls(results, 'lehigh.xlsx')
Instead of creating a list of dataframes, you can append each sheet into 1 dataframe and write that to file with pandas. Then to split into 2 columns, just use .str.split() and split on the comma.
import requests
import pandas as pd
from bs4 import BeautifulSoup
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
results = pd.DataFrame()
for year in year_id:
    url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
    print(url)
    lehigh = requests.get(url).text
    soup = BeautifulSoup(lehigh, 'lxml')
    rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
    sheet = pd.DataFrame()
    for row in rows:
        date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
        name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
        opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
        conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
        try:
            result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
        except:
            result = ''
        df = pd.DataFrame([[year, date, name, opp, conf, result]], columns=['year', 'date', 'opponent', 'list', 'conference', 'result'])
        sheet = sheet.append(df, sort=True).reset_index(drop=True)
    results = results.append(sheet, sort=True).reset_index(drop=True)
results['result'], results['score'] = results['result'].str.split(',', 1).str
results.to_excel('lehigh.xlsx')
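Note that this answer targets older pandas. In pandas 2.x DataFrame.append has been removed and the .str unpacking on the second-to-last line is no longer supported, so a rough equivalent of those final steps under current pandas (a sketch, keeping the same column names) would be:
frames = []
# inside the loops, collect each per-game frame here instead of calling DataFrame.append:
#     frames.append(df)
results = pd.concat(frames, ignore_index=True)
# split "W, 1-0" into the result letter and the score
results[['result', 'score']] = results['result'].str.split(',', n=1, expand=True)
results.to_excel('lehigh.xlsx', index=False)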

Having an error in the concatenation of the data for multiple pages in Python

I am facing an error while concatenating the data from multiple pages and exporting it to a single CSV file. With my code the data is exported up to page 10, but after page 10 it is not working.
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import os
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('7')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1464')
tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
#print(table)
table.columns = table.iloc[0]
table = table.iloc[1:]
#print(type(table))
table = table[table.Select == 'SurveyNo']
#print(table) #assumption SurveyNo exists for all wanted rows
surveyNo_scripts = [item.get_attribute('href') for item in
                    d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
#print(surveyNo_scripts)
i = 0
for script in surveyNo_scripts:
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[[i], table.columns.get_loc('Select')] = surveys
    i += 1
print(table)
j = 2
while True:
    if len(d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))) > 0:
        #print(d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(i))[0].get_attribute('href'))
        d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
        tableElement = d.find_element_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
        table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
        table1.columns = table1.iloc[0]
        table1 = table1.iloc[1:]
        #print(type(table))
        table1 = table1[table1.Select == 'SurveyNo']
        #print(table) #assumption SurveyNo exists for all wanted rows
        surveyNo_scripts = [item.get_attribute('href') for item in
                            d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
        #print(surveyNo_scripts)
        i = 0
        for script in surveyNo_scripts:
            d.execute_script(script)
            surveys = d.find_element_by_css_selector('textarea').text
            table1.iloc[[i], table1.columns.get_loc('Select')] = surveys
            i += 1
        #print(table1)
        #table = table.append(table1.reindex(columns=table.columns))
        table1.columns = table.columns
        table = pd.concat([table, table1], ignore_index=True)
        print(table)
        j += 1
    else:
        break
table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',', encoding='utf-8-sig', index=False)

Import and parse .data file

There is a file I tried to import and save as a pandas df. At first sight it looks like the columns and rows are already ordered, but in the end I had to do a bunch of work to create the pandas df. Could you please check whether there is a much faster way to manage it?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
My way of doing it is:
import requests
import pandas as pd
r = requests.get(url)
file = r.text
step_1 = file.split('\n')
for n in range(len(step_1)):  # remove empty strings
    if bool(step_1[n]) == False:
        del(step_1[n])
step_2 = [i.split('\t') for i in step_1]
cars_names = [i[1] for i in step_2]
step_3 = [i[0].split(' ') for i in step_2]
for e in range(len(step_3)):  # remove empty strings in each sublist
    step_3[e] = [item for item in step_3[e] if item != '']
mpg = [i[0] for i in step_3]
cylinders = [i[1] for i in step_3]
disp = [i[2] for i in step_3]
horsepower = [i[3] for i in step_3]
weight = [i[4] for i in step_3]
acce = [i[5] for i in step_3]
year = [i[6] for i in step_3]
origin = [i[7] for i in step_3]
list_cols = [cars_names, mpg, cylinders, disp, horsepower, weight, acce, year, origin]
# list_labels written manually:
list_labels = ['car name', 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']
zipped = list(zip(list_labels, list_cols))
data = dict(zipped)
df = pd.DataFrame(data)
Once you have replaced \t with a blank space, you can use read_csv to read it. But you need to wrap your text, because the first parameter of read_csv is filepath_or_buffer, which needs an object with a read() method (such as a file handle or StringIO). Your question then reduces to making read_csv read the columns correctly from this file.
import requests
import pandas as pd
from io import StringIO
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
r = requests.get(url)
file = r.text.replace("\t"," ")
# list_labels written manually:
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin','car name']
df = pd.read_csv(StringIO(file),sep="\s+",header = None,names=list_labels)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
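As a further shortcut, read_csv can also fetch the URL directly, so the requests/StringIO step can likely be dropped; a sketch, assuming the whitespace-separated parsing handles the stray tabs and the quoted car names the same way as above:
import pandas as pd
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
               'acceleration', 'model year', 'origin', 'car name']
# sep=r"\s+" treats tabs as ordinary whitespace; the quoted car names stay in one column
df = pd.read_csv(url, sep=r"\s+", header=None, names=list_labels)
print(df.head())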
