Error while concatenating data from multiple pages in Python - python-3.x

I am getting an error while concatenating the data from multiple pages and exporting it to a single CSV file. With my code the data exports correctly up to page 10, but after page 10 it stops working.
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)

Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('7')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1464')

tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
#print(table)
table.columns = table.iloc[0]
table = table.iloc[1:]
#print(type(table))
table = table[table.Select == 'SurveyNo']
#print(table)  # assumption: SurveyNo exists for all wanted rows

surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector(
    "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
#print(surveyNo_scripts)
i = 0
for script in surveyNo_scripts:
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[[i], table.columns.get_loc('Select')] = surveys
    i += 1
print(table)

j = 2
while True:
    if len(d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))) > 0:
        d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
        tableElement = d.find_element_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
        table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
        table1.columns = table1.iloc[0]
        table1 = table1.iloc[1:]
        table1 = table1[table1.Select == 'SurveyNo']
        surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
        i = 0
        for script in surveyNo_scripts:
            d.execute_script(script)
            surveys = d.find_element_by_css_selector('textarea').text
            table1.iloc[[i], table1.columns.get_loc('Select')] = surveys
            i += 1
        #print(table1)
        table1.columns = table.columns
        table = pd.concat([table, table1], ignore_index=True)
        print(table)
        j += 1
    else:
        break

table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',', encoding='utf-8-sig', index=False)
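One detail worth noting in the loop above (a hedged suggestion, not a confirmed cause of the failure after page 10): the code clicks a pager link and immediately re-reads the grid, so the read can race the ASP.NET postback. A minimal sketch of an explicit wait, assuming the same element names as in the code above:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

old_table = tableElement   # grid element from the page we are leaving
d.find_elements_by_css_selector(
    "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
# wait until the postback has replaced the old grid before parsing the new one
WebDriverWait(d, 15).until(EC.staleness_of(old_table))
tableElement = d.find_element_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]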

Related

Web scraping with Selenium and Pandas: dataframe values don't show in the CSV

The code below is a trimmed-down sample of the original, but it reproduces the problem. In my project I use Selenium to collect data, click the statistics button, and let pandas read the page source. After a few lines of manipulation I have the desired dataframe. The problem is in the export step: although the values taken from the selectors are written to the CSV correctly, the dataframe values show up as zeros.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from time import sleep
from datetime import datetime, timezone
import pandas as pd
import re
errors = []
season = []
my_url = f'https://www.mackolik.com/mac/bayern-m%C3%BCnchen-vs-augsburg/2mck8cqve7nadhtfff65a1mhg'
option = Options()
option.headless = False
driver = webdriver.Firefox(options=option)
driver.get(my_url)
driver.maximize_window()
sleep(5)
#scraping
try:
    date_elm = driver.find_element(By.XPATH,
        "//p[@class='p0c-soccer-match-details-header__info']//span[@class='p0c-soccer-match-details-header__info-date']").get_attribute('data-utc')
    ts = int(date_elm)
    ts /= 1000
    date = datetime.fromtimestamp(ts).strftime('%d/%m/%Y %H:%M')
    info_bar = driver.find_element(By.CSS_SELECTOR,
        "p[class='p0c-soccer-match-details-header__info']").text
    info = info_bar.split('|')
    day = info[2]
    matchday = re.findall(r"\d+", day)[0]
    crowd = info[3]
    attedance = crowd[crowd.find('(')+1:crowd.find(')')]
    home_team = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__team-name.p0c-soccer-match-details-header__team-name--home").text
    away_team = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__team-name.p0c-soccer-match-details-header__team-name--away").text
    home_score = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__score-home").text
    away_score = driver.find_element(By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__score-away").text
    ht_scoreA = driver.find_element(By.XPATH,
        "(//div[@class='p0c-soccer-match-details-header__detailed-score'])[1]").text
    ht_scoreB = split_string = re.split(r'[(-) ]', ht_scoreA)
    home_htscore = ht_scoreB[2]
    away_htscore = ht_scoreB[4]
    referee = driver.find_element(By.CSS_SELECTOR,
        "li[class='p0c-match-officials__official-list-item p0c-match-officials__official-list-item--main '] span[class='p0c-match-officials__official-name']").text
    elem = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((
        By.LINK_TEXT, "İstatistik")))
    driver.execute_script("arguments[0].click();", elem)
    sleep(10)
    dfs = pd.read_html(driver.page_source)
    gentab = dfs[0]
    gentab = gentab.replace('%', '', regex=True)
    gentab.drop(gentab.columns[1], axis=1, inplace=True)
    general = gentab.iloc[[0, 10]]
    general.columns = ['team1', 'team2']
    pastab = dfs[1]
    pastab = pastab.replace('%', '', regex=True)
    pastab.drop(pastab.columns[1], axis=1, inplace=True)
    passes = pastab.iloc[[6, 8]]
    passes.columns = ['team1', 'team2']
    att_tab = dfs[2]
    att_tab = att_tab.replace('%', '', regex=True)
    att_tab.drop(att_tab.columns[1], axis=1, inplace=True)
    attack = att_tab.iloc[[10, 8, 4]]
    attack.columns = ['team1', 'team2']
    foul_tab = dfs[4]
    foul_tab = foul_tab.replace('%', '', regex=True)
    foul_tab.drop(foul_tab.columns[1], axis=1, inplace=True)
    fouls = foul_tab.iloc[[0, 2, 4]]
    fouls.columns = ['team1', 'team2']
    stats = pd.concat([general, passes, attack, fouls], ignore_index=True)
    stats.reset_index(drop=True, inplace=True)
    stats = stats.assign(sts=['Possesion', 'Corners', 'Attack_Passes', 'Centres',
                              'Short_Shots', 'Long_Shots', 'Target_Shots', 'Fouls', 'Yellows',
                              'Reds'])
    stats.columns = [home_team, away_team, 'sts']
    stats = stats.reindex(columns=[home_team, 'sts', away_team])
    driver.quit()
except:
    driver.quit()
# Handling the stats
home_stats = {}
away_stats = {}
home_series = stats[home_team]
away_series = stats[away_team]
stats_series = stats['sts']
for row in zip(home_series, stats_series, away_series):
    stat = row[1].replace(' ', '_').lower()
    home_stats[stat] = row[0]
    away_stats[stat] = row[2]
stats_check = ['Yellows', 'Reds', 'Short_Shots', 'Long_Shots', 'Target_Shots',
'Corners', 'Possesion', 'Centres', 'Attack_Passes', 'Fouls']
for stat in stats_check:
    if stat not in home_stats.keys():
        home_stats[stat] = 0
        away_stats[stat] = 0
# Storing the data
match = [date, matchday, home_team, away_team, home_score, away_score, home_htscore, away_htscore,
referee, attedance, home_stats['Yellows'], away_stats['Yellows'],home_stats['Reds'], away_stats['Reds'],
home_stats['Short_Shots'], away_stats['Short_Shots'], home_stats['Long_Shots'], away_stats['Long_Shots'],
home_stats['Target_Shots'], away_stats['Target_Shots'], home_stats['Corners'], away_stats['Corners'],
home_stats['Possesion'], away_stats['Possesion'], home_stats['Centres'], away_stats['Centres'],
home_stats['Attack_Passes'], away_stats['Attack_Passes'], home_stats['Fouls'], away_stats['Fouls']]
season.append(match)
# Exporting the data
columns = ['date', 'matchday', 'home_team', 'away_team', 'home_score', 'away_score',
'home_htscore', 'away_htscore', 'referee', 'attedance']
for stat in stats_check:
    columns.append(f'home_{stat}')
    columns.append(f'away_{stat}')
dataset = pd.DataFrame(season, columns=columns)
dataset.to_csv('Bundesliga_test.csv', index=False)
print('.csv file exported.')
print(f'Number of errors: {len(errors)}')
print('Errors:\n')
print(errors)
As chitown88 suggested, I tried debugging and finally spotted the problem: the keys of home_stats and away_stats are built with stat = row[1].replace(' ', '_').lower(), so they are all lowercase, while stats_check and the match list looked the stats up by capitalized names. I had to replace the values with all-lowercase keys in the lists below. Fixed and running smoothly.
match = [date, matchday, home_team, away_team, home_score, away_score, home_htscore, away_htscore, referee, attedance,
home_stats['possesion'], away_stats['possesion'], home_stats['corners'], away_stats['corners'],
home_stats['attack_passes'], away_stats['attack_passes'], home_stats['centres'], away_stats['centres'],
home_stats['short_shots'], away_stats['short_shots'], home_stats['long_shots'], away_stats['long_shots'],
home_stats['target_shots'], away_stats['target_shots'], home_stats['fouls'], away_stats['fouls'],
home_stats['yellows'], away_stats['yellows'], home_stats['reds'], away_stats['reds']]
and
stats_check = ['possesion', 'corners', 'attack_passes', 'centres',
'short_shots', 'long_shots', 'target_shots', 'fouls', 'yellows', 'reds']

extracting column contents only so that all columns for each row are in the same row using Python's BeautifulSoup

I have the following Python snippet in Jupyter Notebooks that works.
The challenge I have is to extract just the rows of columnar data.
Here's the snippet:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
page
soup = bs(page.content)
soup
allrows = soup.find_all("p")
print(allrows)
I'm a little unclear on what you are after, but I think it's each individual row of data from the URL provided.
I couldn't find a way to use BeautifulSoup to parse the data you are after, but I did find a way to separate the rows using .split().
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
soup = bs(page.content)
allrows = soup.find_all("p")
text = soup.text                  # turn soup into text
text_split = text.split('\n\n')   # split the page into 3 sections
data = text_split[2]              # rows of data

# create df column titles using the variable names listed on the page
col_titles = text_split[1].split('\n')
df = pd.DataFrame(columns=range(14))
df.columns = col_titles[1:]

# 'try/except' to catch the end of the index,
# loop through the text data building complete rows
try:
    complete_row = []
    n1 = 0  # used to track index
    n2 = 1
    rows = data.split('\n')
    for el in range(len(rows)):
        full_row = rows[n1] + rows[n2]
        complete_row.append(full_row)
        n1 = n1 + 2
        n2 = n2 + 2
except IndexError:
    print('end of loop')

# loop through rows of data, clean whitespace and append to df
for row in complete_row:
    elem = row.split(' ')
    df.loc[len(df)] = [el for el in elem if el]

# finished dataframe
df
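As a follow-up (my addition, not part of the original answer): the values land in the dataframe as strings, so a minimal sketch to coerce every column to numeric, assuming the parse above succeeded, could be:
# assumption: df was built by the snippet above and holds numeric text
df = df.apply(pd.to_numeric, errors='coerce')  # non-numeric cells become NaN
print(df.dtypes)
print(df.describe())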

scrape a table in a website with python (no table tag)

I'm trying to scrape the daily stock value of a product. This is the page: https://funds.ddns.net/f.php?isin=ES0110407097. And this is the code I'm trying:
import pandas as pd
from bs4 import BeautifulSoup

html_string = 'https://funds.ddns.net/f.php?isin=ES0110407097'
soup = BeautifulSoup(html_string, 'lxml')
new_table = pd.DataFrame(columns=range(0, 2), index=[0])
row_marker = 0
column_marker = 0
for row in soup.find_all('tr'):
    columns = soup.find_all('td')
    for column in columns:
        new_table.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1
print(new_table)
I would like to get in Python the same format I can see on the web, both the data and the numbers. How can I get it, please?
There's a simpler way for that particular page:
import requests
import pandas as pd
url = 'https://funds.ddns.net/f.php?isin=ES0110407097'
resp = requests.get(url)
new_table = pd.read_html(resp.text)[0]
print(new_table.head(5))
Output:
            0          1
0       FECHA     VL:EUR
1  2019-12-20  120170000
2  2019-12-19  119600000
3  2019-12-18  119420000
4  2019-12-17  119390000
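As a possible follow-up (my addition, not part of the original answer): the first row of that frame holds the real headers, so a small sketch to promote it and write the result to CSV, assuming the table keeps this shape:
# assumption: the first row of new_table contains the column names ('FECHA', 'VL:EUR')
new_table.columns = new_table.iloc[0]
new_table = new_table.iloc[1:].reset_index(drop=True)
new_table['FECHA'] = pd.to_datetime(new_table['FECHA'])   # parse the date column
new_table.to_csv('ES0110407097.csv', index=False)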

BeautifulSoup, Requests, Dataframe, extracting from <SPAN> and Saving to Excel

Python novice here again! Two questions:
1) Instead of saving to multiple tabs (currently saving each year to a tab named after the year), how can I save all this data into one sheet in Excel called "summary"?
2) ('div', class_="sidearm-schedule-game-result") returns the format "W, 1-0". How can I split "W, 1-0" into two columns, one containing "W" and the next column containing "1-0"?
Thanks so much
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
import openpyxl
import csv

year_id = ['2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']
lehigh_url = 'https://lehighsports.com/sports/mens-soccer/schedule/'
results = []
with requests.Session() as req:
    for year in range(2003, 2020):
        print(f"Extracting Year# {year}")
        url = req.get(f"{lehigh_url}{year}")
        if url.status_code == 200:
            soup = BeautifulSoup(url.text, 'lxml')
            rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
            sheet = pd.DataFrame()
            for row in rows:
                date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
                name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
                opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
                conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
                try:
                    result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
                except:
                    result = ''
                df = pd.DataFrame([[year, date, name, opp, conf, result]], columns=['year','date','opponent','list','conference','result'])
                sheet = sheet.append(df, sort=True).reset_index(drop=True)
            results.append(sheet)

def save_xls(list_dfs, xls_path):
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer, '%s' % year_id[n], index=False)
        writer.save()

save_xls(results, 'lehigh.xlsx')
Instead of creating a list of dataframes, you can append each sheet into one dataframe and write that to file with pandas. Then to split into two columns, just use .str.split() and split on the comma.
import requests
import pandas as pd
from bs4 import BeautifulSoup

year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
results = pd.DataFrame()
for year in year_id:
    url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
    print(url)
    lehigh = requests.get(url).text
    soup = BeautifulSoup(lehigh, 'lxml')
    rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
    sheet = pd.DataFrame()
    for row in rows:
        date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
        name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
        opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
        conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
        try:
            result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
        except:
            result = ''
        df = pd.DataFrame([[year, date, name, opp, conf, result]], columns=['year','date','opponent','list','conference','result'])
        sheet = sheet.append(df, sort=True).reset_index(drop=True)
    results = results.append(sheet, sort=True).reset_index(drop=True)

results['result'], results['score'] = results['result'].str.split(',', 1).str
results.to_excel('lehigh.xlsx')
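A side note from me (assuming a recent pandas release, not something the original answer covers): DataFrame.append and the .str tuple-unpacking idiom after split were removed in pandas 2.x, so on newer versions the accumulation and the final split could instead look like this sketch, where frames is a hypothetical list collecting each per-year sheet:
# 'frames' is a hypothetical list that collects each per-year 'sheet' DataFrame
results = pd.concat(frames, sort=True).reset_index(drop=True)
# expand=True splits "W, 1-0" into two separate columns
results[['result', 'score']] = results['result'].str.split(',', n=1, expand=True)
results.to_excel('lehigh.xlsx', index=False)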

how to fix the indexing error and scrape the data from a webpage

I want to scrape data from a webpage on the Wayback Machine using pandas. I used a string split to split some strings where present.
The URL for the webpage is this.
Here is my code:
import pandas as pd
url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm"
dfs = pd.read_html(url)
df = dfs[0]
idx = df[df[0] == '\xa0Next >>'].index[0]
# Error mentioned in comment happens on the above line.
cols = list(df.iloc[idx-1,:])
df.columns = cols
df = df[df['Const. No.'].notnull()]
df = df.loc[df['Const. No.'].str.isdigit()].reset_index(drop=True)
df = df.dropna(axis=1,how='all')
df['Leading Candidate'] = df['Leading Candidate'].str.split('i',expand=True)[0]
df['Leading Party'] = df['Leading Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Party'] = df['Trailing Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Candidate'] = df['Trailing Candidate'].str.split('iAssembly',expand=True)[0]
df.to_csv('Chhattisgarh_cand.csv', index=False)
The expected output from that webpage must be in CSV format.
You can use BeautifulSoup to extract the data. Pandas will help you process the data efficiently, but it's not meant for data extraction.
import pandas as pd
from bs4 import BeautifulSoup
import requests

response = requests.get('https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26')
soup = BeautifulSoup(response.text, 'lxml')
table_data = []
required_table = [table for table in soup.find_all('table') if 'Indian National Congress' in str(table)]
if required_table:
    for tr_tags in required_table[0].find_all('tr', {'style': 'font-size:12px;'}):
        td_data = []
        for td_tags in tr_tags.find_all('td'):
            td_data.append(td_tags.text.strip())
        table_data.append(td_data)

df = pd.DataFrame(table_data[1:])
# print(df.head())
df.to_csv("DataExport.csv", index=False)
You can expect a result like this in the pandas dataframe:
                0   1  ...       6                7
0        BILASPUR   5  ...  176436  Result Declared
1            DURG   7  ...   16848  Result Declared
2  JANJGIR-CHAMPA   3  ...  174961  Result Declared
3          KANKER  11  ...   35158  Result Declared
4           KORBA   4  ...    4265  Result Declared
The code below should get you the table at your URL ("Chhattisgarh Result Status") using a combination of BeautifulSoup and pandas; you can then save it as CSV:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26"
response = urllib.request.urlopen(url)
elect = response.read()
soup = BeautifulSoup(elect,"lxml")
res = soup.find_all('table')
df = pd.read_html(str(res[7]))
df[3]
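Since pd.read_html returns a list of dataframes, df here is that list and df[3] is the frame of interest. As a hedged follow-up from me (assuming the table index stays the same on that archived page), it can then be written to CSV like this:
# assumption: df[3] is still the 'Chhattisgarh Result Status' table parsed from res[7]
df[3].to_csv('Chhattisgarh_cand.csv', index=False)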
