I want to learn how to scrape a page using BeautifulSoup and write it to a csv file. When I start appending columns to the key in a dictionary all the values are appended to each key not just a single one.
I get the information I want:
[<td class="column-2">655</td>]
[<td class="column-2">660</td>]
[<td class="column-2">54</td>]
[<td class="column-2">241</td>]
Afterwards when I try to assign each value to a key I get:
{'date': ['14th November 2016'], 'total complaints': ['655', '660', '54', '241'], 'complaints': ['655', '660', '54', '241'], 'departures': ['655', '660', '54', '241'], 'arrivals': ['655', '660', '54', '241']}
Full code (csv writer is just for testing now):
import requests
from bs4 import BeautifulSoup as BS
import csv
operational_data_url = "http://heathrowoperationaldata.com/daily-operational-data/"
operational_data_page = requests.get(operational_data_url).text
soup = BS(operational_data_page, "html.parser")
data_div = soup.find_all("ul", class_="sub-menu")
list_items = data_div[0].find_all("li")
data_links = []
for menu in data_div:
list_items = menu.find_all("li")
for links in list_items:
data_link = links.find("a")
for page in data_links[:1]:
data_page = requests.get(page).text
soup = BS(data_page, "html.parser")
date = soup.find("title")
table = soup.find("tbody")
data = {
"date" : [],
"arrivals" : [],
"departures" : [],
"complaints" : [],
"total complaints" : [],
for day in date:
rows = table.find_all("tr", class_=["row-3", "row-4", "row-36", "row-37"])
for row in rows:
cols = row.find_all("td", class_="column-2")
data["arrivals"].append( cols[0].get_text() )
data["departures"].append( cols[0].get_text() )
data["complaints"].append( cols[0].get_text() )
data["total complaints"].append( cols[0].get_text() )
with open('test.csv', 'w') as test_file:
fields = ['date', 'arrivals', 'departures', 'complaints', 'total complaints']
writer = csv.DictWriter(test_file, fields)
row = {'date': day, 'arrivals': 655, 'departures': 660, 'complaints': 54, 'total complaints': 241 }
Thanks for any help!
When I start appending columns to the key in a dictionary all the values are appended to each key not just a single one.
Currently, your for row in rows: loop does this explicitly.
It appears to me that you want to be doing something like this instead:
# Select the four rows of interest; find_all returns them in document order,
# which is expected to be: arrivals, departures, complaints, total complaints.
rows = table.find_all("tr", class_=["row-3", "row-4", "row-36", "row-37"])
# Take the single "column-2" cell of each row, so cols[i] belongs to rows[i].
cols = [row.find_all("td", class_="column-2")[0] for row in rows]
# Append each cell to exactly one key instead of appending every cell to
# every key (which is what looping over the rows did).
data["arrivals"].append(cols[0].get_text())
data["departures"].append(cols[1].get_text())
data["complaints"].append(cols[2].get_text())
data["total complaints"].append(cols[3].get_text())
This will give you the following result for data:
{'date': [u'14th November 2016'], 'complaints': [u'54'], 'total complaints': [u'241'], 'departures': [u'660'], 'arrivals': [u'655']}
Note that this will only work if your rows are in the right order.
The code below gives me the following error:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 1 elements
on the df.columns = ["GP Practice Name"] line.
I tried
import pandas as pd
import requests
from bs4 import BeautifulSoup
postal_codes = ["2000", "2010", "2020", "2030", "2040"]
places_by_postal_code = {}
def get_places(postal_code):
    """Return the listing headings found on the search page for one postal code.

    Args:
        postal_code: Value interpolated into the ``clue`` query parameter of
            the Yellow Pages search URL.

    Returns:
        A list with the text of the ``<h2>`` inside every
        ``div.listing-content`` element. The list is empty when the page
        contains no such element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            a blocked or failed request is not mistaken for "no results".
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    url = f"https://www.yellowpages.com.au/search/listings?clue={postal_code}&locationClue=&latitude=&longitude=&selectedViewMode=list&refinements=category:General%20Practitioner&selectedSortType=distance"
    # Without a timeout requests can wait indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    places = soup.find_all("div", {"class": "listing-content"})
    # find() returns None for a listing without an <h2>; skip those instead
    # of failing with AttributeError on ``None.text``.
    headings = (place.find("h2") for place in places)
    return [heading.text for heading in headings if heading is not None]
for postal_code in postal_codes:
places = get_places(postal_code)
places_by_postal_code[postal_code] = places
df = pd.DataFrame.from_dict(places_by_postal_code, orient='index')
df.columns = ["GP Practice Name"]
df = pd.DataFrame(places_by_postal_code.values(), index=places_by_postal_code.keys(), columns=["GP Practice Name"])
and was expecting a list of GPs for the postcodes specified in the postal_codes variable.
I am trying to scrape graph data from the webpage: 'https://cawp.rutgers.edu/women-percentage-2020-candidates'
I tried the code below to extract data from the graph:
import requests
from bs4 import BeautifulSoup
Res = requests.get('https://cawp.rutgers.edu/women-percentage-2020-candidates').text
soup = BeautifulSoup(Res, "html.parser")
Values= [i.text for i in soup.findAll('g', {'class': 'igc-graph'}) if i]
Dates = [i.text for i in soup.findAll('g', {'class': 'igc-legend-entry'}) if i]
print(Values, Dates) ## both list are empty
Data= pd.DataFrame({'Value':Values,'Date':Dates}) ## Returning an Empty Dataframe
I want to extract the Date and Value from all four bar graphs. Can anyone suggest what I have to do to extract the graph data, or whether there is another method I can try? Thanks.
This graph is hosted at this URL: https://e.infogram.com/5bb50948-04b2-4113-82e6-5e5f06236538
You can find the Infogram ID (the path of the target URL) directly in the original page: look for the div with class infogram-embed and read the value of its data-id attribute:
<div class="infogram-embed" data-id="5bb50948-04b2-4113-82e6-5e5f06236538" data-title="Candidate Tracker 2020_US House_Proportions" data-type="interactive"> </div>
From this url, it loads a static JSON in javascript. You can use regex to extract it and parse the JSON structure to get row/column, and the different tables:
import requests
from bs4 import BeautifulSoup
import re
import json
original_url = "https://cawp.rutgers.edu/women-percentage-2020-candidates"
r = requests.get(original_url)
soup = BeautifulSoup(r.text, "html.parser")

# The embed div's data-id is the path of the Infogram page that hosts the chart.
embed = soup.find("div", {"class": "infogram-embed"})
infogram_url = f'https://e.infogram.com/{embed["data-id"]}'
r = requests.get(infogram_url)
soup = BeautifulSoup(r.text, "html.parser")

# Text of the first <script> tag that assigns window.infographicData.
script = [
    t
    for t in soup.findAll("script")
    if "window.infographicData" in t.text
][0].text

# Capture everything between "window.infographicData=" and the trailing ";"
# and parse it as JSON.
extract = re.search(r".*window\.infographicData=(.*);$", script)
data = json.loads(extract.group(1))
entities = data["elements"]["content"]["content"]["entities"]

# One (sheet names, sheet data) pair per entity that carries chart data.
tables = [
    (entity["props"]["chartData"]["sheetnames"], entity["props"]["chartData"]["data"])
    for entity in entities.values()
    if ("props" in entity) and ("chartData" in entity["props"])
]

# Flatten to one record per sheet. The first row of a sheet supplies the keys
# and the second row the values of its "table" mapping.
data = []
for sheet_names, sheets in tables:
    for i, sheet in enumerate(sheet_names):
        keys, values = sheets[i][0], sheets[i][1]
        data.append({
            "sheetName": sheet,
            "table": dict(zip(keys, values)),
        })
[{'sheetName': 'Sheet 1',
'table': {'': '2020', 'Districts Already Filed': '435'}},
{'sheetName': 'All',
'table': {'': 'Filed', '2016': '17.8%', '2018': '24.2%', '2020': '29.1%'}},
{'sheetName': 'Democrats Only',
'table': {'': 'Filed', '2016': '25.1%', '2018': '32.5%', '2020': '37.9%'}},
{'sheetName': 'Republicans Only',
'table': {'': 'Filed', '2016': '11.5%', '2018': '13.7%', '2020': '21.3%'}}]
I'm trying to scrape a website and get the output in an Excel file. I manage to create the Excel file but the columns are all messed up (please see the pictures).
How should I go about transferring the data correctly from the CSV file to the Excel file?
The code I used:
import requests
import pandas as pd
from bs4 import BeautifulSoup
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=34.05349000000007&lon=-118.24531999999999#.XsTs9RMzZTZ')
soup = BeautifulSoup(page.content, 'html.parser')
week = soup.find(id = 'seven-day-forecast-body')
items = week.find_all(class_='tombstone-container')
period_names = [item.find(class_='period-name').get_text() for item in items]
short_descriptions = [item.find(class_='short-desc').get_text() for item in items]
temperatures = [item.find(class_='temp').get_text() for item in items]
weather_stuff = pd.DataFrame(
'period' : period_names,
'short_descriptions' : short_descriptions,
'temperatures' : temperatures,
A minimalistic working example:
df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
index=['row 1', 'row 2'],
columns=['col 1', 'col 2'])
# To specify the sheet name:
df1.to_excel("output.xlsx", sheet_name='Sheet_name_1')
Source: Documentation
Python novice here again! 2 questions:
1) Instead of saving to multiple tabs (currently saving each year to a tab named after the year) how can I save all this data into one sheet in excel called "summary".
2) ('div',class_="sidearm-schedule-game-result") returns the format "W, 1-0". How can I split the "W, 1-0" into two columns, one containing "W" and the next column containing "1-0".
Thanks so much
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
import openpyxl
import csv
year_id = ['2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']
lehigh_url = 'https://lehighsports.com/sports/mens-soccer/schedule/'
results = []
with requests.Session() as req:
for year in range(2003, 2020):
print(f"Extracting Year# {year}")
url = req.get(f"{lehigh_url}{year}")
if url.status_code == 200:
soup = BeautifulSoup(url.text, 'lxml')
rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
sheet = pd.DataFrame()
for row in rows:
date = row.find('div',class_="sidearm-schedule-game-opponent-date").text.strip()
name = row.find('div',class_="sidearm-schedule-game-opponent-name").text.strip()
opp = row.find('div',class_="sidearm-schedule-game-opponent-text").text.strip()
conf = row.find('div',class_="sidearm-schedule-game-conference-conference").text.strip()
result = row.find('div',class_="sidearm-schedule-game-result").text.strip()
result = ''
df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
sheet = sheet.append(df,sort=True).reset_index(drop=True)
def save_xls(list_dfs, xls_path):
    """Write every DataFrame in ``list_dfs`` to its own sheet of one workbook.

    Args:
        list_dfs: Sequence of DataFrames; the frame at position ``n`` is
            written to the sheet named after ``year_id[n]``.
        xls_path: Path of the Excel file to create.

    Raises:
        IndexError: If ``list_dfs`` has more entries than the module-level
            ``year_id`` list.
    """
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            # sheet_name is passed by keyword: positional arguments after
            # the writer are deprecated in recent pandas releases.
            df.to_excel(writer, sheet_name=str(year_id[n]), index=False)
Instead of creating a list of dataframes, you can append each sheet into 1 dataframe and write that to file with pandas. Then to split into 2 columns, just use .str.split() and split on the comma.
import requests
import pandas as pd
from bs4 import BeautifulSoup
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']

# Collect one plain record per game and build the DataFrame once at the end.
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row copies it on every step.
games = []
for year in year_id:
    url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
    print (url)
    lehigh = requests.get(url).text
    soup = BeautifulSoup(lehigh,'lxml')
    rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
    for row in rows:
        date = row.find('div',class_="sidearm-schedule-game-opponent-date").text.strip()
        name = row.find('div',class_="sidearm-schedule-game-opponent-name").text.strip()
        opp = row.find('div',class_="sidearm-schedule-game-opponent-text").text.strip()
        conf = row.find('div',class_="sidearm-schedule-game-conference-conference").text.strip()
        # Fall back to an empty result only when the row has no result div;
        # the scraped value must not be overwritten unconditionally.
        result_div = row.find('div',class_="sidearm-schedule-game-result")
        result = result_div.text.strip() if result_div is not None else ''
        games.append([year,date,name,opp,conf,result])

# All years end up in a single DataFrame (one sheet's worth of data).
results = pd.DataFrame(games, columns=['year','date','opponent','list','conference','result'])

# Split "W, 1-0" on the first comma: the part before it stays in 'result',
# the remainder goes to 'score'. reindex guarantees both columns exist even
# when no value contains a comma (or when there are no rows at all).
parts = results['result'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
results['result'] = parts[0]
results['score'] = parts[1]
I have some code which compares two Excel files and determines any new rows (new_rows) added or any rows which were deleted (dropped_rows). It then uses xlsxwriter to write this to an Excel sheet. The bit of code I am having trouble with is that it is supposed to then iterate through the rows, and if the row was a new row or a dropped row it is supposed to format it a certain way. For whatever reason this part of the code isn't working correctly and is being ignored.
I've tried a whole host of different syntax to make this work but no luck.
After some more trial and error the issue seems to be caused by the index column. It is a Case Number column and the values have a prefix like "Case_123, Case_456, Case_789, etc..". This seems to be the root of the issue. But not sure how to solve for it.
grey_fmt = workbook.add_format({'font_color': '#E0E0E0'})
highlight_fmt = workbook.add_format({'font_color': '#FF0000', 'bg_color':'#B1B3B3'})
new_fmt = workbook.add_format({'font_color': '#32CD32','bold':True})
# set format over range
## highlight changed cells
worksheet.conditional_format('A1:J10000', {'type': 'text',
'criteria': 'containing',
'format': highlight_fmt})
# highlight new/changed rows
for row in range(dfDiff.shape[0]):
if row+1 in newRows:
worksheet.set_row(row+1, 15, new_fmt)
if row+1 in droppedRows:
worksheet.set_row(row+1, 15, grey_fmt)
The last part (# highlight new/changed rows) is the bit that is not working. The conditional format portion works fine.
the rest of the code:
import pandas as pd
from pathlib import Path
def excel_diff(path_OLD, path_NEW, index_col):
    """Compare two Excel files row by row and write a formatted diff workbook.

    Rows are matched on ``index_col``. The output file
    ``'<old stem> vs <new stem>.xlsx'`` contains a ``DIFF`` sheet plus a copy
    of the new and of the old data. On the ``DIFF`` sheet:

    * a cell whose value changed reads ``old→new`` and is highlighted,
    * rows present only in the new file are formatted as new,
    * rows present only in the old file are added back and greyed out.

    Args:
        path_OLD: Path object of the old workbook (``.stem`` is used for
            the output file name and a sheet name).
        path_NEW: Path object of the new workbook.
        index_col: Column used as the row index in both files.
    """
    df_OLD = pd.read_excel(path_OLD, index_col=index_col).fillna(0)
    df_NEW = pd.read_excel(path_NEW, index_col=index_col).fillna(0)

    # Perform Diff
    # object dtype lets an 'old→new' string be stored in a column that
    # originally held numbers.
    dfDiff = df_NEW.copy().astype(object)
    newRows = []

    cols_OLD = df_OLD.columns
    cols_NEW = df_NEW.columns
    sharedCols = list(set(cols_OLD).intersection(cols_NEW))

    for row in dfDiff.index:
        if row in df_OLD.index:
            for col in sharedCols:
                value_OLD = df_OLD.loc[row, col]
                value_NEW = df_NEW.loc[row, col]
                # Unchanged cells already hold the new value; only changed
                # cells are rewritten.
                if value_OLD != value_NEW:
                    dfDiff.loc[row, col] = ('{}→{}').format(value_OLD, value_NEW)
        else:
            newRows.append(row)

    # Rows that exist only in the old file are recorded and added back so
    # they show up (greyed out) in the diff.
    dropped = df_OLD.loc[~df_OLD.index.isin(df_NEW.index)]
    droppedRows = list(dropped.index)
    if not dropped.empty:
        dfDiff = pd.concat([dfDiff, dropped])

    dfDiff = dfDiff.sort_index().fillna('')
    print('\nNew Rows: {}'.format(newRows))
    print('Dropped Rows: {}'.format(droppedRows))

    # Save output and format
    fname = '{} vs {}.xlsx'.format(path_OLD.stem, path_NEW.stem)
    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(fname, engine='xlsxwriter') as writer:
        dfDiff.to_excel(writer, sheet_name='DIFF', index=True)
        df_NEW.to_excel(writer, sheet_name=path_NEW.stem, index=True)
        df_OLD.to_excel(writer, sheet_name=path_OLD.stem, index=True)

        # get xlsxwriter objects
        workbook = writer.book
        worksheet = writer.sheets['DIFF']

        # define formats
        grey_fmt = workbook.add_format({'font_color': '#E0E0E0'})
        highlight_fmt = workbook.add_format({'font_color': '#FF0000', 'bg_color': '#B1B3B3'})
        new_fmt = workbook.add_format({'font_color': '#32CD32', 'bold': True})

        # set format over range
        ## highlight changed cells: they are the ones containing the arrow
        worksheet.conditional_format('A1:J10000', {'type': 'text',
                                                   'criteria': 'containing',
                                                   'value': '→',
                                                   'format': highlight_fmt})

        # highlight new/changed rows
        # newRows/droppedRows hold index *labels* (e.g. "Case_123"), so
        # membership is tested with the label, not the row position.
        # Worksheet row 0 is the header, hence the +1 offset.
        new_labels = set(newRows)
        dropped_labels = set(droppedRows)
        for position, label in enumerate(dfDiff.index):
            if label in new_labels:
                worksheet.set_row(position + 1, 15, new_fmt)
            if label in dropped_labels:
                worksheet.set_row(position + 1, 15, grey_fmt)
def main():
    """Diff 'file1.xlsx' against 'file2.xlsx', keyed on the new file's first column."""
    old_workbook, new_workbook = Path('file1.xlsx'), Path('file2.xlsx')

    # The first column of the new workbook serves as the row index.
    key_column = pd.read_excel(new_workbook).columns[0]
    print('\nIndex column: {}\n'.format(key_column))

    excel_diff(old_workbook, new_workbook, key_column)
if __name__ == '__main__':