Pandas and adding column and data to a table - python-3.x

Any idea how to add the division (j) to each row? I run the program and it loops through each division (divisions 1 through 5). I want to record which division each row belongs to. I have the headers 'Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment, Division' at the top of the table. Right now I can't tell which division each row came from because the Division column is blank. Thanks for your help.
import pandas as pd
max_page_num = 10

# Append the header row to results.csv before any scraped rows are written.
with open('results.csv', 'a', newline='') as f:
    f.write('Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment, Division\n')


def division():
    """Scrape the commitments search pages and append every table to results.csv.

    For each division id and each page number below ``max_page_num`` the first
    HTML table of the search result is read with ``pd.read_html`` and appended
    (without header or index) to ``results.csv``.

    As posted in the question, no division value is written, so the
    ``Division`` column of the CSV stays empty.
    """
    graduation = str(2020)
    area = "commitments"  # "commitments" or "clubplayer"
    # NOTE(review): range(1, 5) yields division ids 1-4 only, although the
    # question text mentions divisions 1 through 5 -- confirm the intended range.
    for j in range(1, 5):
        division_id = str(j)
        for i in range(max_page_num):
            print('page:', i)
            page_num = str(i)
            source = "https://www.topdrawersoccer.com/search/?query=&divisionId=" + division_id + "&genderId=m&graduationYear=" + graduation + "&playerRating=&pageNo=" + page_num + "&area=" + area + ""
            all_tables = pd.read_html(source)
            df = all_tables[0]
            print('items:', len(df))
            df.to_csv('results.csv', header=False, index=False, mode='a')


division()

Simply adding the column 'division' should do it if I understand correctly.
import pandas as pd
max_page_num = 10

# Append the header row to results.csv before any scraped rows are written.
with open('results.csv', 'a', newline='') as f:
    f.write('Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment, Division\n')


def division(first_division=1, last_division=5):
    """Scrape the commitments search pages and append them, tagged by division.

    Args:
        first_division: First division id to query (inclusive).
        last_division: Last division id to query (inclusive). The default of 5
            follows the question's "division 1 through 5"; the posted
            ``range(1, 5)`` stopped at division 4.

    For each division id and each page number below ``max_page_num`` the first
    HTML table of the search result is read, a ``division`` column holding the
    division id is added as the last column, and the rows are appended
    (without header or index) to ``results.csv``.
    """
    graduation = str(2020)
    area = "commitments"  # "commitments" or "clubplayer"
    for j in range(first_division, last_division + 1):
        division_id = str(j)
        for i in range(max_page_num):
            print('page:', i)
            page_num = str(i)
            source = "https://www.topdrawersoccer.com/search/?query=&divisionId=" + division_id + "&genderId=m&graduationYear=" + graduation + "&playerRating=&pageNo=" + page_num + "&area=" + area + ""
            all_tables = pd.read_html(source)
            df = all_tables[0]
            # Scalar assignment broadcasts the division id to every row, so it
            # lands under the 'Division' header written above.
            df['division'] = division_id
            print('items:', len(df))
            df.to_csv('results.csv', header=False, index=False, mode='a')


division()

Related

ValueError: Length of values (1) does not match length of index (50)

Hey there awesome peeps,
I am trying to retrieve trend information for keywords that I have in a list (1,000 keywords). To minimize the chance of getting blocked by Google, I use a cutoff of 50 keywords and a 10-second pause between requests. At the moment I get an error saying that the length of the values does not match the length of the index. It fails on this line:
df3['Trend'] = trends
If anyone can help I will really appreciate it.
Thanks!
!pip install pytrends
import pandas as pd
import json
import time
from pytrends.request import TrendReq
# --- Configuration -----------------------------------------------------------
get_gsc_file = "/content/Queries.csv"
sortby = "Clicks"
cutoff = 50      # number of top keywords to look up
pause = 10       # seconds to wait between Google Trends requests
timeframe = "today 3-m"
geo = "US"

# Keep only the `cutoff` rows with the highest `sortby` value.
df = pd.read_csv(get_gsc_file, encoding='utf-8')
df.sort_values(by=[sortby], ascending=False, inplace=True)
df = df[:cutoff]

d = {'Keyword': [], sortby: [], 'Trend': []}
df3 = pd.DataFrame(data=d)
keywords = []
trends = []
metric = df[sortby].tolist()
up = 0
down = 0
flat = 0
na = 0

# Exactly one entry is appended to BOTH `keywords` and `trends` per row, so the
# lists stay the same length as `metric`. If the try/except runs only once
# (outside the loop), `trends` ends up with a single entry and the assignment
# below fails with "Length of values (1) does not match length of index (50)".
for index, row in df.iterrows():
    keyword = row['Top queries']
    pytrends = TrendReq(hl='en-US', tz=360, retries=2, backoff_factor=0.1)
    kw_list = [keyword]
    pytrends.build_payload(kw_list, cat=0, timeframe=timeframe, geo=geo, gprop='')
    df2 = pytrends.interest_over_time()
    keywords.append(keyword)
    try:
        # Three overlapping 3-point averages over the last five samples.
        series = df2[keyword]
        trend1 = int((series.iloc[-5] + series.iloc[-4] + series.iloc[-3]) / 3)
        trend2 = int((series.iloc[-4] + series.iloc[-3] + series.iloc[-2]) / 3)
        trend3 = int((series.iloc[-3] + series.iloc[-2] + series.iloc[-1]) / 3)
    except (KeyError, IndexError, ValueError):
        # KeyError: no column for the keyword; IndexError: fewer than five
        # samples; ValueError: a NaN average cannot be converted to int.
        trends.append('N/A')
        na += 1
    else:
        if trend3 > trend2 and trend2 > trend1:
            trends.append('UP')
            up += 1
        elif trend3 < trend2 and trend2 < trend1:
            trends.append('DOWN')
            down += 1
        else:
            trends.append('FLAT')
            flat += 1
    time.sleep(pause)

df3['Keyword'] = keywords
df3['Trend'] = trends
df3[sortby] = metric
def colortable(val):
    """Return the CSS background declaration for a trend label.

    'DOWN', 'UP' and 'FLAT' map to lightcoral, lightgreen and lightblue;
    any other value gets a white background.
    """
    if val == 'DOWN':
        color = "lightcoral"
    elif val == 'UP':
        color = "lightgreen"
    elif val == 'FLAT':
        color = "lightblue"
    else:
        color = 'white'
    return 'background-color: %s' % color
# Colour every cell by its trend label; from here on df3 is a pandas Styler.
df3 = df3.style.applymap(colortable)
total = len(trends)


def _percent(count):
    """Return count as a percentage of total, rounded to 0 decimals.

    Returns 0.0 when no keyword was processed, instead of dividing by zero.
    """
    return round((count / total) * 100, 0) if total else 0.0


print("Up: " + str(up) + " | " + str(_percent(up)) + "%")
print("Down: " + str(down) + " | " + str(_percent(down)) + "%")
print("Flat: " + str(flat) + " | " + str(_percent(flat)) + "%")
print("N/A: " + str(na) + " | " + str(_percent(na)) + "%")
# Bare expression: displays the styled table when run as the last notebook cell line.
df3

Saving xlsx files that aren't corrupted via openpyxl

I am generating around 10000 xlsx files to run a Monte Carlo simulation using a program called AMPL.
To generate these files I am using the Python script below, which uses openpyxl. Each resulting xlsx file still has to be opened and re-saved ("Save as") under the same name before AMPL will recognize it.
I only know how to do this by hand but am looking into suggestions on:
1) What can I modify in the python script to avoid the file corruption so I don't have to save and replace this file by hand.
2) How to "Save as" a batch of xlsx files to the same name xlsx files.
Here is the code
"""
Created on Mon Mar 2 14:59:43 2020
USES OPENPYXL to generate mc tables WITH NAMED RANGE
#author: rsuthar
"""
import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy.stats as stats
#import seaborn as sns
for k in range(1, 2):
    # One workbook per Monte Carlo draw; saved as COL<k>.xlsx at the end.
    wb = openpyxl.Workbook()
    sheet = wb.active

    # named range for COL table so that AMPL can read it
    new_range = openpyxl.workbook.defined_name.DefinedName('COL', attr_text='Sheet!$A$1:$D$64')
    if hasattr(wb.defined_names, 'append'):
        # Older openpyxl: defined_names is list-like.
        wb.defined_names.append(new_range)
    else:
        # Newer openpyxl: defined_names is a dict keyed by name.
        wb.defined_names['COL'] = new_range

    # Probability
    # Storage temp as truncated normal
    # temperature as normal mean 55 with 5F variation
    storagetempfarenht = 55.4
    storagetempkelvin = (storagetempfarenht + 459.67) * (5.0 / 9.0)
    highesttemp = 60.8
    lowesttemp = 50
    sigma = ((highesttemp + 459.67) * (5.0 / 9.0)) - storagetempkelvin
    mu, sigma = storagetempkelvin, sigma
    lower, upper = mu - 2 * sigma, mu + 2 * sigma
    # rvs(size=1) returns a 1-element array; take the scalar so math.exp and
    # the %f formatting below receive a plain float.
    temp = float(stats.truncnorm.rvs((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma, size=1)[0])

    # Generate the color after each condition with temp uncertainty
    kterm = 0.0019 * math.exp((170604 / 8.314) * ((1 / 288.15) - (1 / temp)))
    # NOTE(review): the cells receive formula text only; openpyxl does not
    # compute formula results, so a reader that does not evaluate formulas may
    # see no values until Excel recalculates and re-saves the file -- possibly
    # why the manual "Save as" is needed. Confirm against how AMPL reads it.
    Hterm = '=16.949*EXP((-0.025)*(42 +((124-42)/(1+((EXP(%f*A2:A64*(124-42)))*(124-(16.949*EXP((-0.025)*C2:C64))/((16.949*EXP((-0.025)*C2:C64)-42))))))))' % kterm

    # First column: days 1-12 repeated five times, then days 13-15 once
    # (5 * 12 + 3 = 63 values, filling A2:A64).
    sheet['A1'] = 'DAYD'
    number_of_repeats = 5
    days = range(1, 13)
    current_cell_num = 2
    for repeat in range(number_of_repeats):
        for day in days:
            cell_string = 'A%d' % current_cell_num
            sheet[cell_string] = day
            current_cell_num = current_cell_num + 1
        if repeat == number_of_repeats - 1:
            for day in range(13, 16, 1):
                cell_string = 'A%d' % current_cell_num
                sheet[cell_string] = day
                current_cell_num = current_cell_num + 1

    # Second Column
    sheet['B1'] = 'CROP'
    for rowOfCellObjects in sheet['B2':'B64']:
        for cellObj in rowOfCellObjects:
            cellObj.value = 'TA'

    # Third Column: one quality value per block of rows.
    sheet['C1'] = 'QUAL'
    qual_blocks = [
        ('C2', 'C13', 2),
        ('C14', 'C25', 3),
        ('C26', 'C37', 4),
        ('C38', 'C49', 5),
        ('C50', 'C64', 1),
    ]
    for first_cell, last_cell, qual in qual_blocks:
        for rowOfCellObjects in sheet[first_cell:last_cell]:
            for cellObj in rowOfCellObjects:
                cellObj.value = qual

    # fourth Column: the same formula text in every row
    sheet['D1'] = 'COL'
    for rowOfCellObjects in sheet['D2':'D64']:
        for cellObj in rowOfCellObjects:
            cellObj.value = Hterm

    # save the file everytime
    wb.save(filename='COL' + str(k) + '.xlsx')

Bokeh charts unresponsive on rangeslider on_change

I am working on Bokeh charts for the first time. I have followed a few tutorials, but for some reason the update function does not work with the RangeSlider's on_change():
def make_data(df, start, end):
    """Build a ColumnDataSource from the rows whose 'ID' lies in [start, end].

    Args:
        df: DataFrame with 'ID', 'date' and 'capi' columns.
        start: Lowest 'ID' to keep (inclusive).
        end: Highest 'ID' to keep (inclusive).

    Returns:
        A ColumnDataSource with 'x' (the dates passed through ``dateTime``)
        and 'y' (the 'capi' values) for the selected rows.
    """
    in_range = (df['ID'] >= start) & (df['ID'] <= end)
    df1 = df[in_range]
    date = df1['date'].tolist()
    capi = df1['capi'].tolist()
    data = {'x': dateTime(date), 'y': capi}
    return ColumnDataSource(data)
def update(attr, old, new):
    """RangeSlider 'value' callback: reload the CSV and refresh the plotted data.

    Args:
        attr: Name of the changed property (unused).
        old: Previous slider value (unused).
        new: New slider value (unused; the slider is read directly).
    """
    # DataFrame.from_csv has been removed from pandas; read_csv replaces it.
    df = pd.read_csv("main_data.csv", index_col=None)
    df['ID'] = range(1, len(df) + 1)
    # The helper is make_data(df, start, end). The posted call used the
    # undefined name make_dataset with range_start/range_end keywords, which
    # raises NameError as soon as the callback runs.
    new_src = make_data(df, range_select.value[0], range_select.value[1])
    # Replace the whole data dict in one assignment so all columns change together.
    source.data = dict(new_src.data)
def make_plot(source):
    """Create the datetime line plot of the 'y' column against 'x' in source.

    Args:
        source: ColumnDataSource providing the 'x' and 'y' columns.

    Returns:
        The configured figure.
    """
    p1 = figure(x_axis_type="datetime", title="Stock Closing Prices")
    p1.grid.grid_line_alpha = 0.3
    p1.xaxis.axis_label = 'Date'
    p1.yaxis.axis_label = 'Price'
    # NOTE(review): newer Bokeh releases spell this keyword legend_label -- confirm
    # against the installed version.
    p1.line('x', 'y', source=source, color='#A6CEE3', legend='capi')
    return p1
# Slider over the row IDs; `ids` and `df` are defined outside this snippet.
range_select = RangeSlider(title="Date range", value=(ids[0], ids[100]), start=ids[0], end=ids[-1], step=1)
# Call update() whenever the selected range changes.
range_select.on_change('value', update)
# Initial data: rows with ID 1..1000.
source = make_data(df, 1, 1000)
p = make_plot(source)
controls = WidgetBox(range_select)
layout = column(controls, p)
tab = Panel(child=layout, title = 'Histogram')
tabs = Tabs(tabs = [tab])
# NOTE(review): show() renders a standalone page; Python on_change callbacks
# presumably only fire when the app runs under a Bokeh server (e.g. added via
# curdoc().add_root and started with `bokeh serve`) -- confirm.
show(tabs)
Can someone please point me in the right direction here?

Bokeh Sliders for stacked vbar to increase segment size & HoverTool

I am aiming in the below code to make a stacked bar chart with bokeh, appended with sliders so I can increase or decrease the size of each bar segment and shift the others in turn.
My issue right now is that it will not update when running from a Bokeh server. My guess is that Bokeh does not run the calculations again after the source is updated, or that I am getting a source conflict. (So far I have only implemented it for "Engineering"; I wanted to get that to work before I sort the rest out.)
Other things of note: I am using a deprecated technique of providing each glyph with bottom/top data as well as a source. This was done because it was the only way I could get the HoverTool to show.
The only way I have got this to work was to redraw the graph completely, I would be ok with this option, but it was stacking the graphs on top of each other. Is there a way to clear all previous graphs in Bokeh? Obviously I would prefer a solution which just alters the data and doesn't completely redraw the graph.
from bokeh.plotting import figure, show, curdoc
from bokeh.models import NumeralTickFormatter
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
from bokeh.layouts import widgetbox, column
from bokeh.models import CustomJS, Slider
from matplotlib import colors
import pandas as pd
import numpy as np
# Read Data
df = pd.read_csv('/home/mint/SAGD_Costs.csv')
# Master source
source = ColumnDataSource(df)
# Bar Tops Data: cumulative totals of the first row, in stacking order
engtop = source.data['Engineering'][0]
equiptop = engtop + source.data['Equipment'][0]
bulktop = equiptop + source.data['Bulk_Materials'][0]
inditop = bulktop + source.data['Indirects'][0]
labtop = inditop + source.data['Labour'][0]
# Per-segment sources for the HoverTool: y holds each segment's own size
engsource = ColumnDataSource(data=dict(x=[0], y=[engtop], desc=['Engineering']))
equipsource = ColumnDataSource(data=dict(x=[0], y=[equiptop - engtop], desc=['Equipment']))
bulksource = ColumnDataSource(data=dict(x=[0], y=[bulktop - equiptop], desc=['Bulk Materials']))
indisource = ColumnDataSource(data=dict(x=[0], y=[inditop - bulktop], desc=['Indirects']))
labsource = ColumnDataSource(data=dict(x=[0], y=[labtop - inditop], desc=['Labour']))
# HoverTool Label
# Tooltip fields refer to data-source columns with the '@' prefix; the '#'
# in the posted text would be displayed literally instead of the column value.
hover = HoverTool(
    tooltips=[
        ('Item', '@desc'),
        ('Cost', '@y{$ 0.00 a}'),
    ]
)
# Other Tools
TOOLS = 'box_zoom, box_select, resize, reset'
# Figure
p = figure(title="Capital Costs Breakdown", title_location="above", plot_width=600, plot_height=600, x_range=(-2, 2), tools=[TOOLS, hover])
# Plots: each bar gets literal bottom/top values plus a source for the hover text
engbar = p.vbar(x=source.data['Year'][0], width=2, bottom=0,
                top=engtop, alpha=0.75, color="darkslategrey", legend="Engineering", source=engsource)
equipbar = p.vbar(x=[source.data['Year'][0]], width=2, bottom=engtop,
                  top=equiptop, alpha=0.75, color="teal", legend="Equipment", source=equipsource)
bulkbar = p.vbar(x=[source.data['Year'][0]], width=2, bottom=equiptop,
                 top=bulktop, alpha=0.75, color="cyan", legend="Bulk Materials", source=bulksource)
indibar = p.vbar(x=[source.data['Year'][0]], width=2, bottom=bulktop,
                 top=inditop, alpha=0.75, color="powderblue", legend="Indirects", source=indisource)
labbar = p.vbar(x=[source.data['Year'][0]], width=2, bottom=inditop,
                top=labtop, alpha=0.75, color="lavender", legend="Labour", source=labsource)
# Format
p.yaxis[0].formatter = NumeralTickFormatter(format="$0,000")
# Set up widgets
eng_slider = Slider(start=5000000, end=100000000, value=40000000, step=5000000, title="Engineering")
def update_data(attrname, old, new):
    """Slider callback: apply the Engineering value and restack every bar.

    The bars were created with literal bottom/top values, so building a new
    ColumnDataSource (as the posted version did, into a local name) changes
    nothing on screen. Instead the stack is recomputed here and written onto
    the existing glyphs.

    Args:
        attrname: Name of the changed property (unused).
        old: Previous slider value (unused).
        new: New slider value (unused; the slider is read directly).
    """
    # Get the current slider value
    a = eng_slider.value
    # .loc avoids the chained assignment df['Engineering'][0] = a
    df.loc[0, 'Engineering'] = a
    # Recompute the running totals and move each segment accordingly.
    stack = [
        (engbar, 'Engineering'),
        (equipbar, 'Equipment'),
        (bulkbar, 'Bulk_Materials'),
        (indibar, 'Indirects'),
        (labbar, 'Labour'),
    ]
    bottom = 0.0
    for bar, column_name in stack:
        top = bottom + float(df.loc[0, column_name])
        bar.glyph.bottom = bottom
        bar.glyph.top = top
        bottom = top
    # Keep the hover value of the changed segment in sync.
    engsource.data = dict(x=[0], y=[a], desc=['Engineering'])
# Run update_data whenever a slider value changes.
for w in [eng_slider]:
    w.on_change('value', update_data)
# Set up layouts and add to document
inputs = widgetbox(eng_slider)
# Show!
curdoc().add_root(column(inputs, p))
curdoc().title = "Sliders"
Picture of Current Graph
Dataset
Not sure of the etiquette on answering your own question... It's mostly fixed; however, the HoverTool is not working correctly. As the HoverTool uses @y, it shows the running total of the stack at each item. I want it to show the difference (each segment's own size). Is it possible to calculate a value for the HoverTool?
In case my situation helps someone, The mistake I was making above is I was changing a value with the slider which had to be then passed through a calculation before being passed into the Glyph.
The correct way, is to do any calculations within the update function
If you come from Pandas & Matplotlib like me you may end up building in df column calls into your charts e.g. x = df['Column_name'][0]. When plotting with Glyphs in bokeh, I believe the correct way is to create a source with the data you want, so you can just pass x and y into your Glyph. See the: Master source, Get source data, Calculate Top & Bottom and New Sources from my code below.
# Read Data
df = pd.read_csv('/home/mint/SAGD_Costs.csv')
# Master source
source = ColumnDataSource(df)
# Get source data (first row of each cost column)
a = source.data['Engineering'][0]
b = source.data['Equipment'][0]
c = source.data['Bulk_Materials'][0]
d = source.data['Indirects'][0]
e = source.data['Labour'][0]
# Calculate Top & Bottom: each segment starts where the previous one ends
ab = 0
at = a
bb = at
bt = bb + b
cb = bt
ct = cb + c
db = ct
dt = db + d
eb = dt
et = eb + e
# New sources: column 'x' holds the segment bottom, column 'y' the segment top
engsource = ColumnDataSource(data=dict(x=[ab], y=[at], desc=['Engineering']))
equipsource = ColumnDataSource(data=dict(x=[bb], y=[bt], desc=['Equipment']))
bulksource = ColumnDataSource(data=dict(x=[cb], y=[ct], desc=['Bulk Materials']))
indisource = ColumnDataSource(data=dict(x=[db], y=[dt], desc=['Indirects']))
labsource = ColumnDataSource(data=dict(x=[eb], y=[et], desc=['Labour']))
# HoverTool Label
# Tooltip fields refer to data-source columns with the '@' prefix; the '#'
# in the posted text would be displayed literally instead of the column value.
hover = HoverTool(
    tooltips=[
        ('Item', '@desc'),
        ('Cost', '@y{$ 0.00 a}'),
    ]
)
# Other Tools
TOOLS = 'box_zoom, box_select, resize, reset'
# Figure
p = figure(title="Capital Costs Breakdown", title_location="above", plot_width=600, plot_height=600, x_range=(-2, 2), tools=[TOOLS, hover])
# Plots: bottom/top are read from each source's 'x'/'y' columns, so updating
# a source's data moves its bar
engbar = p.vbar(x=0, width=2, bottom='x',
                top='y', alpha=0.75, color="darkslategrey", legend="Engineering", source=engsource)
equipbar = p.vbar(x=0, width=2, bottom='x',
                  top='y', alpha=0.75, color="teal", legend="Equipment", source=equipsource)
bulkbar = p.vbar(x=0, width=2, bottom='x',
                 top='y', alpha=0.75, color="cyan", legend="Bulk Materials", source=bulksource)
indibar = p.vbar(x=0, width=2, bottom='x',
                 top='y', alpha=0.75, color="powderblue", legend="Indirects", source=indisource)
labbar = p.vbar(x=0, width=2, bottom='x',
                top='y', alpha=0.75, color="lavender", legend="Labour", source=labsource)
# Format
p.yaxis[0].formatter = NumeralTickFormatter(format="$0,000")
# Set up widgets
eng_slider = Slider(start=5000000, end=100000000, value=40000000, step=5000000, title="Engineering")
equip_slider = Slider(start=5000000, end=100000000, value=40000000, step=5000000, title="Equipment")
bulk_slider = Slider(start=5000000, end=100000000, value=40000000, step=5000000, title="Bulk_Materials")
indi_slider = Slider(start=5000000, end=100000000, value=40000000, step=5000000, title="Indirects")
lab_slider = Slider(start=5000000, end=100000000, value=40000000, step=5000000, title="Labour")
def update_data(attrname, old, new):
    """Slider callback: restack all five segments from the current slider values.

    Each segment's source gets 'x' (its bottom, the running total so far) and
    'y' (its top, the running total including the segment), so every bar above
    a changed segment shifts with it.

    Args:
        attrname: Name of the changed property (unused).
        old: Previous slider value (unused).
        new: New slider value (unused; all sliders are read directly).
    """
    # (source to update, hover label, current slider value) in stacking order
    segments = [
        (engsource, 'Engineering', eng_slider.value),
        (equipsource, 'Equipment', equip_slider.value),
        (bulksource, 'Bulk Materials', bulk_slider.value),
        (indisource, 'Indirects', indi_slider.value),
        (labsource, 'Labour', lab_slider.value),
    ]
    bottom = 0
    for segment_source, label, size in segments:
        top = bottom + size
        # Assigning a whole new dict updates all columns of the source at once.
        segment_source.data = dict(x=[bottom], y=[top], desc=[label])
        bottom = top
# Run update_data whenever any slider value changes.
for w in [eng_slider, equip_slider, bulk_slider, indi_slider, lab_slider]:
    w.on_change('value', update_data)
# Set up layouts and add to document
inputs = widgetbox(eng_slider, equip_slider, bulk_slider, indi_slider, lab_slider)
# Show!
curdoc().add_root(column(inputs, p))
curdoc().title = "Sliders"

Import excel data and keep date time

Thanks in advance for your help. I'm importing data from Excel using openpyxl, and I'd like to convert the time strings into datetime objects. Below is the code I'm using:
import openpyxl, pprint, datetime
print('Opening workbook...')
wb = openpyxl.load_workbook('ACLogs_test_Conv2.xlsx')
# Index the workbook by sheet title; get_sheet_by_name() is deprecated.
sheet = wb['Sheet1']
print(sheet)
# Nested mapping: name -> date -> {time: None}
ACLogsData = {}
print('Reading rows...')
# Row 1 is skipped (data is read from row 2 onwards).
for row in range(2, sheet.max_row + 1):
    pangalan = sheet['B' + str(row)].value
    dates = sheet['D' + str(row)].value
    time = sheet['E' + str(row)].value
    # setdefault creates each level on first sight and returns it, so the
    # three levels can be chained; the innermost call registers the time key.
    ACLogsData.setdefault(pangalan, {}).setdefault(dates, {}).setdefault(time)
Use strptime() from the datetime module to parse each time string:
import datetime

FMT = '%H:%M'  # Whatever format your times are in
for row in range(2, sheet.max_row + 1):
    pangalan = sheet['B' + str(row)].value
    dates = sheet['D' + str(row)].value
    # With `import datetime` the parser lives on the datetime.datetime class;
    # datetime.strptime on the module itself raises AttributeError.
    time = datetime.datetime.strptime(sheet['E' + str(row)].value, FMT)

Resources