Why is plot returning "ValueError: could not convert string to float:" when a dataframe column of floats is being passed to the plot function? - python-3.x

I am trying to plot a dataframe I have created from an excel spreadsheet using either matplotlib or matplotlib and pandas ie. df.plot. However, python keeps returning a cannot convert string to float error. This is confusing since when I print the column of the dataframe it appears to be all float values.
I've tried printing the values of the dataframe column and using the pandas.plot syntax. I've also tried saving the column to a new variable.
import pandas as pd
from matplotlib import pyplot as plt
import glob
import openpyxl
import math
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Border, Side, Alignment
import seaborn as sns
import itertools
directory = 'E:\some directory'
#QA_directory = directory + '**/*COPY.xlsx'
wb = openpyxl.load_workbook(directory + '\\Calcs\\' + "excel file.xlsx", data_only = 'True')
plt.figure(figsize=(16,9))
axes = plt.axes()
plt.title('Drag Amplification', fontsize = 16)
plt.xlabel('Time (s)', fontsize = 14)
plt.ylabel('Cf', fontsize = 14)
d = pd.DataFrame()
n=[]
for sheets in wb.sheetnames:
if '2_1' in sheets and '2%' not in sheets and '44%' not in sheets:
name = sheets[:8]
print(name)
ws = wb[sheets]
data = ws.values
cols = next(data)[1:]
data = list(data)
idx = [r[0] for r in data]
data = (itertools.islice(r, 1, None) for r in data)
df = pd.DataFrame(data, index=idx, columns=cols)
df = df.dropna()
#x = df['x/l']
#y = df.Cf
print(df.columns)
print(df.Cf.values)
x=df['x/l'].values
plt.plot(x, df.Cf.values)
"""x = [wb[sheets].cell(row=row,column=1).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
print(x)
Cf = [wb[sheets].cell(row=row,column=6).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
d[name+ 'x'] = pd.DataFrame(x)
d[name + '_Cf'] = pd.Series(Cf, index=d.index)
print(name)"""
print(df)
plt.show()
I'm expecting a plot of line graphs with the values of x/l on the x access and Cf on the 'y' with a line for each of the relevant sheets in the workbook. Any insights as to why i am getting this error would be appreciated!

Related

How to add entire dataframe row as scatter plot annotation

I'm plotting two columns of a Pandas DataFrame on a scatterplot and I want each point to show all the row values of the DataFrame. I've looked at this post, and tried to do something similar with mplcursors:
import pandas as pd
from datetime import date, datetime, time, timedelta
import numpy as np
import matplotlib.pyplot as plt
from mplcursors import cursor
df = pd.DataFrame()
df['datetime'] = pd.date_range(start='2016-01-01', end='2016-01-14', freq='30T')
#df = df.set_index('datetime')
df['x1'] = np.random.randint(-30, 30, size=len(df))
df['x2'] = np.random.randint(-30, 20, size=len(df))
df['x3'] = np.random.randint(-20, 30, size=len(df))
df['y1'] = np.random.randint(-100, 100, size=len(df))
df['y2'] = np.random.randint(-300, 200, size=len(df))
df['y3'] = np.random.randint(-200, 300, size=len(df))
def conditions(s):
if (s['y1'] > 20) or (s['y3'] < 0):
return 'group1'
elif (s['x3'] < 20):
return 'group2'
elif (s['x2'] == 0):
return 'group3'
else:
return 'group4'
df['category'] = df.apply(conditions, axis=1)
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(121)
ax1.scatter(df.x1, df.y1, label='test1')
ax1.scatter(df.x2, df.y2, label='test2')
#cursor(hover=True)
ax1.set_xlabel('test1')
ax1.set_ylabel('test2')
ax1.legend(['test1','test2'])
cr1 = cursor(ax1,hover=True)
#ax1.annotation_names = df.columns.tolist()
cr1.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
ax2 = plt.subplot(122)
ax2.scatter(df.x1, df.y1, label='test1')
ax2.scatter(df.x3, df.y3, label='test3')
ax2.set_xlabel('test1')
ax2.set_ylabel('test3')
ax2.legend(['test1','test3'])
cr2 = cursor(ax2,hover=True)
#ax2.annotation_names = df.columns.tolist()
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
# save figure
import pickle
pickle.dump(fig, open('FigureObject.fig.pickle', 'wb'))
plt.show()
When I hover over a point, I want to see a label containing (for example):
datetime = 2016-01-01 00:00:00
x1 = 1
x2 = -4
x3 = 22
y1 = -42
y2 = -219
y3 = -158
category = group1
but I get this type of error:
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
IndexError: list index out of range
How do I fix it?
The IndexError occurs because of df.columns.tolist()[x.target.index]
df.columns.tolist() is a list of 7 columns, which is then indexed by [x.target.index].
df.iloc[x.target.index, :].to_dict() will get the desired row data for the point as a dict
A list comprehension creates a list of strings for each key value pair
'\n'.join(...) creates a string with each column separated by a \n
In mplcursors v0.5.1, Selection.target.index is deprecated, use Selection.index instead.
df.iloc[x.index, :] instead of df.iloc[x.target.index, :]
cr1.connect("add", lambda x: x.annotation.set_text('\n'.join([f'{k}: {v}' for k, v in df.iloc[x.index, :].to_dict().items()])))
Alternatively, use .to_string()
cr1.connect("add", lambda x: x.annotation.set_text(df.iloc[x.index, :].to_string()))

Apply function on a Pandas Dataframe

Apply function on a Pandas Dataframe
I have a code (C01) that calculates the moving averages (21 periods) of a given stock (individual) on the stock exchange (IBOV - B3-BRAZIL). Then I created a for loop where it determines that an asset is in an upward trend after 6 highs followed by moving averages (hypothesis, considering that there are more variables to determine this).
However, I want to do this loop for more than one asset, in this case C02, that is, it applies a function in each column of my code and returns only the name of the assets that are in an upward trend (in this case, the column name). I tried to turn the for loop into a function and apply that function using the pandas 'apply' to each column (axis = 1, I tried tbm axis = 'columns'). But I'm having an error creating the function. When I execute the function using apply, the message "ValueError: Lengths must match to compare" appears. How can I fix this?
Grateful for the attention.
import numpy as np
import pandas as pd
from pandas_datareader import data as wb
from mpl_finance import candlestick_ohlc
from pandas_datareader import data as wb
from datetime import datetime
import matplotlib.dates as mpl_dates
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#STOCK
ativo = 'WEGE3.SA'
acao2 = ativo.upper()
#START AND END ANALYSIS
inicio = '2020-1-1'
fim = '2021-1-27'
#MAKE DATAFRAME
df00 = wb.DataReader(acao2, data_source='yahoo', start=inicio, end=fim)
df00.index.names = ['Data']
df= df00.copy(deep=True)
df['Data'] = df.index.map(mdates.date2num)
# MOVING AVERAGE
df['ema21'] = df['Close'].ewm(span=21, adjust=False).mean()
df['ema72'] = df['Close'].ewm(span=72, adjust=False).mean()
#DF PLOT
df1=df
df2=df[-120:]
#TREND RULE
alta=1
for i in range(6):
if(df2.ema21[-i-1] < df2.ema21[-i-2]):
alta=0
baixa=1
for i in range(6):
if(df2.ema21[-i-1] > df2.ema21[-i-2]):
baixa=0
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#PLOT RESULTS
print("---------------------------------------")
print(a1)
print("---------------------------------------")
ohlc = df[['Data', 'Open', 'High', 'Low', 'Close']]
f1, ax = plt.subplots(figsize=(14, 8))
# plot the candlesticks
candlestick_ohlc(ax, ohlc.values, width=.6, colorup='green', colordown='red')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
label_ = acao2.upper() + ' EMA26'
label_2 = acao2.upper() + ' EMA09'
ax.plot(df.index, df1['ema21'], color='black', label=label_)
ax.plot(df.index, df1['ema72'], color='blue', label=label_)
ax.grid(False)
ax.legend()
ax.grid(True)
plt.title(acao2.upper() + ' : Gráfico Diário')
plt.show(block=True)
#C02
#START/END ANALISYS
inicio = '2020-1-1'
fim = '2021-1-27'
#STOCKS
ativos = ['SAPR11.SA','WEGE3.SA']
#DATAFRAME
mydata = pd.DataFrame()
for t in ativos:
mydata[t] = wb.DataReader(t, data_source='yahoo', start=inicio, end=fim)['Close']
df2 = mydata
#MOVING AVERAGE
df3 = df2.apply(lambda x: x.rolling(window=21).mean())
#MAKE FUNCTION
def trend(x):
tendencia_alta=1
for i in range(6):
if(df3.columns[-i-1:] > df3.columns[-i-2:]):
tendencia_alta=0
print()
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#TRYING TO APPLY THE FUNCTION IN EVERY DF3 COLUMN
df3.apply(trend, axis=1)´´´
something like:
def myfunc(x):
#do things here where x is the group of rows sent to function
#instead of df['column'], you'll use x['column']
#because you are passing the rows into x
return x
df.groupby('yourcolumn').apply(myfunc)

How do I create a Bokeh Select menu for a line plot for an indeterminate number of options?

I've been working on getting a select menu and Bokeh plot up and running on a dataset I'm working with. The dataset can be found here. I have no experience with JavaScript, but I believe my select menu isn't connected/-ing to my plot. Therefore, I have a plot outline, but no data displayed. As I run the script from the console with bokeh serve --show test.py, I get the first 7 notifications in my JS console. The last three (those in the red bracket in the screenshot) occur when I try and change to a different item in my select menu.
Goal: Display the plot of data for rows those id number ('ndc' in this example) is selected in the Select menu.
Here's my code (modified from this post) that I used to get started. This one was also used, as were a handful of others, and the Bokeh documentation itself.
import pandas as pd
from bokeh.io import curdoc, output_notebook, output_file
from bokeh.layouts import row, column
from bokeh.models import Select, DataRange1d, ColumnDataSource
from bokeh.plotting import figure
# output_notebook()
output_file('test.html')
def get_dataset(src, drug_id):
src.drop('Unnamed: 0', axis = 1, inplace = True)
df = src[src.ndc == drug_id].copy()
df['date'] = pd.to_datetime(df['date'])
df = df.set_index(['date'])
df.sort_index(inplace=True)
source = ColumnDataSource(data=df)
return source
def make_plot(source, title):
plot = figure(plot_width=800, plot_height = 800, tools="", x_axis_type = 'datetime', toolbar_location=None)
plot.xaxis.axis_label = 'Time'
plot.yaxis.axis_label = 'Price ($)'
plot.axis.axis_label_text_font_style = 'bold'
plot.x_range = DataRange1d(range_padding = 0.0)
plot.grid.grid_line_alpha = 0.3
plot.title.text = title
plot.line(x= 'date', y='nadac_per_unit', source=source)
return plot
def update_plot(attrname, old, new):
ver = vselect.value
plot.title.text = "Drug Prices"
src = get_dataset(df, ver)
source.date.update(src.date)
df = pd.read_csv('data/plotting_data.csv')
ver = '54034808' #Initial id number
cc = df['ndc'].astype(str).unique() #select-menu options
vselect = Select(value=ver, title='Drug ID', options=sorted((cc)))
source = get_dataset(df, ver)
plot = make_plot(source, "Drug Prices")
vselect.on_change('value', update_plot)
controls = row(vselect)
curdoc().add_root(row(plot, controls))
There were some problems in your code:
You want to drop the Unnamed: 0 column. This can only be done once and when you try this again it will throw an error since this column does not exist anymore.
The way you tried to filter the dataframe didn't work and would result in an empty dataframe. You can select rows based on a column value like this: df.loc[df['column_name'] == some_value]
Updating the ColumnDataSource object can be done by replacing source.data with the new data.
import pandas as pd
from bokeh.io import curdoc, output_notebook, output_file
from bokeh.layouts import row, column
from bokeh.models import Select, DataRange1d, ColumnDataSource
from bokeh.plotting import figure
output_notebook()
output_file('test.html')
def get_dataset(src, drug_id):
src.drop('Unnamed: 0', axis = 1, inplace = True)
df = src.loc[src['ndc'] == int(drug_id)]
df['date'] = pd.to_datetime(df['date'])
df = df.set_index(['date'])
df.sort_index(inplace=True)
source = ColumnDataSource(data=df)
return source
def make_plot(source, title):
plot = figure(plot_width=800, plot_height = 800, tools="", x_axis_type = 'datetime', toolbar_location=None)
plot.xaxis.axis_label = 'Time'
plot.yaxis.axis_label = 'Price ($)'
plot.axis.axis_label_text_font_style = 'bold'
plot.x_range = DataRange1d(range_padding = 0.0)
plot.grid.grid_line_alpha = 0.3
plot.title.text = title
plot.line(x= 'date', y='nadac_per_unit', source=source)
return plot
def update_plot(attrname, old, new):
ver = vselect.value
df1 = df.loc[df['ndc'] == int(new)]
df1['date'] = pd.to_datetime(df1['date'])
df1 = df1.set_index(['date'])
df1.sort_index(inplace=True)
newSource = ColumnDataSource(df1)
source.data = newSource.data
df = pd.read_csv('data/plotting_data.csv')
ver = '54034808' #Initial id number
cc = df['ndc'].astype(str).unique() #select-menu options
vselect = Select(value=ver, title='Drug ID', options=sorted((cc)))
source = get_dataset(df, ver)
plot = make_plot(source, "Drug Prices")
vselect.on_change('value', update_plot)
controls = row(vselect)
curdoc().add_root(row(plot, controls))

Locate columns in dataframe to graph

I have an excel file with 3 columns and 60 rows. The first column is the Date but I want to put that on the x axis and plot the other 2 against it. I need help locating the 2 other columns so i can enter it in ax1.plot() and ax2.plot().
I have tried to locate it by [:,1] but that doesnt work and I have tried to locate it by the name of the column. The second column is "S&P/TSX Composite index (^GSPTSE)" and the third column is "Bitcoin CAD (BTC-CAD)"
import pandas as pd
import matplotlib.pyplot as plt
InputData = pd.read_excel('Python_assignment_InputData.xlsx')
#InputData = InputData[:15]
"""
print("a)\n", InputData,"\n")
print("b)")
InputData['Date'] = InputData.DATE.dt.year
InputData['Year'] = pd.to_datetime(InputData.Date).dt.year
"""
#ax1 = InputData.iloc[:,1]
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot()
ax2.plot()
ax1.set_ylabel("TSX", color = 'b')
ax2.set_ylabel("BTC", color = 'g')
ax1.set_xlabel("Year")
plt.title("Question 6")
plt.show()

Plotly iplot() doesnt run within a function

I am trying to use iplot() within a function within Jupyter so that i can use a filter on the graph and have it change dynamically. The code works in a cell on its own like this
# Code for put by ticker
data = []
opPriceDic = priceToArray(getPuts(getOptionPricesByTicker('ABBV')))
for key, values in opPriceDic.items():
trace = go.Scatter(
x = numberOfDays,
y = values,
name = 'option',
line = dict(
width = 4)
)
data.append(trace)
# Edit the layout
layout = dict(title = 'Call prices for ' ,
xaxis = dict(title = 'Days to Expiration'),
yaxis = dict(title = 'Price '),
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='calls For ')
But once this is placed within a function the graph fails to load
def graph(ticker):
# Code for put by ticker
data = []
opPriceDic = priceToArray(getPuts(getOptionPricesByTicker(ticker)))
for key, values in opPriceDic.items():
trace = go.Scatter(
x = numberOfDays,
y = values,
name = 'option',
line = dict(
width = 4)
)
data.append(trace)
# Edit the layout
layout = dict(title = 'Call prices for ' ,
xaxis = dict(title = 'Days to Expiration'),
yaxis = dict(title = 'Price '),
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='calls For ')
But if I change the iplot() to plot() it calls the plotly API and opens a new tab with the graph displaying.
I am just wondering if anyone has noticed this before and may have come across a solution?
(if I am in the wrong area I will remove the post)
I have tried to use pandas data.reader calls to pull ticker data between a start and end date. The data.reader seems to work from within the function. In the question code, if the opPriceDic dictionary could be converted to a dataframe, then iplot() could plot it without use of layout and fig as below:
# Import libraries
import datetime
from datetime import date
import pandas as pd
import numpy as np
from plotly import __version__
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
init_notebook_mode(connected=True)
cf.go_offline()
# Create function that uses data.reader and iplot()
def graph(ticker):
# create sample data set
start = datetime.datetime(2006, 1, 1)
end = datetime.datetime(2016, 1, 1)
df = data.DataReader(ticker, 'morningstar', start, end)
df = df.reset_index()
df['numberOfDays'] = df.apply(lambda x: abs((datetime.datetime.now() - x['Date']).days), axis=1)
# call iplot within the function graph()
df.iplot(kind='line', x='numberOfDays', y='Close', xTitle='Days', yTitle='Value', title='Prices', width=4)

Resources