Apply function on a Pandas Dataframe - python-3.x

Apply function on a Pandas Dataframe
I have a code (C01) that calculates the moving averages (21 periods) of a given stock (individual) on the stock exchange (IBOV - B3-BRAZIL). Then I created a for loop where it determines that an asset is in an upward trend after 6 highs followed by moving averages (hypothesis, considering that there are more variables to determine this).
However, I want to do this loop for more than one asset, in this case C02, that is, it applies a function in each column of my code and returns only the name of the assets that are in an upward trend (in this case, the column name). I tried to turn the for loop into a function and apply that function using the pandas 'apply' to each column (axis = 1, I tried tbm axis = 'columns'). But I'm having an error creating the function. When I execute the function using apply, the message "ValueError: Lengths must match to compare" appears. How can I fix this?
Grateful for the attention.
import numpy as np
import pandas as pd
from pandas_datareader import data as wb
from mpl_finance import candlestick_ohlc
from pandas_datareader import data as wb
from datetime import datetime
import matplotlib.dates as mpl_dates
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#STOCK
ativo = 'WEGE3.SA'
acao2 = ativo.upper()
#START AND END ANALYSIS
inicio = '2020-1-1'
fim = '2021-1-27'
#MAKE DATAFRAME
df00 = wb.DataReader(acao2, data_source='yahoo', start=inicio, end=fim)
df00.index.names = ['Data']
df= df00.copy(deep=True)
df['Data'] = df.index.map(mdates.date2num)
# MOVING AVERAGE
df['ema21'] = df['Close'].ewm(span=21, adjust=False).mean()
df['ema72'] = df['Close'].ewm(span=72, adjust=False).mean()
#DF PLOT
df1=df
df2=df[-120:]
#TREND RULE
alta=1
for i in range(6):
if(df2.ema21[-i-1] < df2.ema21[-i-2]):
alta=0
baixa=1
for i in range(6):
if(df2.ema21[-i-1] > df2.ema21[-i-2]):
baixa=0
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#PLOT RESULTS
print("---------------------------------------")
print(a1)
print("---------------------------------------")
ohlc = df[['Data', 'Open', 'High', 'Low', 'Close']]
f1, ax = plt.subplots(figsize=(14, 8))
# plot the candlesticks
candlestick_ohlc(ax, ohlc.values, width=.6, colorup='green', colordown='red')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
label_ = acao2.upper() + ' EMA26'
label_2 = acao2.upper() + ' EMA09'
ax.plot(df.index, df1['ema21'], color='black', label=label_)
ax.plot(df.index, df1['ema72'], color='blue', label=label_)
ax.grid(False)
ax.legend()
ax.grid(True)
plt.title(acao2.upper() + ' : Gráfico Diário')
plt.show(block=True)
#C02
#START/END ANALISYS
inicio = '2020-1-1'
fim = '2021-1-27'
#STOCKS
ativos = ['SAPR11.SA','WEGE3.SA']
#DATAFRAME
mydata = pd.DataFrame()
for t in ativos:
mydata[t] = wb.DataReader(t, data_source='yahoo', start=inicio, end=fim)['Close']
df2 = mydata
#MOVING AVERAGE
df3 = df2.apply(lambda x: x.rolling(window=21).mean())
#MAKE FUNCTION
def trend(x):
tendencia_alta=1
for i in range(6):
if(df3.columns[-i-1:] > df3.columns[-i-2:]):
tendencia_alta=0
print()
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#TRYING TO APPLY THE FUNCTION IN EVERY DF3 COLUMN
df3.apply(trend, axis=1)´´´

something like:
def myfunc(x):
#do things here where x is the group of rows sent to function
#instead of df['column'], you'll use x['column']
#because you are passing the rows into x
return x
df.groupby('yourcolumn').apply(myfunc)

Related

How to add entire dataframe row as scatter plot annotation

I'm plotting two columns of a Pandas DataFrame on a scatterplot and I want each point to show all the row values of the DataFrame. I've looked at this post, and tried to do something similar with mplcursors:
import pandas as pd
from datetime import date, datetime, time, timedelta
import numpy as np
import matplotlib.pyplot as plt
from mplcursors import cursor
df = pd.DataFrame()
df['datetime'] = pd.date_range(start='2016-01-01', end='2016-01-14', freq='30T')
#df = df.set_index('datetime')
df['x1'] = np.random.randint(-30, 30, size=len(df))
df['x2'] = np.random.randint(-30, 20, size=len(df))
df['x3'] = np.random.randint(-20, 30, size=len(df))
df['y1'] = np.random.randint(-100, 100, size=len(df))
df['y2'] = np.random.randint(-300, 200, size=len(df))
df['y3'] = np.random.randint(-200, 300, size=len(df))
def conditions(s):
if (s['y1'] > 20) or (s['y3'] < 0):
return 'group1'
elif (s['x3'] < 20):
return 'group2'
elif (s['x2'] == 0):
return 'group3'
else:
return 'group4'
df['category'] = df.apply(conditions, axis=1)
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(121)
ax1.scatter(df.x1, df.y1, label='test1')
ax1.scatter(df.x2, df.y2, label='test2')
#cursor(hover=True)
ax1.set_xlabel('test1')
ax1.set_ylabel('test2')
ax1.legend(['test1','test2'])
cr1 = cursor(ax1,hover=True)
#ax1.annotation_names = df.columns.tolist()
cr1.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
ax2 = plt.subplot(122)
ax2.scatter(df.x1, df.y1, label='test1')
ax2.scatter(df.x3, df.y3, label='test3')
ax2.set_xlabel('test1')
ax2.set_ylabel('test3')
ax2.legend(['test1','test3'])
cr2 = cursor(ax2,hover=True)
#ax2.annotation_names = df.columns.tolist()
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
# save figure
import pickle
pickle.dump(fig, open('FigureObject.fig.pickle', 'wb'))
plt.show()
When I hover over a point, I want to see a label containing (for example):
datetime = 2016-01-01 00:00:00
x1 = 1
x2 = -4
x3 = 22
y1 = -42
y2 = -219
y3 = -158
category = group1
but I get this type of error:
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
IndexError: list index out of range
How do I fix it?
The IndexError occurs because of df.columns.tolist()[x.target.index]
df.columns.tolist() is a list of 7 columns, which is then indexed by [x.target.index].
df.iloc[x.target.index, :].to_dict() will get the desired row data for the point as a dict
A list comprehension creates a list of strings for each key value pair
'\n'.join(...) creates a string with each column separated by a \n
In mplcursors v0.5.1, Selection.target.index is deprecated, use Selection.index instead.
df.iloc[x.index, :] instead of df.iloc[x.target.index, :]
cr1.connect("add", lambda x: x.annotation.set_text('\n'.join([f'{k}: {v}' for k, v in df.iloc[x.index, :].to_dict().items()])))
Alternatively, use .to_string()
cr1.connect("add", lambda x: x.annotation.set_text(df.iloc[x.index, :].to_string()))

Reading in multiple files in Python and saving them one by one in a different directory

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter = ', ', newline = "\n")
I am trying to read in all files in a directory, they are then low pass filtered. After that the results are saved one after the other but not in one file. The result gives each files with 3 columns and ideally I would like them to named with headers e.g. col1, col2, col3.
Without using glob, I can filter all my files individually but I have more than 100 such files.
Any help would be appreciated.
best wishes,
I have partially solved the issue apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape = (101,1))
# tn = np.zeros(shape = (101,1))
#ynew = []
yn = np.zeros(shape = (101,1))
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
print(filename)
foo = filename.split("/")[-1]
#df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
#print(ynew)
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
#print (output)
tn = np.linspace(0, 100, 101) # new time vector for the new time-normalized data
yn, tn, indie = tnorma(output, k=3, smooth =1, mask = None, show = False)
np.savetxt("path_name/foldername/file"+ foo, yn, delimiter = ', ', newline = "\n")
However, I am having difficulty in putting header names on the 3 columns per file.

Why is plot returning "ValueError: could not convert string to float:" when a dataframe column of floats is being passed to the plot function?

I am trying to plot a dataframe I have created from an excel spreadsheet using either matplotlib or matplotlib and pandas ie. df.plot. However, python keeps returning a cannot convert string to float error. This is confusing since when I print the column of the dataframe it appears to be all float values.
I've tried printing the values of the dataframe column and using the pandas.plot syntax. I've also tried saving the column to a new variable.
import pandas as pd
from matplotlib import pyplot as plt
import glob
import openpyxl
import math
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Border, Side, Alignment
import seaborn as sns
import itertools
directory = 'E:\some directory'
#QA_directory = directory + '**/*COPY.xlsx'
wb = openpyxl.load_workbook(directory + '\\Calcs\\' + "excel file.xlsx", data_only = 'True')
plt.figure(figsize=(16,9))
axes = plt.axes()
plt.title('Drag Amplification', fontsize = 16)
plt.xlabel('Time (s)', fontsize = 14)
plt.ylabel('Cf', fontsize = 14)
d = pd.DataFrame()
n=[]
for sheets in wb.sheetnames:
if '2_1' in sheets and '2%' not in sheets and '44%' not in sheets:
name = sheets[:8]
print(name)
ws = wb[sheets]
data = ws.values
cols = next(data)[1:]
data = list(data)
idx = [r[0] for r in data]
data = (itertools.islice(r, 1, None) for r in data)
df = pd.DataFrame(data, index=idx, columns=cols)
df = df.dropna()
#x = df['x/l']
#y = df.Cf
print(df.columns)
print(df.Cf.values)
x=df['x/l'].values
plt.plot(x, df.Cf.values)
"""x = [wb[sheets].cell(row=row,column=1).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
print(x)
Cf = [wb[sheets].cell(row=row,column=6).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
d[name+ 'x'] = pd.DataFrame(x)
d[name + '_Cf'] = pd.Series(Cf, index=d.index)
print(name)"""
print(df)
plt.show()
I'm expecting a plot of line graphs with the values of x/l on the x access and Cf on the 'y' with a line for each of the relevant sheets in the workbook. Any insights as to why i am getting this error would be appreciated!

How to have a chart multiple columns continuously by iterating through a data-frame with matplotlib

BACKGROUND INFORMATION:
I have dataframe of x many stocks with y price sets (closing & 3 day SMA), (currently this is 5 and 2 respectively (one is closing price, the other is a 3 day Simple Moving Average SMA).
The current output is [2781 rows x 10 columns] with a ranging data set start_date = '2006-01-01' till end_date = '2016-12-31'. The output is as follows as a dataframe print(df):
CURRENT OUTPUT:
ANZ Price ANZ 3 day SMA CBA Price CBA 3 day SMA MQG Price MQG 3 day SMA NAB Price NAB 3 day SMA WBC Price WBC 3 day SMA
Date
2006-01-02 23.910000 NaN 42.569401 NaN 66.558502 NaN 30.792999 NaN 22.566401 NaN
2006-01-03 24.040001 NaN 42.619099 NaN 66.086403 NaN 30.935699 NaN 22.705400 NaN
2006-01-04 24.180000 24.043334 42.738400 42.642300 66.587997 66.410967 31.078400 30.935699 22.784901 22.685567
2006-01-05 24.219999 24.146667 42.708599 42.688699 66.558502 66.410967 30.964300 30.992800 22.794800 22.761700
... ... ... ... ... ... ... ... ... ... ...
2016-12-27 87.346667 30.670000 30.706666 32.869999 32.729999 87.346667 30.670000 30.706666 32.869999 32.729999
2016-12-28 87.456667 31.000000 30.773333 32.980000 32.829999 87.456667 31.000000 30.773333 32.980000 32.829999
2016-12-29 87.520002 30.670000 30.780000 32.599998 32.816666 87.520002 30.670000 30.780000 32.599998 32.816666
MY WORKING CODE:
#!/usr/bin/python3
from pandas_datareader import data
import pandas as pd
import itertools as it
import os
import numpy as np
import fix_yahoo_finance as yf
import matplotlib.pyplot as plt
yf.pdr_override()
stock_list = sorted(["ANZ.AX", "WBC.AX", "MQG.AX", "CBA.AX", "NAB.AX"])
number_of_decimal_places = 8
moving_average_period = 3
def get_moving_average(df, stock_name):
df2 = df.rolling(window=moving_average_period).mean()
df2.rename(columns={stock_name: stock_name.replace("Price", str(moving_average_period) + " day SMA")}, inplace=True)
df = pd.concat([df, df2], axis=1, join_axes=[df.index])
return df
# Function to get the closing price of the individual stocks
# from the stock_list list
def get_closing_price(stock_name, specific_close):
symbol = stock_name
start_date = '2006-01-01'
end_date = '2016-12-31'
df = data.get_data_yahoo(symbol, start_date, end_date)
sym = symbol + " "
print(sym * 10)
df = df.drop(['Open', 'High', 'Low', 'Adj Close', 'Volume'], axis=1)
df = df.rename(columns={'Close': specific_close})
# https://stackoverflow.com/questions/16729483/converting-strings-to-floats-in-a-dataframe
# df[specific_close] = df[specific_close].astype('float64')
# print(type(df[specific_close]))
return df
# Creates a big DataFrame with all the stock's Closing
# Price returns the DataFrame
def get_all_close_prices(directory):
count = 0
for stock_name in stock_list:
specific_close = stock_name.replace(".AX", "") + " Price"
if not count:
prev_df = get_closing_price(stock_name, specific_close)
prev_df = get_moving_average(prev_df, specific_close)
else:
new_df = get_closing_price(stock_name, specific_close)
new_df = get_moving_average(new_df, specific_close)
# https://stackoverflow.com/questions/11637384/pandas-join-merge-concat-two-dataframes
prev_df = prev_df.join(new_df)
count += 1
# prev_df.to_csv(directory)
df = pd.DataFrame(prev_df, columns=list(prev_df))
df = df.apply(pd.to_numeric)
convert_df_to_csv(df, directory)
return df
def convert_df_to_csv(df, directory):
df.to_csv(directory)
def main():
# FINDS THE CURRENT DIRECTORY AND CREATES THE CSV TO DUMP THE DF
csv_in_current_directory = os.getcwd() + "/stock_output.csv"
csv_in_current_directory_dow_distribution = os.getcwd() + "/dow_distribution.csv"
# FUNCTION THAT GETS ALL THE CLOSING PRICES OF THE STOCKS
# AND RETURNS IT AS ONE COMPLETE DATAFRAME
df = get_all_close_prices(csv_in_current_directory)
print(df)
# Main line of code
if __name__ == "__main__":
main()
QUESTION:
From this df I want to create x many lines graphs (one graph per stock) with y many lines (price, and SMAs). How can I do this with matplotlib? Could this be done with a for loop and save the individuals plots as the loop gets iterated? If so how?
First import import matplotlib.pyplot as plt.
Then it depends whether you want x many individual plots or one plot with x many subplots:
Individual plots
df.plot(y=[0,1])
df.plot(y=[2,3])
df.plot(y=[4,5])
df.plot(y=[6,7])
df.plot(y=[8,9])
plt.show()
You can also save the individual plots in a loop:
for i in range(0,9,2):
df.plot(y=[i,i+1])
plt.savefig('{}.png'.format(i))
Subplots
fig, axes = plt.subplots(nrows=2, ncols=3)
df.plot(ax=axes[0,0],y=[0,1])
df.plot(ax=axes[0,1],y=[2,3])
df.plot(ax=axes[0,2],y=[4,5])
df.plot(ax=axes[1,0],y=[6,7])
df.plot(ax=axes[1,1],y=[8,9])
plt.show()
See https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html for options to customize your plot(s).
The best approach is to make a function that is dependent on the size of your lists x and y. Thereby the function should be as follows:
def generate_SMA_graphs(df):
columnNames = list(df.head(0))
print("CN:\t", columnNames)
print(len(columnNames))
count = 0
for stock in stock_list:
stock_iter = count * (len(moving_average_period_list) + 1)
sma_iter = stock_iter + 1
for moving_average_period in moving_average_period_list:
fig = plt.figure()
df.plot(y=[columnNames[stock_iter], columnNames[sma_iter]])
plt.xlabel('Time')
plt.ylabel('Price ($)')
graph_title = columnNames[stock_iter] + " vs. " + columnNames[sma_iter]
plt.title(graph_title)
plt.grid(True)
plt.savefig(graph_title.replace(" ", "") + ".png")
print("\t\t\t\tCompleted: ", graph_title)
plt.close(fig)
sma_iter += 1
count += 1
With the code above, irrespective how ever long either list is (for x or y, stock list or SMA list) the above function will generate a graph comparing the original price with every SMA for that given stock.

Plotly iplot() doesnt run within a function

I am trying to use iplot() within a function within Jupyter so that i can use a filter on the graph and have it change dynamically. The code works in a cell on its own like this
# Code for put by ticker
data = []
opPriceDic = priceToArray(getPuts(getOptionPricesByTicker('ABBV')))
for key, values in opPriceDic.items():
trace = go.Scatter(
x = numberOfDays,
y = values,
name = 'option',
line = dict(
width = 4)
)
data.append(trace)
# Edit the layout
layout = dict(title = 'Call prices for ' ,
xaxis = dict(title = 'Days to Expiration'),
yaxis = dict(title = 'Price '),
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='calls For ')
But once this is placed within a function the graph fails to load
def graph(ticker):
# Code for put by ticker
data = []
opPriceDic = priceToArray(getPuts(getOptionPricesByTicker(ticker)))
for key, values in opPriceDic.items():
trace = go.Scatter(
x = numberOfDays,
y = values,
name = 'option',
line = dict(
width = 4)
)
data.append(trace)
# Edit the layout
layout = dict(title = 'Call prices for ' ,
xaxis = dict(title = 'Days to Expiration'),
yaxis = dict(title = 'Price '),
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='calls For ')
But if I change the iplot() to plot() it calls the plotly API and opens a new tab with the graph displaying.
I am just wondering if anyone has noticed this before and may have come across a solution?
(if I am in the wrong area I will remove the post)
I have tried to use pandas data.reader calls to pull ticker data between a start and end date. The data.reader seems to work from within the function. In the question code, if the opPriceDic dictionary could be converted to a dataframe, then iplot() could plot it without use of layout and fig as below:
# Import libraries
import datetime
from datetime import date
import pandas as pd
import numpy as np
from plotly import __version__
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
init_notebook_mode(connected=True)
cf.go_offline()
# Create function that uses data.reader and iplot()
def graph(ticker):
# create sample data set
start = datetime.datetime(2006, 1, 1)
end = datetime.datetime(2016, 1, 1)
df = data.DataReader(ticker, 'morningstar', start, end)
df = df.reset_index()
df['numberOfDays'] = df.apply(lambda x: abs((datetime.datetime.now() - x['Date']).days), axis=1)
# call iplot within the function graph()
df.iplot(kind='line', x='numberOfDays', y='Close', xTitle='Days', yTitle='Value', title='Prices', width=4)

Resources