How to add entire dataframe row as scatter plot annotation - python-3.x

I'm plotting two columns of a Pandas DataFrame on a scatterplot and I want each point to show all the row values of the DataFrame. I've looked at this post, and tried to do something similar with mplcursors:
import pandas as pd
from datetime import date, datetime, time, timedelta
import numpy as np
import matplotlib.pyplot as plt
from mplcursors import cursor
df = pd.DataFrame()
df['datetime'] = pd.date_range(start='2016-01-01', end='2016-01-14', freq='30T')
#df = df.set_index('datetime')
df['x1'] = np.random.randint(-30, 30, size=len(df))
df['x2'] = np.random.randint(-30, 20, size=len(df))
df['x3'] = np.random.randint(-20, 30, size=len(df))
df['y1'] = np.random.randint(-100, 100, size=len(df))
df['y2'] = np.random.randint(-300, 200, size=len(df))
df['y3'] = np.random.randint(-200, 300, size=len(df))
def conditions(s):
if (s['y1'] > 20) or (s['y3'] < 0):
return 'group1'
elif (s['x3'] < 20):
return 'group2'
elif (s['x2'] == 0):
return 'group3'
else:
return 'group4'
df['category'] = df.apply(conditions, axis=1)
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(121)
ax1.scatter(df.x1, df.y1, label='test1')
ax1.scatter(df.x2, df.y2, label='test2')
#cursor(hover=True)
ax1.set_xlabel('test1')
ax1.set_ylabel('test2')
ax1.legend(['test1','test2'])
cr1 = cursor(ax1,hover=True)
#ax1.annotation_names = df.columns.tolist()
cr1.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
ax2 = plt.subplot(122)
ax2.scatter(df.x1, df.y1, label='test1')
ax2.scatter(df.x3, df.y3, label='test3')
ax2.set_xlabel('test1')
ax2.set_ylabel('test3')
ax2.legend(['test1','test3'])
cr2 = cursor(ax2,hover=True)
#ax2.annotation_names = df.columns.tolist()
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
# save figure
import pickle
pickle.dump(fig, open('FigureObject.fig.pickle', 'wb'))
plt.show()
When I hover over a point, I want to see a label containing (for example):
datetime = 2016-01-01 00:00:00
x1 = 1
x2 = -4
x3 = 22
y1 = -42
y2 = -219
y3 = -158
category = group1
but I get this type of error:
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
IndexError: list index out of range
How do I fix it?

The IndexError occurs because of df.columns.tolist()[x.target.index]
df.columns.tolist() is a list of 7 columns, which is then indexed by [x.target.index].
df.iloc[x.target.index, :].to_dict() will get the desired row data for the point as a dict
A list comprehension creates a list of strings for each key value pair
'\n'.join(...) creates a string with each column separated by a \n
In mplcursors v0.5.1, Selection.target.index is deprecated, use Selection.index instead.
df.iloc[x.index, :] instead of df.iloc[x.target.index, :]
cr1.connect("add", lambda x: x.annotation.set_text('\n'.join([f'{k}: {v}' for k, v in df.iloc[x.index, :].to_dict().items()])))
Alternatively, use .to_string()
cr1.connect("add", lambda x: x.annotation.set_text(df.iloc[x.index, :].to_string()))

Related

Trying to plot a rolling corr line chart but Matplot keeps saying to bring in only valid columns?

Im trying to create a rolling corr using matplot but I get the error "select only valid columns before calling the operation. Dropped columns were Index(['time'], dtype='object')
I have dropped that field from my data frame but the error keeps on appearing ?
Is it something to do with my .iloc argument?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import seaborn as sns
import scipy.stats as stats
import json
from datetime import timezone
from datetime import datetime
from pycoingecko import CoinGeckoAPI
pd.options.display.width = 0
def datetime_to_unix(year, month, day):
'''datetime_to_unix(2021, 6, 1) => 1622505600.0'''
dt = datetime(year, month, day)
timestamp = (dt - datetime(1970, 1, 1)).total_seconds()
return timestamp
def unix_to_datetime(unix_time):
'''unix_to_datetime(1622505700)=> ''2021-06-01 12:01am'''''
ts = int(unix_time/1000 if len(str(unix_time)) > 10 else unix_time) # /1000 handles milliseconds
return datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %l:%M%p').lower()
# Initialize the client
cg = CoinGeckoAPI()
# Retrieve looksrare data in USD
result = cg.get_coin_market_chart_range_by_id(
id='looksrare',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df = pd.DataFrame({'time':time, 'price':price,})
df.head(100)
# Retrieve ETH data in USD
result = cg.get_coin_market_chart_range_by_id(
id='ethereum',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df2 = pd.DataFrame({'time':time, 'price':price,})
df2.head(100)
df_cd = pd.merge(df, df2, how='inner', on='time')
df_cd = df_cd.drop('time', 1)
output = df_cd.corr()
output1 = df_cd['price_x'].corr(df_cd['price_y'])
overall_pearson_r = df_cd.corr().iloc[0,1]
print(df_cd)
print(f"Pandas computed Pearson r: {overall_pearson_r}")
r, p = stats.pearsonr(df_cd.dropna()['price_x'], df_cd.dropna()['price_y'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# compute rolling window synchrony
f,ax=plt.subplots(figsize=(7,3))
df.rolling(window=30,center=True).median().plot(ax=ax)
ax.set(xlabel='Time',ylabel='Pearson r')
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");

Apply function on a Pandas Dataframe

Apply function on a Pandas Dataframe
I have a code (C01) that calculates the moving averages (21 periods) of a given stock (individual) on the stock exchange (IBOV - B3-BRAZIL). Then I created a for loop where it determines that an asset is in an upward trend after 6 highs followed by moving averages (hypothesis, considering that there are more variables to determine this).
However, I want to do this loop for more than one asset, in this case C02, that is, it applies a function in each column of my code and returns only the name of the assets that are in an upward trend (in this case, the column name). I tried to turn the for loop into a function and apply that function using the pandas 'apply' to each column (axis = 1, I tried tbm axis = 'columns'). But I'm having an error creating the function. When I execute the function using apply, the message "ValueError: Lengths must match to compare" appears. How can I fix this?
Grateful for the attention.
import numpy as np
import pandas as pd
from pandas_datareader import data as wb
from mpl_finance import candlestick_ohlc
from pandas_datareader import data as wb
from datetime import datetime
import matplotlib.dates as mpl_dates
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#STOCK
ativo = 'WEGE3.SA'
acao2 = ativo.upper()
#START AND END ANALYSIS
inicio = '2020-1-1'
fim = '2021-1-27'
#MAKE DATAFRAME
df00 = wb.DataReader(acao2, data_source='yahoo', start=inicio, end=fim)
df00.index.names = ['Data']
df= df00.copy(deep=True)
df['Data'] = df.index.map(mdates.date2num)
# MOVING AVERAGE
df['ema21'] = df['Close'].ewm(span=21, adjust=False).mean()
df['ema72'] = df['Close'].ewm(span=72, adjust=False).mean()
#DF PLOT
df1=df
df2=df[-120:]
#TREND RULE
alta=1
for i in range(6):
if(df2.ema21[-i-1] < df2.ema21[-i-2]):
alta=0
baixa=1
for i in range(6):
if(df2.ema21[-i-1] > df2.ema21[-i-2]):
baixa=0
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#PLOT RESULTS
print("---------------------------------------")
print(a1)
print("---------------------------------------")
ohlc = df[['Data', 'Open', 'High', 'Low', 'Close']]
f1, ax = plt.subplots(figsize=(14, 8))
# plot the candlesticks
candlestick_ohlc(ax, ohlc.values, width=.6, colorup='green', colordown='red')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
label_ = acao2.upper() + ' EMA26'
label_2 = acao2.upper() + ' EMA09'
ax.plot(df.index, df1['ema21'], color='black', label=label_)
ax.plot(df.index, df1['ema72'], color='blue', label=label_)
ax.grid(False)
ax.legend()
ax.grid(True)
plt.title(acao2.upper() + ' : Gráfico Diário')
plt.show(block=True)
#C02
#START/END ANALISYS
inicio = '2020-1-1'
fim = '2021-1-27'
#STOCKS
ativos = ['SAPR11.SA','WEGE3.SA']
#DATAFRAME
mydata = pd.DataFrame()
for t in ativos:
mydata[t] = wb.DataReader(t, data_source='yahoo', start=inicio, end=fim)['Close']
df2 = mydata
#MOVING AVERAGE
df3 = df2.apply(lambda x: x.rolling(window=21).mean())
#MAKE FUNCTION
def trend(x):
tendencia_alta=1
for i in range(6):
if(df3.columns[-i-1:] > df3.columns[-i-2:]):
tendencia_alta=0
print()
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#TRYING TO APPLY THE FUNCTION IN EVERY DF3 COLUMN
df3.apply(trend, axis=1)´´´
something like:
def myfunc(x):
#do things here where x is the group of rows sent to function
#instead of df['column'], you'll use x['column']
#because you are passing the rows into x
return x
df.groupby('yourcolumn').apply(myfunc)

Daily data of same month over years

I have data from the same month over period of time and I trying to plot the mean by day of the motnh but I don´t know how to do it.
This is how the dataframe looks like
The main code to get the dataframe:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
from datetime import date, timedelta
from datetime import datetime
inicio = date(1973, 1, 1)
#inicio = date(2019, 2, 15)
#final = date(2000, 10, 10)
final = date(1974, 3, 1)
delta = timedelta(days=1)
años=[]
links=[]
while inicio <= final:
fechas=inicio.strftime("%Y-%m-%d")
#años.append(datetime.strptime(fechas, '%Y-%m-%d').date())
años.append(fechas)
url='http://weather.uwyo.edu/cgi-bin/sounding?region=samer&TYPE=TEXT%3ALIST&YEAR={}&MONTH={}&FROM={}12&TO={}12&STNM=80222'.format(fechas[0:4],fechas[5:7],fechas[8:10],fechas[8:10])
links.append(url)
inicio += delta
d = dict(zip(años, links))
df1=pd.DataFrame(list(d.items()), columns=['Fecha', 'url'])
df1.set_index('Fecha', inplace=True)
Enero=pd.DataFrame()
Febrero=pd.DataFrame()
for i in df1.index:
if i[5:7]=='01':
Enero = Enero.append(df1.loc[i], ignore_index=False)
elif i[5:7]=='02':
Febrero = Febrero.append(df1.loc[i], ignore_index=False)
labels = ['PRES', 'HGHT', 'TEMP', 'DWPT', 'RELH', 'MIXR', 'DRCT', 'SKNT', 'THTA', 'THTE', 'THTV']
def reques(url):
try:
results = []
peticion=requests.get(url)
soup=bs(peticion.content, 'lxml')
pre = (soup.select_one('pre')).text
for line in pre.split('\n')[4:-1]:
#print (line)
if '--' not in line:
row = [line[i:i+7].strip() for i in range(0, len(line), 7)]
results.append(row)
else:
pass
df5=pd.DataFrame.from_records(results, columns=labels)
#return x
return df5
except AttributeError:
pass
SuperDF = pd.DataFrame()
SuperDF = pd.DataFrame(columns=labels)
startTime = datetime.now()
sin_datos=[]
for i in Febrero['url']:
try:
x=reques(i)
df2=x
y=str(df1[df1['url']==i].index.values)
df2.index = [y] * len(x)
SuperDF=SuperDF.append(x)
except TypeError:
sin_datos.append(df1[df1['url']==i].index.values)
print (df1[df1['url']==i].index.values)
SuperDF.index= SuperDF.index.map(lambda x: x.lstrip("['").rstrip("]''"))
SuperDF.index = pd.to_datetime(SuperDF.index)
SuperDF=SuperDF.apply(pd.to_numeric)
SuperDF
I've been trying to do it whit this
import seaborn as sns
SuperDF = SuperDF[(SuperDF['TEMP']==0)]
ax = SuperDF.loc['02', 'RELH'].plot(marker='o', linestyle='-')
ax.set_ylabel('RELH');
but I got this error
KeyError: '02'
It works when i pass the year but i need the mean by day for the month. Any help will be appreciate.
This is what I need

Why is plot returning "ValueError: could not convert string to float:" when a dataframe column of floats is being passed to the plot function?

I am trying to plot a dataframe I have created from an excel spreadsheet using either matplotlib or matplotlib and pandas ie. df.plot. However, python keeps returning a cannot convert string to float error. This is confusing since when I print the column of the dataframe it appears to be all float values.
I've tried printing the values of the dataframe column and using the pandas.plot syntax. I've also tried saving the column to a new variable.
import pandas as pd
from matplotlib import pyplot as plt
import glob
import openpyxl
import math
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Border, Side, Alignment
import seaborn as sns
import itertools
directory = 'E:\some directory'
#QA_directory = directory + '**/*COPY.xlsx'
wb = openpyxl.load_workbook(directory + '\\Calcs\\' + "excel file.xlsx", data_only = 'True')
plt.figure(figsize=(16,9))
axes = plt.axes()
plt.title('Drag Amplification', fontsize = 16)
plt.xlabel('Time (s)', fontsize = 14)
plt.ylabel('Cf', fontsize = 14)
d = pd.DataFrame()
n=[]
for sheets in wb.sheetnames:
if '2_1' in sheets and '2%' not in sheets and '44%' not in sheets:
name = sheets[:8]
print(name)
ws = wb[sheets]
data = ws.values
cols = next(data)[1:]
data = list(data)
idx = [r[0] for r in data]
data = (itertools.islice(r, 1, None) for r in data)
df = pd.DataFrame(data, index=idx, columns=cols)
df = df.dropna()
#x = df['x/l']
#y = df.Cf
print(df.columns)
print(df.Cf.values)
x=df['x/l'].values
plt.plot(x, df.Cf.values)
"""x = [wb[sheets].cell(row=row,column=1).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
print(x)
Cf = [wb[sheets].cell(row=row,column=6).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
d[name+ 'x'] = pd.DataFrame(x)
d[name + '_Cf'] = pd.Series(Cf, index=d.index)
print(name)"""
print(df)
plt.show()
I'm expecting a plot of line graphs with the values of x/l on the x access and Cf on the 'y' with a line for each of the relevant sheets in the workbook. Any insights as to why i am getting this error would be appreciated!

How to read from CSV file

I am trying to understand how Kalman Filter for non-linear system works. While searching for an example, I cam across this good basic example.
import numpy as np
import pylab as pl
import pandas as pd
from pykalman import UnscentedKalmanFilter
# initialize parameters
def transition_function(state, noise):
a = np.sin(state[0]) + state[1] * noise[0]
b = state[1] + noise[1]
return np.array([a, b])
def observation_function(state, noise):
C = np.array([[-1, 0.5], [0.2, 0.1]])
return np.dot(C, state) + noise
transition_covariance = np.eye(2)
random_state = np.random.RandomState(0)
observation_covariance = np.eye(2) + random_state.randn(2, 2) * 0.1
initial_state_mean = [0, 0]
initial_state_covariance = [[1, 0.1], [-0.1, 1]]
# sample from model
kf = UnscentedKalmanFilter(
transition_function, observation_function,
transition_covariance, observation_covariance,
initial_state_mean, initial_state_covariance,
random_state=random_state
)
states, observations = kf.sample(50, initial_state_mean)
# estimate state with filtering and smoothing
filtered_state_estimates = kf.filter(observations)[0]
smoothed_state_estimates = kf.smooth(observations)[0]
# draw estimates
pl.figure()
lines_true = pl.plot(states, color='b')
lines_filt = pl.plot(filtered_state_estimates, color='r', ls='-')
lines_smooth = pl.plot(smoothed_state_estimates, color='g', ls='-.')
pl.legend((lines_true[0], lines_filt[0], lines_smooth[0]),
('true', 'filt', 'smooth'),
loc='lower left'
)
pl.show()
This code produces the following graph.
However,for my experiment - I have created a very small time series data ready with three columns formatted as follows. The full dataset is attached here for reproduciability.
time X Y
0.040662 1.041667 1
0.139757 1.760417 2
0.144357 1.190104 1
0.145341 1.047526 1
0.145401 1.011882 1
0.148465 1.002970 1
.... ..... .
Instead of using the random values as shown in the code, how can we input from the CSV file I attached? Here is my approach, but it doesn't seem to workout for me and I would appreciate for any help.
df = pd.read_csv('testdata.csv')
pd.set_option('use_inf_as_null', True)
df.dropna(inplace=True)
X = df.drop('Y', axis=1)
y = df['Y']
d1= np.array(X)
d2 = np.array(y)
From the link I shared, here is how you get the CSV data into Numpy Arrays.
import numpy as np
import csv
with open('testdata.csv','r') as csvfile:
r = csv.reader(csvfile, delimiter=',')
data = [i for i in r]
headings = data.pop(0)
data = np.array([[np.float(j) for j in i] for i in data])
T = data.T[0] #Time
X = data.T[1] #X
Y = data.T[2] #Y
print(T)
print(X)
print(Y)

Resources