Trying to plot a rolling corr line chart but Matplot keeps saying to bring in only valid columns? - python-3.x

Im trying to create a rolling corr using matplot but I get the error "select only valid columns before calling the operation. Dropped columns were Index(['time'], dtype='object')
I have dropped that field from my data frame but the error keeps on appearing ?
Is it something to do with my .iloc argument?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import seaborn as sns
import scipy.stats as stats
import json
from datetime import timezone
from datetime import datetime
from pycoingecko import CoinGeckoAPI
pd.options.display.width = 0
def datetime_to_unix(year, month, day):
'''datetime_to_unix(2021, 6, 1) => 1622505600.0'''
dt = datetime(year, month, day)
timestamp = (dt - datetime(1970, 1, 1)).total_seconds()
return timestamp
def unix_to_datetime(unix_time):
'''unix_to_datetime(1622505700)=> ''2021-06-01 12:01am'''''
ts = int(unix_time/1000 if len(str(unix_time)) > 10 else unix_time) # /1000 handles milliseconds
return datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %l:%M%p').lower()
# Initialize the client
cg = CoinGeckoAPI()
# Retrieve looksrare data in USD
result = cg.get_coin_market_chart_range_by_id(
id='looksrare',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df = pd.DataFrame({'time':time, 'price':price,})
df.head(100)
# Retrieve ETH data in USD
result = cg.get_coin_market_chart_range_by_id(
id='ethereum',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df2 = pd.DataFrame({'time':time, 'price':price,})
df2.head(100)
df_cd = pd.merge(df, df2, how='inner', on='time')
df_cd = df_cd.drop('time', 1)
output = df_cd.corr()
output1 = df_cd['price_x'].corr(df_cd['price_y'])
overall_pearson_r = df_cd.corr().iloc[0,1]
print(df_cd)
print(f"Pandas computed Pearson r: {overall_pearson_r}")
r, p = stats.pearsonr(df_cd.dropna()['price_x'], df_cd.dropna()['price_y'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# compute rolling window synchrony
f,ax=plt.subplots(figsize=(7,3))
df.rolling(window=30,center=True).median().plot(ax=ax)
ax.set(xlabel='Time',ylabel='Pearson r')
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");

Related

Locate an id in Dataframe using constraint on columns percentile

I am trying to do a Weighted Aged Historical Var based on the below Dataframe. I would like to identify the ID in my dataframe corresponding to the 5% quantile of the 'Weight_Age_Cumul' column (like in the below example i found on internet)
enter image description here
I ve tryied the following line of code but i get the following error message : 'DataFrame' object has no attribute 'idmax'
cac_df_sorted[cac_df_sorted.Weight_Age_Cumul]<=0.05].CAC_Log_returns.idmax()
enter image description here
If you can help me on that it you be great, thank you
full code below if needed :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from tabulate import tabulate
from scipy.stats import norm
import yfinance as yf
from yahoofinancials import YahooFinancials
import sys
cac_df = yf.download('^FCHI',
start='2020-04-01',
end='2022-05-31',
progress=False,
)
cac_df.head()
cac_df = cac_df.drop(columns=['Open','High','Low','Close','Volume'])
#convertion into retuns
cac_df['Adj Close_-1'] = cac_df['Adj Close'].shift(1)
cac_df['CAC_Log_returns'] = np.log(cac_df['Adj Close']/cac_df['Adj Close_-1'])
cac_df.index = pd.to_datetime(cac_df.index, format = '%Y-%m-%d').strftime('%Y-%m-%d')
#plot CAC returns graph & histogram
cac_df['CAC_Log_returns'].plot(kind='line',figsize=(15,7))
plt.show()
cac_df['CAC_Log_returns'].hist(bins=40,normed=True,histtype='stepfilled',alpha=0.5)
plt.xlabel('Returns')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
#Historical Var Constant weight & Age Weighted & Vol Weighted
cac_df_sorted = cac_df.copy()
cac_df_sorted.sort_values(by=['Date'],inplace=True,ascending = False)
#Weight for Var Age weighted
lamb = 0.98
n = len(cac_df_sorted['CAC_Log_returns'])
weight_age= []
weight_age = [(lamb**(i-1) * (1-lamb))/(1-lamb**n)for i in range(1, n+1)]
#design of the dataframe
cac_df_sorted['Weight_Age'] = weight_age
cac_df_sorted.sort_values(by=['CAC_Log_returns'],inplace=True,ascending = True)
cac_df_sorted['Weight_Age_Cumul'] = np.cumsum(weight_age)
#Historical Var Constant weight
Var_95_1d_CW = -cac_df_sorted['CAC_Log_returns'].quantile(0.05)
Var_99_1d_CW = -cac_df_sorted['CAC_Log_returns'].quantile(0.01)
#from Var1d to Var10d
mean = np.mean(cac_df['CAC_Log_returns'])
Var_95_10d_CW =(np.sqrt(10)*Var_95_1d_CW)+(mean *(np.sqrt(10)-10))
Var_99_10d_CW = (np.sqrt(10)*Var_99_1d_CW) +(mean *(np.sqrt(10)-10))
print(tabulate([['95%',Var_95_1d_CW,Var_95_10d_CW],['99%',Var_99_1d_CW,Var_99_10d_CW]], headers= ['Confidence Level', 'Value at Risk 1 day Constant Weight','Value at Risk 10 days Constant Weight']))
print(cac_df_sorted)
# Historical Var Age weighted
#Find where cumulative (percentile) hits 0.05 and 0.01
cac_df_sorted[cac_df_sorted['Weight_Age_Cumul']<=0.05].CAC_Log_returns.idmax()

How to add entire dataframe row as scatter plot annotation

I'm plotting two columns of a Pandas DataFrame on a scatterplot and I want each point to show all the row values of the DataFrame. I've looked at this post, and tried to do something similar with mplcursors:
import pandas as pd
from datetime import date, datetime, time, timedelta
import numpy as np
import matplotlib.pyplot as plt
from mplcursors import cursor
df = pd.DataFrame()
df['datetime'] = pd.date_range(start='2016-01-01', end='2016-01-14', freq='30T')
#df = df.set_index('datetime')
df['x1'] = np.random.randint(-30, 30, size=len(df))
df['x2'] = np.random.randint(-30, 20, size=len(df))
df['x3'] = np.random.randint(-20, 30, size=len(df))
df['y1'] = np.random.randint(-100, 100, size=len(df))
df['y2'] = np.random.randint(-300, 200, size=len(df))
df['y3'] = np.random.randint(-200, 300, size=len(df))
def conditions(s):
if (s['y1'] > 20) or (s['y3'] < 0):
return 'group1'
elif (s['x3'] < 20):
return 'group2'
elif (s['x2'] == 0):
return 'group3'
else:
return 'group4'
df['category'] = df.apply(conditions, axis=1)
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(121)
ax1.scatter(df.x1, df.y1, label='test1')
ax1.scatter(df.x2, df.y2, label='test2')
#cursor(hover=True)
ax1.set_xlabel('test1')
ax1.set_ylabel('test2')
ax1.legend(['test1','test2'])
cr1 = cursor(ax1,hover=True)
#ax1.annotation_names = df.columns.tolist()
cr1.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
ax2 = plt.subplot(122)
ax2.scatter(df.x1, df.y1, label='test1')
ax2.scatter(df.x3, df.y3, label='test3')
ax2.set_xlabel('test1')
ax2.set_ylabel('test3')
ax2.legend(['test1','test3'])
cr2 = cursor(ax2,hover=True)
#ax2.annotation_names = df.columns.tolist()
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
# save figure
import pickle
pickle.dump(fig, open('FigureObject.fig.pickle', 'wb'))
plt.show()
When I hover over a point, I want to see a label containing (for example):
datetime = 2016-01-01 00:00:00
x1 = 1
x2 = -4
x3 = 22
y1 = -42
y2 = -219
y3 = -158
category = group1
but I get this type of error:
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
IndexError: list index out of range
How do I fix it?
The IndexError occurs because of df.columns.tolist()[x.target.index]
df.columns.tolist() is a list of 7 columns, which is then indexed by [x.target.index].
df.iloc[x.target.index, :].to_dict() will get the desired row data for the point as a dict
A list comprehension creates a list of strings for each key value pair
'\n'.join(...) creates a string with each column separated by a \n
In mplcursors v0.5.1, Selection.target.index is deprecated, use Selection.index instead.
df.iloc[x.index, :] instead of df.iloc[x.target.index, :]
cr1.connect("add", lambda x: x.annotation.set_text('\n'.join([f'{k}: {v}' for k, v in df.iloc[x.index, :].to_dict().items()])))
Alternatively, use .to_string()
cr1.connect("add", lambda x: x.annotation.set_text(df.iloc[x.index, :].to_string()))

Reading in multiple files in Python and saving them one by one in a different directory

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter = ', ', newline = "\n")
I am trying to read in all files in a directory, they are then low pass filtered. After that the results are saved one after the other but not in one file. The result gives each files with 3 columns and ideally I would like them to named with headers e.g. col1, col2, col3.
Without using glob, I can filter all my files individually but I have more than 100 such files.
Any help would be appreciated.
best wishes,
I have partially solved the issue apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape = (101,1))
# tn = np.zeros(shape = (101,1))
#ynew = []
yn = np.zeros(shape = (101,1))
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
print(filename)
foo = filename.split("/")[-1]
#df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
#print(ynew)
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
#print (output)
tn = np.linspace(0, 100, 101) # new time vector for the new time-normalized data
yn, tn, indie = tnorma(output, k=3, smooth =1, mask = None, show = False)
np.savetxt("path_name/foldername/file"+ foo, yn, delimiter = ', ', newline = "\n")
However, I am having difficulty in putting header names on the 3 columns per file.

Daily data of same month over years

I have data from the same month over period of time and I trying to plot the mean by day of the motnh but I don´t know how to do it.
This is how the dataframe looks like
The main code to get the dataframe:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
from datetime import date, timedelta
from datetime import datetime
inicio = date(1973, 1, 1)
#inicio = date(2019, 2, 15)
#final = date(2000, 10, 10)
final = date(1974, 3, 1)
delta = timedelta(days=1)
años=[]
links=[]
while inicio <= final:
fechas=inicio.strftime("%Y-%m-%d")
#años.append(datetime.strptime(fechas, '%Y-%m-%d').date())
años.append(fechas)
url='http://weather.uwyo.edu/cgi-bin/sounding?region=samer&TYPE=TEXT%3ALIST&YEAR={}&MONTH={}&FROM={}12&TO={}12&STNM=80222'.format(fechas[0:4],fechas[5:7],fechas[8:10],fechas[8:10])
links.append(url)
inicio += delta
d = dict(zip(años, links))
df1=pd.DataFrame(list(d.items()), columns=['Fecha', 'url'])
df1.set_index('Fecha', inplace=True)
Enero=pd.DataFrame()
Febrero=pd.DataFrame()
for i in df1.index:
if i[5:7]=='01':
Enero = Enero.append(df1.loc[i], ignore_index=False)
elif i[5:7]=='02':
Febrero = Febrero.append(df1.loc[i], ignore_index=False)
labels = ['PRES', 'HGHT', 'TEMP', 'DWPT', 'RELH', 'MIXR', 'DRCT', 'SKNT', 'THTA', 'THTE', 'THTV']
def reques(url):
try:
results = []
peticion=requests.get(url)
soup=bs(peticion.content, 'lxml')
pre = (soup.select_one('pre')).text
for line in pre.split('\n')[4:-1]:
#print (line)
if '--' not in line:
row = [line[i:i+7].strip() for i in range(0, len(line), 7)]
results.append(row)
else:
pass
df5=pd.DataFrame.from_records(results, columns=labels)
#return x
return df5
except AttributeError:
pass
SuperDF = pd.DataFrame()
SuperDF = pd.DataFrame(columns=labels)
startTime = datetime.now()
sin_datos=[]
for i in Febrero['url']:
try:
x=reques(i)
df2=x
y=str(df1[df1['url']==i].index.values)
df2.index = [y] * len(x)
SuperDF=SuperDF.append(x)
except TypeError:
sin_datos.append(df1[df1['url']==i].index.values)
print (df1[df1['url']==i].index.values)
SuperDF.index= SuperDF.index.map(lambda x: x.lstrip("['").rstrip("]''"))
SuperDF.index = pd.to_datetime(SuperDF.index)
SuperDF=SuperDF.apply(pd.to_numeric)
SuperDF
I've been trying to do it whit this
import seaborn as sns
SuperDF = SuperDF[(SuperDF['TEMP']==0)]
ax = SuperDF.loc['02', 'RELH'].plot(marker='o', linestyle='-')
ax.set_ylabel('RELH');
but I got this error
KeyError: '02'
It works when i pass the year but i need the mean by day for the month. Any help will be appreciate.
This is what I need

Use of datetime timedelta with numpy 3d array

I have a 3D array with the count of number of days past a benchmark date (e.g., 01.01.2000). I am interested in the actual day-of-year (DOY: 1-365/366)rather than the total number of days past a given date.
For a single value, the below syntax works. For e.g.,
import numpy as np
import datetime
data = 1595
date = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(data -1)
date.timetuple().tm_yday
134
However, I am having issues with using a 3D array.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
data
array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int_array - 1)
return date_.timetuple().tm_yday
doy_data = data * 0 #Empty array
for i in range(2):
doy_data[:, :, i] = Int_to_DOY(data[:, :, i])
Here is the error message and I am not able to figure this out.
TypeError: unsupported type for timedelta days component: numpy.ndarray
Thanks for your help.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int(int_array) -1)
return date_.timetuple().tm_yday
doy_data = data.flatten()
for i in range(len(doy_data)):
doy_data[i] = Int_to_DOY(doy_data[i])
doy_data = doy_data.reshape((2,2,2))
Since you tagged pandas:
data = np.array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
s = pd.to_datetime('2000-01-01') + pd.to_timedelta(data.ravel(), unit='D')
s.dayofyear.values.reshape(data.shape) - 1
Output:
array([[[135, 134],
[138, 138]],
[[135, 138],
[134, 134]]], dtype=int64)

Resources