Locate an id in Dataframe using constraint on columns percentile

Locate an id in Dataframe using constraint on columns percentile - python-3.x

I am trying to do a Weighted Aged Historical Var based on the below Dataframe. I would like to identify the ID in my dataframe corresponding to the 5% quantile of the 'Weight_Age_Cumul' column (like in the below example i found on internet)
enter image description here
I ve tryied the following line of code but i get the following error message : 'DataFrame' object has no attribute 'idmax'
cac_df_sorted[cac_df_sorted.Weight_Age_Cumul]<=0.05].CAC_Log_returns.idmax()
enter image description here
If you can help me on that it you be great, thank you
full code below if needed :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from tabulate import tabulate
from scipy.stats import norm
import yfinance as yf
from yahoofinancials import YahooFinancials
import sys
cac_df = yf.download('^FCHI',
start='2020-04-01',
end='2022-05-31',
progress=False,
)
cac_df.head()
cac_df = cac_df.drop(columns=['Open','High','Low','Close','Volume'])
#convertion into retuns
cac_df['Adj Close_-1'] = cac_df['Adj Close'].shift(1)
cac_df['CAC_Log_returns'] = np.log(cac_df['Adj Close']/cac_df['Adj Close_-1'])
cac_df.index = pd.to_datetime(cac_df.index, format = '%Y-%m-%d').strftime('%Y-%m-%d')
#plot CAC returns graph & histogram
cac_df['CAC_Log_returns'].plot(kind='line',figsize=(15,7))
plt.show()
cac_df['CAC_Log_returns'].hist(bins=40,normed=True,histtype='stepfilled',alpha=0.5)
plt.xlabel('Returns')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
#Historical Var Constant weight & Age Weighted & Vol Weighted
cac_df_sorted = cac_df.copy()
cac_df_sorted.sort_values(by=['Date'],inplace=True,ascending = False)
#Weight for Var Age weighted
lamb = 0.98
n = len(cac_df_sorted['CAC_Log_returns'])
weight_age= []
weight_age = [(lamb**(i-1) * (1-lamb))/(1-lamb**n)for i in range(1, n+1)]
#design of the dataframe
cac_df_sorted['Weight_Age'] = weight_age
cac_df_sorted.sort_values(by=['CAC_Log_returns'],inplace=True,ascending = True)
cac_df_sorted['Weight_Age_Cumul'] = np.cumsum(weight_age)
#Historical Var Constant weight
Var_95_1d_CW = -cac_df_sorted['CAC_Log_returns'].quantile(0.05)
Var_99_1d_CW = -cac_df_sorted['CAC_Log_returns'].quantile(0.01)
#from Var1d to Var10d
mean = np.mean(cac_df['CAC_Log_returns'])
Var_95_10d_CW =(np.sqrt(10)*Var_95_1d_CW)+(mean *(np.sqrt(10)-10))
Var_99_10d_CW = (np.sqrt(10)*Var_99_1d_CW) +(mean *(np.sqrt(10)-10))
print(tabulate([['95%',Var_95_1d_CW,Var_95_10d_CW],['99%',Var_99_1d_CW,Var_99_10d_CW]], headers= ['Confidence Level', 'Value at Risk 1 day Constant Weight','Value at Risk 10 days Constant Weight']))
print(cac_df_sorted)
# Historical Var Age weighted
#Find where cumulative (percentile) hits 0.05 and 0.01
cac_df_sorted[cac_df_sorted['Weight_Age_Cumul']<=0.05].CAC_Log_returns.idmax()

Related

Trying to plot a rolling corr line chart but Matplot keeps saying to bring in only valid columns?

Im trying to create a rolling corr using matplot but I get the error "select only valid columns before calling the operation. Dropped columns were Index(['time'], dtype='object')
I have dropped that field from my data frame but the error keeps on appearing ?
Is it something to do with my .iloc argument?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import seaborn as sns
import scipy.stats as stats
import json
from datetime import timezone
from datetime import datetime
from pycoingecko import CoinGeckoAPI
pd.options.display.width = 0
def datetime_to_unix(year, month, day):
'''datetime_to_unix(2021, 6, 1) => 1622505600.0'''
dt = datetime(year, month, day)
timestamp = (dt - datetime(1970, 1, 1)).total_seconds()
return timestamp
def unix_to_datetime(unix_time):
'''unix_to_datetime(1622505700)=> ''2021-06-01 12:01am'''''
ts = int(unix_time/1000 if len(str(unix_time)) > 10 else unix_time) # /1000 handles milliseconds
return datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %l:%M%p').lower()
# Initialize the client
cg = CoinGeckoAPI()
# Retrieve looksrare data in USD
result = cg.get_coin_market_chart_range_by_id(
id='looksrare',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df = pd.DataFrame({'time':time, 'price':price,})
df.head(100)
# Retrieve ETH data in USD
result = cg.get_coin_market_chart_range_by_id(
id='ethereum',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df2 = pd.DataFrame({'time':time, 'price':price,})
df2.head(100)
df_cd = pd.merge(df, df2, how='inner', on='time')
df_cd = df_cd.drop('time', 1)
output = df_cd.corr()
output1 = df_cd['price_x'].corr(df_cd['price_y'])
overall_pearson_r = df_cd.corr().iloc[0,1]
print(df_cd)
print(f"Pandas computed Pearson r: {overall_pearson_r}")
r, p = stats.pearsonr(df_cd.dropna()['price_x'], df_cd.dropna()['price_y'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# compute rolling window synchrony
f,ax=plt.subplots(figsize=(7,3))
df.rolling(window=30,center=True).median().plot(ax=ax)
ax.set(xlabel='Time',ylabel='Pearson r')
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");

Changing the values of a dict in lowercase ( values are code colors ) to be accepted as a color parametrer in plotly.graph.object

So, I'm trying to get the colors from the dictionary 'Disaster_type' to draw the markers in geoscatters depending of the type of disaster.
Basically, I want to reprensent in the graphic the natural diasasters with it's color code. eg; it's is a volcanic activity paint it 'orange'. I want to change the size of the marker as well depending of the magnitude of the disaster, but that's for another day.
here's the link of the dataset: https://www.kaggle.com/datasets/brsdincer/all-natural-disasters-19002021-eosdis
import plotly.graph_objects as go
import pandas as pd
import plotly as plt
df = pd.read_csv('1900_2021_DISASTERS - main.csv')
df.head()
df.tail()
disaster_set = {disaster for disaster in df['Disaster Type']}
disaster_type = {'Storm':'aliceblue',
'Volcanic activity':'orange',
'Flood':'royalblue',
'Mass movement (dry)':'darkorange',
'Landslide':'#C76114',
'Extreme temperature':'#FF0000',
'Animal accident':'gray55',
'Glacial lake outburst':'#7D9EC0',
'Earthquake':'#CD8C95',
'Insect infestation':'#EEE8AA',
'Wildfire':' #FFFF00',
'Fog':'#00E5EE',
'Drought':'#FFEFD5',
'Epidemic':'#00CD66 ',
'Impact':'#FF6347'}
# disaster_type_lower = {(k, v.lower()) for k, v in disaster_type.items()}
# print(disaster_type_lower)
# for values in disaster_type.values():
# disaster_type[values] = disaster_type.lowercase()
fig = go.Figure(data=go.Scattergeo(
lon = df['Longitude'],
lat = df['Latitude'],
text = df['Country'],
mode = 'markers',
marker_color = disaster_type_.values()
)
)
fig.show()
I cant figure how, I've left in comments after the dict how I tried to do that.
It changes them to lowercase, but know I dont know hot to get them...My brain is completly melted

it's a simple case of pandas map
found data that appears same as yours on kaggle so have used that
one type is unmapped Extreme temperature so used a fillna("red") to remove any errors
gray55 gave me an error so replaced it with RGB equivalent
import kaggle.cli
import sys
import pandas as pd
from zipfile import ZipFile
import urllib
import plotly.graph_objects as go
# fmt: off
# download data set
url = "https://www.kaggle.com/brsdincer/all-natural-disasters-19002021-eosdis"
sys.argv = [sys.argv[0]] + f"datasets download {urllib.parse.urlparse(url).path[1:]}".split(" ")
kaggle.cli.main()
zfile = ZipFile(f'{urllib.parse.urlparse(url).path.split("/")[-1]}.zip')
dfs = {f.filename: pd.read_csv(zfile.open(f)) for f in zfile.infolist()}
# fmt: on
df = dfs["DISASTERS/1970-2021_DISASTERS.xlsx - emdat data.csv"]
disaster_type = {
"Storm": "aliceblue",
"Volcanic activity": "orange",
"Flood": "royalblue",
"Mass movement (dry)": "darkorange",
"Landslide": "#C76114",
"Extreme temperature": "#FF0000",
"Animal accident": "#8c8c8c", # gray55
"Glacial lake outburst": "#7D9EC0",
"Earthquake": "#CD8C95",
"Insect infestation": "#EEE8AA",
"Wildfire": " #FFFF00",
"Fog": "#00E5EE",
"Drought": "#FFEFD5",
"Epidemic": "#00CD66 ",
"Impact": "#FF6347",
}
fig = go.Figure(
data=go.Scattergeo(
lon=df["Longitude"],
lat=df["Latitude"],
text=df["Country"],
mode="markers",
marker_color=df["Disaster Type"].map(disaster_type).fillna("red"),
)
)
fig.show()

Apply function on a Pandas Dataframe

Apply function on a Pandas Dataframe
I have a code (C01) that calculates the moving averages (21 periods) of a given stock (individual) on the stock exchange (IBOV - B3-BRAZIL). Then I created a for loop where it determines that an asset is in an upward trend after 6 highs followed by moving averages (hypothesis, considering that there are more variables to determine this).
However, I want to do this loop for more than one asset, in this case C02, that is, it applies a function in each column of my code and returns only the name of the assets that are in an upward trend (in this case, the column name). I tried to turn the for loop into a function and apply that function using the pandas 'apply' to each column (axis = 1, I tried tbm axis = 'columns'). But I'm having an error creating the function. When I execute the function using apply, the message "ValueError: Lengths must match to compare" appears. How can I fix this?
Grateful for the attention.
import numpy as np
import pandas as pd
from pandas_datareader import data as wb
from mpl_finance import candlestick_ohlc
from pandas_datareader import data as wb
from datetime import datetime
import matplotlib.dates as mpl_dates
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#STOCK
ativo = 'WEGE3.SA'
acao2 = ativo.upper()
#START AND END ANALYSIS
inicio = '2020-1-1'
fim = '2021-1-27'
#MAKE DATAFRAME
df00 = wb.DataReader(acao2, data_source='yahoo', start=inicio, end=fim)
df00.index.names = ['Data']
df= df00.copy(deep=True)
df['Data'] = df.index.map(mdates.date2num)
# MOVING AVERAGE
df['ema21'] = df['Close'].ewm(span=21, adjust=False).mean()
df['ema72'] = df['Close'].ewm(span=72, adjust=False).mean()
#DF PLOT
df1=df
df2=df[-120:]
#TREND RULE
alta=1
for i in range(6):
if(df2.ema21[-i-1] < df2.ema21[-i-2]):
alta=0
baixa=1
for i in range(6):
if(df2.ema21[-i-1] > df2.ema21[-i-2]):
baixa=0
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#PLOT RESULTS
print("---------------------------------------")
print(a1)
print("---------------------------------------")
ohlc = df[['Data', 'Open', 'High', 'Low', 'Close']]
f1, ax = plt.subplots(figsize=(14, 8))
# plot the candlesticks
candlestick_ohlc(ax, ohlc.values, width=.6, colorup='green', colordown='red')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
label_ = acao2.upper() + ' EMA26'
label_2 = acao2.upper() + ' EMA09'
ax.plot(df.index, df1['ema21'], color='black', label=label_)
ax.plot(df.index, df1['ema72'], color='blue', label=label_)
ax.grid(False)
ax.legend()
ax.grid(True)
plt.title(acao2.upper() + ' : Gráfico Diário')
plt.show(block=True)
#C02
#START/END ANALISYS
inicio = '2020-1-1'
fim = '2021-1-27'
#STOCKS
ativos = ['SAPR11.SA','WEGE3.SA']
#DATAFRAME
mydata = pd.DataFrame()
for t in ativos:
mydata[t] = wb.DataReader(t, data_source='yahoo', start=inicio, end=fim)['Close']
df2 = mydata
#MOVING AVERAGE
df3 = df2.apply(lambda x: x.rolling(window=21).mean())
#MAKE FUNCTION
def trend(x):
tendencia_alta=1
for i in range(6):
if(df3.columns[-i-1:] > df3.columns[-i-2:]):
tendencia_alta=0
print()
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#TRYING TO APPLY THE FUNCTION IN EVERY DF3 COLUMN
df3.apply(trend, axis=1)´´´

something like:
def myfunc(x):
#do things here where x is the group of rows sent to function
#instead of df['column'], you'll use x['column']
#because you are passing the rows into x
return x
df.groupby('yourcolumn').apply(myfunc)

Use of datetime timedelta with numpy 3d array

I have a 3D array with the count of number of days past a benchmark date (e.g., 01.01.2000). I am interested in the actual day-of-year (DOY: 1-365/366)rather than the total number of days past a given date.
For a single value, the below syntax works. For e.g.,
import numpy as np
import datetime
data = 1595
date = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(data -1)
date.timetuple().tm_yday
134
However, I am having issues with using a 3D array.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
data
array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int_array - 1)
return date_.timetuple().tm_yday
doy_data = data * 0 #Empty array
for i in range(2):
doy_data[:, :, i] = Int_to_DOY(data[:, :, i])
Here is the error message and I am not able to figure this out.
TypeError: unsupported type for timedelta days component: numpy.ndarray
Thanks for your help.

import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int(int_array) -1)
return date_.timetuple().tm_yday
doy_data = data.flatten()
for i in range(len(doy_data)):
doy_data[i] = Int_to_DOY(doy_data[i])
doy_data = doy_data.reshape((2,2,2))

Since you tagged pandas:
data = np.array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
s = pd.to_datetime('2000-01-01') + pd.to_timedelta(data.ravel(), unit='D')
s.dayofyear.values.reshape(data.shape) - 1
Output:
array([[[135, 134],
[138, 138]],
[[135, 138],
[134, 134]]], dtype=int64)

ValueError: year is out of range using matplotlib.pyplot

After I call candlestick_ohlc, I can't seem to convert the x axis dates to something matplotlib can understand.
I'm a noob Python programmer. I've tried turning the dataframe into a list, I've tried passing dates to candlestick_ohlc, nothing seems to work other than changing
df['time'] = (df['time'].astype('float'))
into
df['time'] = (df['time'].astype('float')\1000)
Although that renders the wrong datetime.
import requests
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
import matplotlib.style as style
import matplotlib.ticker as mticker
from matplotlib.dates import date2num
from mpl_finance import candlestick_ohlc
import datetime as dt
import numpy as np
import matplotlib.ticker as mticker
def get_data(date):
""" Query the API for 2000 days historical price data starting from "date". """
url = "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=2000&toTs={}".format(date)
r = requests.get(url)
ipdata = r.json()
return ipdata
def get_df(from_date, to_date):
""" Get historical price data between two dates. """
date = to_date
holder = []
# While the earliest date returned is later than the earliest date requested, keep on querying the API
# and adding the results to a list.
while date > from_date:
data = get_data(date)
holder.append(pd.DataFrame(data['Data']))
date = data['TimeFrom']
# Join together all of the API queries in the list.
df = pd.concat(holder, axis = 0)
# Remove data points from before from_date
df = df[df['time']>from_date]
# Convert to timestamp to readable date format
# df['time'] = pd.to_datetime(df['time'], unit='s')
# Make the DataFrame index the time
df.set_index('time', inplace=True)
# And sort it so its in time order
df.sort_index(ascending=False, inplace=True)
return df
df = get_df(1528502400, 1560112385)
style.use('dark_background')
fig = plt.figure()
ax1 = plt.subplot2grid((1,1), (0,0))
df = df.reset_index()
cols = ['time', 'open', 'high', 'low', 'close', 'volumefrom', 'volumeto']
df = df[cols]
#IF YOU /1000 AFER ('float') IT WILL RUN BUT NOT CORRECT DATE
df['time'] = (df['time'].astype('float'))
print(df.dtypes)
ohlc = df.values.tolist()
candlestick_ohlc(ax1, ohlc, width=.4, colorup='g', colordown='r')
# IF YOU COMMENT NEXT 4 LINES IT WILL RUN, but NO DATES for XAXIS
date_fmt = "%d-%m-%Y"
date_formatter = mdate.DateFormatter(date_fmt)
ax1.xaxis.set_major_formatter(date_formatter)
fig.autofmt_xdate()
ax1.set_ylabel('BTC Price (USD)')
ax1.set_xlabel('Date')
plt.show()
Expected result would be date labels plotted as d-m-y. :)
Wish this had dates for xaxis labels not seconds since 1970
This is what I want it to look like, but with accurate dates

This is how to fix the code:
df['time'] = df['time'].apply(mdates.epoch2num)
It was definitely one of those lines of code that you spend hours on... now I know.

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Locate an id in Dataframe using constraint on columns percentile - python-3.x

Related

Trying to plot a rolling corr line chart but Matplot keeps saying to bring in only valid columns?

Changing the values of a dict in lowercase ( values are code colors ) to be accepted as a color parametrer in plotly.graph.object

Apply function on a Pandas Dataframe

Use of datetime timedelta with numpy 3d array

ValueError: year is out of range using matplotlib.pyplot

Categories

Resources