Python 3 - matplotlib not recognizing timezones when plotting hours on x axis - python-3.x

I am trying to plot scheduled hours of work vs. actual hours of work, but the hours on the x axis are not recognizing the timezone offset.
Here is an example of the code I am using:
### Import stack
import pandas as pd
import datetime
from datetime import datetime as dt
from datetime import date, timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
### build dummy shifts df
shifts = [{'name': 'Bob', 'start': '2018-10-13 20:00:00+13:00', 'finish': '2018-10-14 03:00:00+13:00', 'type': 'scheduled'},
{'name': 'Bob', 'start': '2018-10-13 20:30:00+13:00', 'finish': '2018-10-14 03:02:00+13:00', 'type': 'actual'},
{'name': 'Joe', 'start': '2018-10-13 22:00:00+13:00', 'finish': '2018-10-14 03:00:00+13:00', 'type': 'scheduled'},
{'name': 'Joe', 'start': '2018-10-13 22:00:00+13:00', 'finish': '2018-10-14 02:06:00+13:00', 'type': 'actual'},
{'name': 'Sally', 'start': '2018-10-13 18:30:00+13:00', 'finish': '2018-10-14 03:00:00+13:00', 'type': 'scheduled'},
{'name': 'Sally', 'start': '2018-10-13 18:30:00+13:00', 'finish': '2018-10-14 02:05:00+13:00', 'type': 'actual'}]
df = pd.DataFrame(shifts)
df['start'] = pd.to_datetime(df['start'].apply(pd.Timestamp))
df['finish'] = pd.to_datetime(df['finish'].apply(pd.Timestamp))
### Plot of scheduled vs. actual hours
hours = mdates.HourLocator() # every hour
minutes = mdates.MinuteLocator(interval= 30) # every 30 mins
hoursFmt = mdates.DateFormatter('%I %p')
xStart = (df[['start']].min() - timedelta(hours = 1)).astype(datetime.datetime)
xEnd = (df[['finish']].max() + timedelta(hours = 1)).astype(datetime.datetime)
#scheduled time period
scheduledStart = mdates.date2num(df['start'][(df['type'] == 'scheduled')].dt.to_pydatetime())
scheduledEnd = mdates.date2num(df['finish'][(df['type'] == 'scheduled')].dt.to_pydatetime())
scheduledWidth = scheduledEnd - scheduledStart
#actual time period
actualStart = mdates.date2num(df['start'][(df['type'] == 'actual')].dt.to_pydatetime())
actualEnd = mdates.date2num(df['finish'][(df['type'] == 'actual')].dt.to_pydatetime())
actualWidth = actualEnd - actualStart
#y axis values
yval = df['name'].unique()
actualYTicks = [index for index, value in enumerate(actualStart)]
scheduledYTicks = [x+0.3 for x in actualYTicks]
yTicks = [sum(x)/2 for x in zip(actualYTicks, scheduledYTicks)]
#generate plot
fig = plt.figure(1)
ax = fig.add_subplot(111)
ax.barh(scheduledYTicks, width = scheduledWidth, left = scheduledStart, color = 'lightgrey', height = 0.3, label = 'Scheduled Hours')
ax.barh(actualYTicks, width = actualWidth, left = actualStart, color = 'green', height = 0.3, label = 'Actual Hours')
#format x axis to time of day
ax.xaxis.set_major_locator(hours)
ax.xaxis.set_major_formatter(hoursFmt)
ax.xaxis.set_minor_locator(minutes)
# autorotate the dates
fig.autofmt_xdate()
plt.yticks(yTicks, yval)
ax.set_xlim([mdates.date2num(xStart), mdates.date2num(xEnd)])
#extract legend labels
handles, labels = ax.get_legend_handles_labels()
lgd = ax.legend(handles, labels, loc= 'upper center', bbox_to_anchor=(0.5,-0.1))
plt.show()
The graph is formatted how I want it to look as seen in the image but the hours on the x axis are not correct. Sally is actually scheduled to start at 7:30 am local time and end at 4:00pm local time on October 14th. The start and end dates are 'Pacific/Auckland' timezone aware so why is this not being capture on the x axis using matplotlib dates date2num?

Related

weighted average acquisition cost pandas based on buy and sell

Thanks, everyone for your time
I have a dataframe like below,
import pandas as pd
import numpy as np
raw_data = {'Date': ['04-23-2020', '05-05-2020', '05-05-2020', '05-11-2020', '05-11-2020',
'05-12-2020', '05-12-2020', '05-27-2020', '06-03-2020'],
'Type': ['Buy', 'Buy', 'Sell', 'Buy', 'Sell', 'Buy', 'Buy',
'Buy', 'Sell'],
'Ticker': ['AAA', 'AAA', 'AAA', 'AAA', 'AAA',
'BBB', 'CCC', 'CCC', 'CCC'],
'Quantity': [60000, 12000, -30000, 49000, -30000, 2000, 10000, 28500, -10000],
'Price': [60.78, 82.20, 0, 100.00, 0, 545.00, 141.00, 172.00,
0]
}
df = pd.DataFrame (raw_data, columns = ['Date','Type','Ticker','Quantity','Price']).sort_values(['Date','Ticker']).reset_index(drop = True)
My objective is to calculate the weighted average price whenever there is a sell transaction. please see my below expected outcome. I have tried a for loop for the same but I was unable to get the results.
mycode current code
df['Pur_Value'] = df['Quantity'] * df['Price']
df['TotalQty'] = df.groupby('Ticker')['Quantity'].cumsum()
grpl = df.groupby(by = ['Ticker'])
df1 = pd.DataFrame()
finaldf = pd.DataFrame()
for key in grpl.groups.keys():
df1 = grpl.get_group(key).loc[:,['Type','Ticker','Date','Quantity','Price','Pur_Value']]
df1.sort_values(by=['Ticker','Date'],inplace=True)
df1.reset_index(drop=True,inplace=True)
Cum_Value = 0
Cum_Shares = 0
for Index,Tic in df1.iterrows():
if Tic['Type'] == "Buy":
Cum_Value += Tic['Pur_Value']
Cum_Shares += Tic['Quantity']
else:
df1['sold_price'] = Cum_Value/Cum_Shares
finaldf = finaldf.append(df1)
Expected results is columns which has the weighted avg price for sold shares like below.
I was able to solve my own issue with the below code. thanks
for key in grpl.groups.keys():
df1 = grpl.get_group(key).loc[
:, ["Type", "Ticker", "Date", "Quantity", "Price", "Pur_Value"]
]
df1.sort_values(by=["Ticker", "Date"], inplace=True)
df1.reset_index(drop=True, inplace=True)
Cum_Value = 0
Cum_Shares = 0
Cum_price = 0
sold_value = 0
for Index, Tic in df1.iterrows():
if Tic["Type"] == "Buy":
Cum_Value += Tic["Pur_Value"]
Cum_Shares += Tic["Quantity"]
Cum_price = Cum_Value / Cum_Shares
else:
sold_value = Cum_price * Tic["Quantity"]
Cum_Shares += Tic["Quantity"]
Cum_Value += sold_value
Cum_price = Cum_Value / Cum_Shares
Ticker.append(Tic["Ticker"])
Dates.append(Tic["Date"])
sold_price.append(Cum_price)

Continuous plotting with seaborn with varying markers based on variable z

I am trying to create a seaborn lineplot with sns.relplot using markers to distinguish between state changes over time, along with some arbitrary value on the y-axis.
Using the dataset below:
df = pd.DataFrame({
'NAME':['29078719','29078719','29078719','29078719','29078719','29078719','29078719'],
'DIAGNOSIS':['negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative'],
'ENTRY_DATE': ['2014-01-23 15:13:54', '2015-03-06 15:57:16', '2016-02-26 14:40:53', '2016-02-26 14:40:53', '2017-11-24 15:20:38', '2020-01-29 13:41:24', '2020-03-30 12:11:24'],
'CALCULATED_VALUE': [0.456957, 0.468468, 0.865333, 0.896950, 0.920930, 0.767100, 0.835690]
})
df['ENTRY_DATE'] = pd.to_datetime(df.ENTRY_DATE, infer_datetime_format=True)
NAME
DIAGNOSIS
ENTRY_DATE
CALCULATED_VALUE
0
29078719
negative
2014-01-23 15:13:54
0.456957
1
29078719
negative
2015-03-06 15:57:16
0.468468
2
29078719
positive
2016-02-26 14:40:53
0.865333
3
29078719
positive
2016-02-26 14:40:53
0.89695
4
29078719
positive
2017-11-24 15:20:38
0.92093
5
29078719
positive
2020-01-29 13:41:24
0.7671
6
29078719
negative
2020-03-30 12:11:24
0.83569
sns.relplot(
data=df,
x='ENTRY_DATE',
y='CALCULATED_VALUE',
kind='line',
height=8.27,
aspect=11.7/8.27,
linewidth=2.5,
markers=True,
style='DIAGNOSIS'
)
plt.show()
Would like to achieve:
Note: The data changed in question, and the graph matched output prior to data in the question and images changing. That's why the output in my question does not match the new desired output, but the same code should work with the different data.
relplot is a figure-level plot that isn't great for dynamically plotting in a nuanced way like this. As such, I don't know if you could do such a customized graph with seaborn, but you could use matplotlib and plot three separate lines depending on what grp each line falls in. This is pretty customized, but you could adjust the logic to create the groups, and I have used a combination of the grp number as well as using using shift to connect the lines and try and match your expected output:
df = pd.DataFrame({
'NAME':['29078719','29078719','29078719','29078719','29078719','29078719','29078719'],
'DIAGNOSIS':['negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative'],
'ENTRY_DATE': ['2014-01-23 15:13:54', '2015-03-06 15:57:16', '2016-02-26 14:40:53', '2016-02-26 14:40:53', '2017-11-24 15:20:38', '2017-11-24 15:20:38', '2020-01-29 13:41:24'],
'CALCULATED_VALUE': [0.456957, 0.468468, 0.865333, 0.896950, 0.920930, 0.833549, 0.767100]
})
df['ENTRY_DATE'] = pd.to_datetime(df.ENTRY_DATE, infer_datetime_format=True)
df = df.sort_values('ENTRY_DATE')
grp = (df['DIAGNOSIS'] != df['DIAGNOSIS'].shift()).cumsum()
plt.figure(dpi=125)
df1 = df[(grp == 1).shift(2).fillna(True)]
df2 = df[(grp == 2).shift().fillna(False) & (grp == 2)]
df3 = df[(grp == 3).shift(-1).fillna(True)]
plt.plot(df['ENTRY_DATE'], df['CALCULATED_VALUE'], visible=False)
plt.plot(df1['ENTRY_DATE'], df1['CALCULATED_VALUE'], color='red',
label=df['DIAGNOSIS'].iloc[0])
plt.plot(df2['ENTRY_DATE'], df2['CALCULATED_VALUE'], marker='x',
linestyle='dashed', markersize=8, color='red', label=df['DIAGNOSIS'].iloc[2])
plt.plot(df3['ENTRY_DATE'],df3['CALCULATED_VALUE'], color='red')
plt.xlabel('ENTRY_DATE')
plt.ylabel('DIAGNOSIS')
plt.legend()
plt.grid()
plt.show()
Another variation of with the markers. One issue is that the x markers and o markers will overlap, so you can use the same marker style but a different line style. Otherwisee, you might have to create more lines separting the points to get x and o markers as you've shown:
df = pd.DataFrame({
'NAME':['29078719','29078719','29078719','29078719','29078719','29078719','29078719'],
'DIAGNOSIS':['negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative'],
'ENTRY_DATE': ['2014-01-23 15:13:54', '2015-03-06 15:57:16', '2016-02-26 14:40:53', '2016-02-26 14:40:53', '2017-11-24 15:20:38', '2017-11-24 15:20:38', '2020-01-29 13:41:24'],
'CALCULATED_VALUE': [0.456957, 0.468468, 0.865333, 0.896950, 0.920930, 0.833549, 0.767100]
})
df['ENTRY_DATE'] = pd.to_datetime(df.ENTRY_DATE, infer_datetime_format=True)
df = df.sort_values('ENTRY_DATE')
grp = (df['DIAGNOSIS'] != df['DIAGNOSIS'].shift()).cumsum()
plt.figure(dpi=125)
df1 = df[(grp == 1).shift(2).fillna(True)]
df2 = df[(grp == 2).shift().fillna(False) & (grp == 2)]
df3 = df[(grp == 3).shift(-1).fillna(True)]
plt.plot(df['ENTRY_DATE'], df['CALCULATED_VALUE'], visible=False)
plt.plot(df1['ENTRY_DATE'], df1['CALCULATED_VALUE'], color='red', marker='o',
label=df['DIAGNOSIS'].iloc[0])
plt.plot(df2['ENTRY_DATE'], df2['CALCULATED_VALUE'], marker='o',
linestyle='dashed', color='red', label=df['DIAGNOSIS'].iloc[2])
plt.plot(df3['ENTRY_DATE'],df3['CALCULATED_VALUE'], color='red', marker='o')
plt.xlabel('ENTRY_DATE')
plt.ylabel('DIAGNOSIS')
plt.legend()
plt.grid()
plt.show()

Python Dash Data Table should display only selected columns

I am trying to display only selected columns from my dataframe using datatable . i am able select how many rows i want . looking for a similar option like rows i want to select to display certain columns alone at the time of executing the code.
My dataframe has close to 25 columns . i dont want all of them to be displayed hence looking for this solution
here is my code :
import dash
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
import dash_table as dt
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
df = pd.read_csv('E:\pylab\dshlab\infratickets.csv', low_memory = False )
app = dash.Dash(__name__)
#style={'visibility': 'hidden'}
dpdown = []
for i in df['ASSIGNED_GROUP'].unique() :
str(dpdown.append({'label':i,'value':(i)}))
app.layout = html.Div([
html.P([
html.Label("Choose a feature"),
html.Div(dcc.Dropdown(id='dropdown', options=dpdown),
style = {'width': '100px',
'fontSize' : '10px',
'padding-left' : '100px',
'display': 'inline-block'})]),
#style={'visibility': 'hidden'},
html.Div(id='table-container', className='tableDiv'),
dcc.Graph(id = 'plot',style={'height' : '25%', 'width' : '25%'})
])
#dcc.Dropdown(id='dropdown', style={'height': '30px', 'width': '100px'}, options=dpdown),
#dcc.Graph(id='graph'),
#html.Div(html.H3('country graph'),id='table-container1',className='tableDiv1')
#app.callback(
dash.dependencies.Output('table-container','children'),
[dash.dependencies.Input('dropdown', 'value')])
def display_table(dpdown):
df_temp = df[df['ASSIGNED_GROUP']==dpdown]
return html.Div([
dt.DataTable(
id='main-table',
columns=[{'name': i, 'id': i} for i in df_temp.columns],
data=df_temp[0:5].to_dict('rows'),
style_table={
'maxHeight': '20%',
#'overflowY': 'scroll',
'width': '30%',
'minWidth': '10%',
},
style_header={'backgroundColor': 'rgb(30, 30, 30)'},
style_cell={'backgroundColor': 'rgb(50, 50, 50)','color': 'white','height': 'auto','width': 'auto'},#minWidth': '0px', 'maxWidth': '180px', 'whiteSpace': 'normal'},
#style_cell={'minWidth': '120px', 'width': '150px', 'maxWidth': '180px'},
style_data={'whiteSpace': 'auto','height': 'auto','width': 'auto'}
)
])
if __name__ == '__main__':
app.run_server(debug=True)
Able to figure out the solution
changed the code
columns=[{'name': i, 'id': i} for i in df_temp.columns]
to
columns=[{'name': i, 'id': i} for i in df.loc[:,['Colname1','Colname2',...]
fixed it
You could also use by index:
df = pd.read_csv('E:\pylab\dshlab\infratickets.csv', low_memory = False ) # load in the dataframe, then ressign with just the columns you want
df = df.iloc[:,1:3] # Remember that Python does not slice inclusive of the ending index.
Would give all rows and columns 1 to 2 of the data frame.
You can change the
columns=[{'name': i, 'id': i} for i in df_temp.columns],
as below:
First define TABLE_SELECTED_COLUMNS = ['col1','col2'. ...]
and
columns=[{"name": i, "id": i} for i in TABLE_SELECTED_COLUMNS],

How can I annotate a Grouped Broken Barh Chart Python Matplotlib

I have searched to exhaustion trying to annotate my grouped broken barH chart. I would like to have the "Event" from my dataframe annotated in each broken bar section. The examples I have found online manually enter the events x,y positions, AND, are not grouped broken bar examples.
the end goal is to have these events display on-hover, but I believe I wont have an issue if I can just get the events to display.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import matplotlib.ticker as ticker
import io
pd.plotting.register_matplotlib_converters()
inp = u""" T29,11/4/2019 0:00,11/4/2019 0:00,off,none
T29,11/4/2019 0:00,11/5/2019 0:00,off,eventa
T29,11/5/2019 0:00,11/6/2019 0:00,on,none
T35,11/4/2019 0:00,11/5/2019 0:00,off,eventb
T35,11/5/2019 0:00,11/6/2019 0:00,paused,eventa
T43,11/4/2019 0:00,11/4/2019 4:01,on,none
T43,11/4/2019 4:01,11/4/2019 12:06,off,none
T43,11/4/2019 12:06,11/5/2019 8:07,on,eventc
T43,11/5/2019 8:07,11/5/2019 10:12,paused,eventd
T43,11/5/2019 10:12,11/5/2019 16:15,on,none
T43,11/5/2019 18:12,11/5/2019 20:15,off,none
"""
mydateparser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y %H:%M")
df = pd.read_csv(io.StringIO(inp), header=0, encoding = "ISO-8859-1", parse_dates=['StartTime', 'FinishTime'], date_parser=mydateparser, names=["Name", "StartTime", "FinishTime", "Status", "Event"])
color = {"on": "g", "paused": "yellow", "off": "black"}
df["Diff"] = df.FinishTime - df.StartTime
minDate = (datetime.datetime.toordinal(min(df.StartTime)))
maxDate = (datetime.datetime.toordinal(max(df.FinishTime)))
days = mdates.DayLocator()
Mcount = 0
fig, ax = plt.subplots(figsize=(6, 3), edgecolor="black", linewidth=1)
labels = []
for i, task in enumerate(df.groupby("Name")):
Mcount += 1
labels.append(task[0])
for r in task[1].groupby("Status"):
data = r[1][["StartTime", "Diff"]]
ax.broken_barh(data.values, (i - 0.4, 0.8), edgecolor="black", alpha=1, linewidth=1,
color=color[r[0]])
ax.set_ylim(bottom=-0.8, top=Mcount)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
ax.set_ylabel("Names", rotation=90, fontdict={'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 14})
ax.set_xlim(left=minDate, right=maxDate)
ax.set_xlabel("Date", fontdict={'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 14})
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d-%Y'))
ax.tick_params(which='major', axis='x', rotation=0, length=11, color='black')
ax.xaxis.set_major_locator(days)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
ax.tick_params(which='minor', rotation=0, labelsize=8, length=4, color='red', size=2)
ax.xaxis.set_minor_locator(ticker.MultipleLocator(.50))
plt.show()
Hello and welcome to StackOverflow. IIUC, you can append a for loop to your enumerate statement to add text to the axes.
for i, task in enumerate(df.groupby("Name")):
Mcount += 1
labels.append(task[0])
for r in task[1].groupby("Status"):
data = r[1][["StartTime", "Diff"]]
ax.broken_barh(data.values,
(i - 0.4, 0.8),
edgecolor="black",
alpha=1,
linewidth=1,
color=color[r[0]]
)
for x1, x2 in data.values:
ax.text(x=x1 + x2/2,
y=i,
s=r[1]["Event"].values[0],
ha='center',
va='center',
color='white',
)
Modified from the docs.
Output:
You can, of course, modify the text formatting.
The text requires an x location, a y location, and a string. The hacky indexing was the quickest way I could pull the event info out of your dataframe.

How do I iterate through combinations of two lists and perform a function each time?

Doing an Alphavantage API pull for historic stock data. I'm pulling one of their indicators. Instead of writing 36 separate functions and manually pulling, I'd like to iterate through the 36 possible combinations and do the pull each time with different variables (the variables being each of the combinations). Below is my code. It currently returns "NONE". What am I doing wrong?
Also, is there a way to combine these two functions into one?
Thanks!
def get_ppo_series(matype, series_type):
pull_parameters = {
'function': 'PPO',
'symbol': stock,
'interval': interval,
'series_type': series_type,
'fastperiod': 12,
'slowperiod': 26,
'matype': matype,
'datatype': 'json',
'apikey': key
}
column = 0
pull = rq.get(url, params=pull_parameters)
data = pull.json()
df = pd.DataFrame.from_dict(data['Technical Analysis: PPO'], orient='index', dtype=float)
df.reset_index(level=0, inplace=True)
df.columns = ['Date', 'PPO Series ' + str(column)]
df.insert(0, 'Stock', stock)
column += 1
return df.tail(past_years * annual_trading_days)
def run_ppo_series():
matype = list(range(8))
series_type = ['open', 'high', 'low', 'close']
combinations = product(matype, series_type)
for matype, series_type in combinations:
get_ppo_series(matype, series_type)
print(run_ppo_series())
I also tried the following. This version at least ran one iteration and returned data. But it stops there ???
def get_ppo_series():
column = 0
matype = list(range(8))
series_type = ['open', 'high', 'low', 'close']
combinations = product(matype, series_type)
for matype, series_type in combinations:
pull_parameters = {
'function': 'PPO',
'symbol': stock,
'interval': interval,
'series_type': series_type,
'fastperiod': 12,
'slowperiod': 26,
'matype': matype,
'datatype': 'json',
'apikey': key
}
pull = rq.get(url, params=pull_parameters)
data = pull.json()
df = pd.DataFrame.from_dict(data['Technical Analysis: PPO'], orient='index', dtype=float)
df.reset_index(level=0, inplace=True)
df.columns = ['Date', 'PPO Series ' + str(column)]
df.insert(0, 'Stock', stock)
column += 1
return df.tail(past_years * annual_trading_days)
print(get_ppo_series())
import requests as rq
import itertools
url = 'https://www.alphavantage.co/query?'
key = 'get your own key'
def get_ppo_series(matype, series_type):
pull_parameters = {
'function': 'PPO',
'symbol': 'msft',
'interval': '60min',
'series_type': series_type,
'fastperiod': 12,
'slowperiod': 26,
'matype': matype,
'datatype': 'json',
'apikey': key
}
column = 0
pull = rq.get(url, params=pull_parameters)
data = pull.json()
print('*' * 50)
print(f'MAType: {matype}, Series: {series_type}')
print(data)
def run_ppo_series():
matype = list(range(8))
series_type = ['open', 'high', 'low', 'close']
combinations = itertools.product(matype, series_type)
for matype, series_type in combinations:
get_ppo_series(matype, series_type)
run_ppo_series()
The code above works without issue once symbol and interval values are supplied.
Thank you for using Alpha Vantage! Our standard API call frequency is 5 calls per minute and 500 calls per day
I didn't bother with the DataFrame portion of get_ppo_series because it's not relevant for receiving the data
I would leave the functions separate, it looks cleaner and I think it's standard for a function to do 1 thing.
A counter can be added to the code and time.sleep(60) after every 5 iterations unless you have a different API call frequency
Function with 60 second wait after every 5 api calls
import time
def run_ppo_series():
matype = list(range(8))
series_type = ['open', 'high', 'low', 'close']
combinations = itertools.product(matype, series_type)
count = 0
for matype, series_type in combinations:
if (count%5 == 0) & (count != 0):
time.sleep(60)
get_ppo_series(matype, series_type)
count+=1

Resources