How can I annotate a Grouped Broken Barh Chart Python Matplotlib - python-3.x

I have searched to exhaustion trying to annotate my grouped broken barH chart. I would like to have the "Event" from my dataframe annotated in each broken bar section. The examples I have found online manually enter the events x,y positions, AND, are not grouped broken bar examples.
the end goal is to have these events display on-hover, but I believe I wont have an issue if I can just get the events to display.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import matplotlib.ticker as ticker
import io
pd.plotting.register_matplotlib_converters()
inp = u""" T29,11/4/2019 0:00,11/4/2019 0:00,off,none
T29,11/4/2019 0:00,11/5/2019 0:00,off,eventa
T29,11/5/2019 0:00,11/6/2019 0:00,on,none
T35,11/4/2019 0:00,11/5/2019 0:00,off,eventb
T35,11/5/2019 0:00,11/6/2019 0:00,paused,eventa
T43,11/4/2019 0:00,11/4/2019 4:01,on,none
T43,11/4/2019 4:01,11/4/2019 12:06,off,none
T43,11/4/2019 12:06,11/5/2019 8:07,on,eventc
T43,11/5/2019 8:07,11/5/2019 10:12,paused,eventd
T43,11/5/2019 10:12,11/5/2019 16:15,on,none
T43,11/5/2019 18:12,11/5/2019 20:15,off,none
"""
mydateparser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y %H:%M")
df = pd.read_csv(io.StringIO(inp), header=0, encoding = "ISO-8859-1", parse_dates=['StartTime', 'FinishTime'], date_parser=mydateparser, names=["Name", "StartTime", "FinishTime", "Status", "Event"])
color = {"on": "g", "paused": "yellow", "off": "black"}
df["Diff"] = df.FinishTime - df.StartTime
minDate = (datetime.datetime.toordinal(min(df.StartTime)))
maxDate = (datetime.datetime.toordinal(max(df.FinishTime)))
days = mdates.DayLocator()
Mcount = 0
fig, ax = plt.subplots(figsize=(6, 3), edgecolor="black", linewidth=1)
labels = []
for i, task in enumerate(df.groupby("Name")):
Mcount += 1
labels.append(task[0])
for r in task[1].groupby("Status"):
data = r[1][["StartTime", "Diff"]]
ax.broken_barh(data.values, (i - 0.4, 0.8), edgecolor="black", alpha=1, linewidth=1,
color=color[r[0]])
ax.set_ylim(bottom=-0.8, top=Mcount)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
ax.set_ylabel("Names", rotation=90, fontdict={'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 14})
ax.set_xlim(left=minDate, right=maxDate)
ax.set_xlabel("Date", fontdict={'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 14})
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d-%Y'))
ax.tick_params(which='major', axis='x', rotation=0, length=11, color='black')
ax.xaxis.set_major_locator(days)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
ax.tick_params(which='minor', rotation=0, labelsize=8, length=4, color='red', size=2)
ax.xaxis.set_minor_locator(ticker.MultipleLocator(.50))
plt.show()

Hello and welcome to StackOverflow. IIUC, you can append a for loop to your enumerate statement to add text to the axes.
for i, task in enumerate(df.groupby("Name")):
Mcount += 1
labels.append(task[0])
for r in task[1].groupby("Status"):
data = r[1][["StartTime", "Diff"]]
ax.broken_barh(data.values,
(i - 0.4, 0.8),
edgecolor="black",
alpha=1,
linewidth=1,
color=color[r[0]]
)
for x1, x2 in data.values:
ax.text(x=x1 + x2/2,
y=i,
s=r[1]["Event"].values[0],
ha='center',
va='center',
color='white',
)
Modified from the docs.
Output:
You can, of course, modify the text formatting.
The text requires an x location, a y location, and a string. The hacky indexing was the quickest way I could pull the event info out of your dataframe.

Related

Is there a way to add a colorbar to the top or side of the whole figure? [duplicate]

I cannot for the life of me figure out how to attach one colorbar for multiple pandas subplots. Almost all the other questions that resolve the issue of putting one colorbar for multiple subplots use np arrays, not dataframes, to plot.
There is one question, One colorbar for seaborn heatmaps in subplot, which seems like it could be useful, however I could not figure out how to extend it to my case.
Could anyone help? Below is an example of my current code. Thanks in advance!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# If you're using a notebook:
# %matplotlib inline
df = pd.DataFrame({"BASE": np.random.randn(10),
"A": np.random.randn(10),
"B": np.random.randn(10),
"C": np.random.randn(10),
"D": np.random.randn(10),
"color_col": [1,1,2,2,1,1,2,1,2,2]})
plt.figure(1, figsize = (15,15))
plt.subplot(2,2,1)
df.plot.scatter(x = "BASE", y = "A", c = df["color_col"], ax = plt.gca())
plt.subplot(2,2,2)
df.plot.scatter(x = "BASE", y = "B", c = df["color_col"], ax = plt.gca())
plt.subplot(2,2,3)
df.plot.scatter(x = "BASE", y = "C", c = df["color_col"], ax = plt.gca())
plt.subplot(2,2,4)
df.plot.scatter(x = "BASE", y = "D", c = df["color_col"], ax = plt.gca())
The question Matplotlib 2 Subplots, 1 Colorbar is probably more what you are looking for. The problem is however that you do not directly have access to the mappable that is created in the pandas scatter plot.
The solution here would be to distill this mappable (in this case a PatchCollection) from the axes using it plt.gca().get_children()[0], which takes the first child artist from the axes.
This method is save as long as all scatterplots share the same colors and
as long as there are no other artists in the axes.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame({"BASE": np.random.randn(10),
"A": np.random.randn(10),
"B": np.random.randn(10),
"C": np.random.randn(10),
"D": np.random.randn(10),
"color_col": np.random.randn(10)})
fig = plt.figure(1, figsize = (6,6))
plt.subplots_adjust(wspace=0.5, right=0.8, top=0.9, bottom=0.1)
for i, col in enumerate("ABCD"):
plt.subplot(2,2,i+1)
df.plot.scatter(x = "BASE", y = col, ax = plt.gca(),
c = df["color_col"], cmap="jet", colorbar=False)
# we now take the first axes and
# create a colorbar for it's first child (the PathCollection from scatter)
# this is save as long as all scatterplots share the same colors and
# as long as there are no other artists in the axes.
im = plt.gca().get_children()[0]
cax = fig.add_axes([0.85,0.1,0.03,0.8])
fig.colorbar(im, cax=cax)
plt.show()
Here're a few stones at the target. I'm pretty sure at least one more method exists (using GridSpec differently than outlined below) but I like the latter two.
1. You could simply choose when to plot a color bar vs not, as long as you don't care if one of your subplots is trimmed to make room for the colorbar.
df = pd.DataFrame({"BASE": np.random.randn(10),
"A": np.random.randn(10),
"B": np.random.randn(10),
"C": np.random.randn(10),
"D": np.random.randn(10),
"color_col": np.random.randn(10)})
fig,axis = plt.subplots(nrows=1,ncols=4,figsize=(18,6))
for i, col in enumerate("ABCD"):
ax = axis[i]
df.plot.scatter(ax=ax, x = "BASE", y = col, c = df["color_col"], s = 55, cmap="jet", colorbar=(i==len("ABCD")-1))
fig.tight_layout()
plt.show()
2. If you really wanted 2X2 grid, and allsubplots must be same size, you could try this. I like the first answer, sans having to place color bar at a specific location. I'd rather draw everything I want, use existing tight_layout() functionality and then add a colorbar. The only magic number I retained is figsize.
fig,axes = plt.subplots(2,2, figsize = (9,4))
df = pd.DataFrame({"BASE": np.random.randn(10),
"A": np.random.randn(10),
"B": np.random.randn(10),
"C": np.random.randn(10),
"D": np.random.randn(10),
"color_col": np.random.randn(10)})
for i, col in enumerate("ABCD"):
ax = axes[i/2][i%2]
im = df.plot.scatter(x = "BASE", y = col, ax = ax, s = 35,
c = df["color_col"], cmap="jet", colorbar=False)
fig.tight_layout()
# Note: since colorbar is manu`enter code here`ally added, tight_layout must be called before
# rendering colorbar
plt.colorbar(plt.gca().get_children()[0], ax=axes.ravel().tolist())
3. Yet another approach would be to use gridspec as an argument to suplots constructor:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.gridspec as gridspec
df = pd.DataFrame({"BASE": np.random.randn(10),
"A": np.random.randn(10),
"B": np.random.randn(10),
"C": np.random.randn(10),
"D": np.random.randn(10),
"color_col": np.random.randn(10)})
nrows,ncols = 2,2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex='col', sharey=False,
gridspec_kw={'width_ratios': [1, 1,0.1]},
figsize=(7, 4))
for i, col in enumerate("ABCD"):
ax = axes[i/(ncols)][i%(ncols)]
im = df.plot.scatter(x = "BASE", y = col, ax = ax, s = 35
,c = df["color_col"], cmap="jet", colorbar=False)
fig.tight_layout()
cax,kw = mpl.colorbar.make_axes([ax for ax in axes.flat])
plt.colorbar(axes[0][0].get_children()[0], cax=cax, **kw)
plt.show()

Matplotlib Control Spacing Between Bars

I am trying to insert spacing between two specific bars but cannot find any easy way to do this. I can manually add a dummy row with with 0 height to create and empty space but doesn't give me control of how wide the space should be. Is there a more programmatic method I can use to control the spacing between bars at any position?
Example Code:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
mydict = {
'Event': ['Running', 'Swimming', 'Biking', '', 'Hiking', 'Jogging'],
'Completed': [2, 4, 3, 0, 7, 9],
'Participants': [10, 20, 35, 0, 10, 20]}
df = pd.DataFrame(mydict).set_index('Event')
df = df.assign(Completion=(df.Completed / df.Participants) * 100)
plt.subplots(figsize=(5, 4))
print(df.index)
ax = sns.barplot(x=df.Completion, y=df.index, color="orange", orient='h')
plt.xticks(rotation=60)
plt.tight_layout()
plt.show()
Example DataFrame Output:
Completed Participants Completion
Event
Running 2 10 20.000000
Swimming 4 20 20.000000
Biking 3 35 8.571429
0 0 NaN
Hiking 7 10 70.000000
Jogging 9 20 45.000000
Example output (blue arrows added outside of code to show where empty row was added.):
I think you can access the position of the boxes and the name of the labels. Then modify them. You may find an more general way depending on your use case, but this works for the given example.
#define a function to add space starting a specific label
def add_space_after(ax, label_shift='', extra_space=0):
bool_space = False
# get postion of current ticks
ticks_position = np.array(ax.get_yticks()).astype(float)
# iterate over the boxes/label
for i, (patch, label) in enumerate(zip(ax.patches, ax.get_yticklabels())):
# if the label to start the shift found
if label.get_text()==label_shift: bool_space = True
# reposition the boxes and the labels afterward
if bool_space:
patch.set_y(patch.get_y() + extra_space)
ticks_position[i] += extra_space
# in the case where the spacing is needed
if bool_space:
ax.set_yticks(ticks_position)
ax.set_ylim([ax.get_ylim()[0]+extra_space, ax.get_ylim()[1]])
#note: no more blank row
mydict = {
'Event': ['Running', 'Swimming', 'Biking', 'Hiking', 'Jogging'],
'Completed': [2, 4, 3, 7, 9],
'Participants': [10, 20, 35, 10, 20]}
df = pd.DataFrame(mydict).set_index('Event')
df = df.assign(Completion=(df.Completed / df.Participants) * 100)
ax = sns.barplot(x=df.Completion, y=df.index, color="orange", orient='h')
plt.xticks(rotation=60)
plt.tight_layout()
#use the function
add_space_after(ax, 'Hiking', 0.6)
plt.show()

Python 3 - matplotlib not recognizing timezones when plotting hours on x axis

I am trying to plot scheduled hours of work vs. actual hours of work, but the hours on the x axis are not recognizing the timezone offset.
Here is an example of the code I am using:
### Import stack
import pandas as pd
import datetime
from datetime import datetime as dt
from datetime import date, timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
### build dummy shifts df
shifts = [{'name': 'Bob', 'start': '2018-10-13 20:00:00+13:00', 'finish': '2018-10-14 03:00:00+13:00', 'type': 'scheduled'},
{'name': 'Bob', 'start': '2018-10-13 20:30:00+13:00', 'finish': '2018-10-14 03:02:00+13:00', 'type': 'actual'},
{'name': 'Joe', 'start': '2018-10-13 22:00:00+13:00', 'finish': '2018-10-14 03:00:00+13:00', 'type': 'scheduled'},
{'name': 'Joe', 'start': '2018-10-13 22:00:00+13:00', 'finish': '2018-10-14 02:06:00+13:00', 'type': 'actual'},
{'name': 'Sally', 'start': '2018-10-13 18:30:00+13:00', 'finish': '2018-10-14 03:00:00+13:00', 'type': 'scheduled'},
{'name': 'Sally', 'start': '2018-10-13 18:30:00+13:00', 'finish': '2018-10-14 02:05:00+13:00', 'type': 'actual'}]
df = pd.DataFrame(shifts)
df['start'] = pd.to_datetime(df['start'].apply(pd.Timestamp))
df['finish'] = pd.to_datetime(df['finish'].apply(pd.Timestamp))
### Plot of scheduled vs. actual hours
hours = mdates.HourLocator() # every hour
minutes = mdates.MinuteLocator(interval= 30) # every 30 mins
hoursFmt = mdates.DateFormatter('%I %p')
xStart = (df[['start']].min() - timedelta(hours = 1)).astype(datetime.datetime)
xEnd = (df[['finish']].max() + timedelta(hours = 1)).astype(datetime.datetime)
#scheduled time period
scheduledStart = mdates.date2num(df['start'][(df['type'] == 'scheduled')].dt.to_pydatetime())
scheduledEnd = mdates.date2num(df['finish'][(df['type'] == 'scheduled')].dt.to_pydatetime())
scheduledWidth = scheduledEnd - scheduledStart
#actual time period
actualStart = mdates.date2num(df['start'][(df['type'] == 'actual')].dt.to_pydatetime())
actualEnd = mdates.date2num(df['finish'][(df['type'] == 'actual')].dt.to_pydatetime())
actualWidth = actualEnd - actualStart
#y axis values
yval = df['name'].unique()
actualYTicks = [index for index, value in enumerate(actualStart)]
scheduledYTicks = [x+0.3 for x in actualYTicks]
yTicks = [sum(x)/2 for x in zip(actualYTicks, scheduledYTicks)]
#generate plot
fig = plt.figure(1)
ax = fig.add_subplot(111)
ax.barh(scheduledYTicks, width = scheduledWidth, left = scheduledStart, color = 'lightgrey', height = 0.3, label = 'Scheduled Hours')
ax.barh(actualYTicks, width = actualWidth, left = actualStart, color = 'green', height = 0.3, label = 'Actual Hours')
#format x axis to time of day
ax.xaxis.set_major_locator(hours)
ax.xaxis.set_major_formatter(hoursFmt)
ax.xaxis.set_minor_locator(minutes)
# autorotate the dates
fig.autofmt_xdate()
plt.yticks(yTicks, yval)
ax.set_xlim([mdates.date2num(xStart), mdates.date2num(xEnd)])
#extract legend labels
handles, labels = ax.get_legend_handles_labels()
lgd = ax.legend(handles, labels, loc= 'upper center', bbox_to_anchor=(0.5,-0.1))
plt.show()
The graph is formatted how I want it to look as seen in the image but the hours on the x axis are not correct. Sally is actually scheduled to start at 7:30 am local time and end at 4:00pm local time on October 14th. The start and end dates are 'Pacific/Auckland' timezone aware so why is this not being capture on the x axis using matplotlib dates date2num?

add secondary description in axis values, plotly

I am using a dataframe which includes the following columns:
Country, GNI, CarSalesPerCap. I am using kmeans to create clusters. In the algorithm i pass the dataframe with the two numeric columns: 'GNI', 'CarSalesPerCap'.
Then i am using plotly to create a scatter plot, where x-axis is the CarsalesPerCap and Y-axis is GNI. My question is, how am i going to add to the plot the corresponding country for each point plotted on the graph.
df = pd.read_sql_query(query,conn)
df = df.dropna()
#Cluster the data
kmeans = KMeans(n_clusters=6, random_state=0).fit(df1)
labels = kmeans.labels_
#Glue back to originaal data
df['clusters'] = labels
#Lets analyze the clusters
print (df)
cluster0=df.loc[df['clusters'] == 0]
cluster1=df.loc[df['clusters'] == 1]
cluster2=df.loc[df['clusters'] == 2]
cluster3=df.loc[df['clusters'] == 3]
cluster4=df.loc[df['clusters'] == 4]
cluster5=df.loc[df['clusters'] == 5]
p0 = go.Scatter(x=cluster0['CarSalesPerCap'],
y= cluster0['GNI'],
mode='markers',
marker=dict(color='black')
)
p1 = go.Scatter(x=cluster1['CarSalesPerCap'],
y= cluster1['GNI'],
mode='markers',
marker=dict(color='teal')
)
p2 = go.Scatter(x=cluster2['CarSalesPerCap'],
y= cluster2['GNI'],
mode='markers',
marker=dict(color='grey')
)
p3 = go.Scatter(x=cluster3['CarSalesPerCap'],
y= cluster3['GNI'],
mode='markers',
marker=dict(color='pink')
)
p4 = go.Scatter(x=cluster4['CarSalesPerCap'],
y= cluster4['GNI'],
mode='markers',
marker=dict(color='purple')
)
p5 = go.Scatter(x=cluster5['CarSalesPerCap'],
y= cluster5['GNI'],
mode='markers',
marker=dict(color='orange')
)
layout = go.Layout(xaxis=dict(ticks='',
showticklabels=True,
zeroline=True,
title = 'CarSalesPerCap'),
yaxis=dict(ticks='',
showticklabels=True,
zeroline=True,
title='GNI'),
showlegend=False, hovermode='closest')
fig = go.Figure(data=[p0,p1,p2,p3,p4,p5], layout=layout)
py.offline.plot(fig)
You can add a text element to your trace and it will allow you to overlay anything you want. If you add your country column then it will be displayed on hover. If you want a permanent label you can add annotations
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import pandas as pd
df = pd.DataFrame({'country':["USA", "MEXICO", "CANADA"], 'x':[1, 2, 4], 'y':[5, 6, 7]})
p0 = go.Scatter(
x=df.x,
y= df.y,
mode='markers',
marker=dict(
color='#E90',
size=15
),
text = df.country,
)
data = [p0]
iplot(data)

How do I add an axis for a second trace in a Plotly subplot?

I have three traces, one of which I have in one subplot, and two of which are in another. I would like to have a distinct y-axis each of the traces in the subplot with 2 traces.
For example, I have
fig = plotly.tools.make_subplots(rows=2, cols=1, shared_xaxes=True)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 1)
fig.append_trace(trace3, 2, 1)
fig['layout'].update(height=200, width=400)
which produces
And when I have no subplots, I can get a second axis for the second trace with
layout = go.Layout(
yaxis=dict(
title='y for trace1'
),
yaxis2=dict(
title='y for trace2',
titlefont=dict(
color='rgb(148, 103, 189)'
),
tickfont=dict(
color='rgb(148, 103, 189)'
),
overlaying='y',
side='right'
)
)
fig = go.Figure(data=data, layout=layout)
which produces
But I can't figure out how to get the first subplot in the first example to look like the plot in the second example: with a distinct axis for the second trace there.
How do I add an axis for a second trace in a Plotly subplot?
This is a bit of a workaround, but it seems to work:
import plotly as py
import plotly.graph_objs as go
from plotly import tools
import numpy as np
left_trace = go.Scatter(x = np.random.randn(1000), y = np.random.randn(1000), yaxis = "y1", mode = "markers")
right_traces = []
right_traces.append(go.Scatter(x = np.random.randn(1000), y = np.random.randn(1000), yaxis = "y2", mode = "markers"))
right_traces.append(go.Scatter(x = np.random.randn(1000) * 10, y = np.random.randn(1000) * 10, yaxis = "y3", mode = "markers"))
fig = tools.make_subplots(rows = 1, cols = 2)
fig.append_trace(left_trace, 1, 1)
for trace in right_traces:
yaxis = trace["yaxis"] # Store the yaxis
fig.append_trace(trace, 1, 2)
fig["data"][-1].update(yaxis = yaxis) # Update the appended trace with the yaxis
fig["layout"]["yaxis1"].update(range = [0, 3], anchor = "x1", side = "left")
fig["layout"]["yaxis2"].update(range = [0, 3], anchor = "x2", side = "left")
fig["layout"]["yaxis3"].update(range = [0, 30], anchor = "x2", side = "right", overlaying = "y2")
py.offline.plot(fig)
Produces this, where trace0 is in the first subplot plotted on yaxis1, and trace1 and trace2 are in the second subplot, plotted on yaxis2 (0-3) and yaxis3 (0-30) respectively:
When traces are appended to subplots, the xaxis and yaxis seem to be overwritten, or that's my understanding of this discussion anyway.

Resources