Matplotlib Control Spacing Between Bars - python-3.x

I am trying to insert spacing between two specific bars but cannot find any easy way to do this. I can manually add a dummy row with with 0 height to create and empty space but doesn't give me control of how wide the space should be. Is there a more programmatic method I can use to control the spacing between bars at any position?
Example Code:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
mydict = {
'Event': ['Running', 'Swimming', 'Biking', '', 'Hiking', 'Jogging'],
'Completed': [2, 4, 3, 0, 7, 9],
'Participants': [10, 20, 35, 0, 10, 20]}
df = pd.DataFrame(mydict).set_index('Event')
df = df.assign(Completion=(df.Completed / df.Participants) * 100)
plt.subplots(figsize=(5, 4))
print(df.index)
ax = sns.barplot(x=df.Completion, y=df.index, color="orange", orient='h')
plt.xticks(rotation=60)
plt.tight_layout()
plt.show()
Example DataFrame Output:
Completed Participants Completion
Event
Running 2 10 20.000000
Swimming 4 20 20.000000
Biking 3 35 8.571429
0 0 NaN
Hiking 7 10 70.000000
Jogging 9 20 45.000000
Example output (blue arrows added outside of code to show where empty row was added.):

I think you can access the position of the boxes and the name of the labels. Then modify them. You may find an more general way depending on your use case, but this works for the given example.
#define a function to add space starting a specific label
def add_space_after(ax, label_shift='', extra_space=0):
bool_space = False
# get postion of current ticks
ticks_position = np.array(ax.get_yticks()).astype(float)
# iterate over the boxes/label
for i, (patch, label) in enumerate(zip(ax.patches, ax.get_yticklabels())):
# if the label to start the shift found
if label.get_text()==label_shift: bool_space = True
# reposition the boxes and the labels afterward
if bool_space:
patch.set_y(patch.get_y() + extra_space)
ticks_position[i] += extra_space
# in the case where the spacing is needed
if bool_space:
ax.set_yticks(ticks_position)
ax.set_ylim([ax.get_ylim()[0]+extra_space, ax.get_ylim()[1]])
#note: no more blank row
mydict = {
'Event': ['Running', 'Swimming', 'Biking', 'Hiking', 'Jogging'],
'Completed': [2, 4, 3, 7, 9],
'Participants': [10, 20, 35, 10, 20]}
df = pd.DataFrame(mydict).set_index('Event')
df = df.assign(Completion=(df.Completed / df.Participants) * 100)
ax = sns.barplot(x=df.Completion, y=df.index, color="orange", orient='h')
plt.xticks(rotation=60)
plt.tight_layout()
#use the function
add_space_after(ax, 'Hiking', 0.6)
plt.show()

Related

How to display data across, by row, in pie chart in plotly/streamlit?

I have pandas df that looks like this that I want to display as a dashboard:
fname col1 col2 col3 sum
A 2 3 3 10
B 1 2 3 12
C 6 6 3 13
If a fname is selected by row, I want to display the pie slices as the column values by row.
What is the best way to display the data by fname grouped across by row in a pie chart?
I am not sure what to display when all the column values for fname are selected.
I tried creating a sunburst chart like so, but the chart is extremely convoluted:
px.sunburst(df, values='sum', path=[
'col3',
'col2',
'col1',
'fname'],
title='pie')
Here is a basic example.
import plotly.express as px
import pandas as pd
import streamlit as st
data = {
'ctry': ['USA', 'PHI', 'CHN'],
'gold': [12, 1, 20,],
'silver': [4,4, 12],
'bronze': [8, 2, 30],
'sum': [24, 7, 62]
}
df = pd.DataFrame(data)
st.dataframe(df)
cols = st.columns([1, 1])
with cols[0]:
medal_type = st.selectbox('Medal Type', ['gold', 'silver', 'bronze'])
fig = px.pie(df, values=medal_type, names='ctry',
title=f'number of {medal_type} medals',
height=300, width=200)
fig.update_layout(margin=dict(l=20, r=20, t=30, b=0),)
st.plotly_chart(fig, use_container_width=True)
with cols[1]:
st.text_input('sunburst', label_visibility='hidden', disabled=True)
fig = px.sunburst(df, path=['ctry', 'gold', 'silver', 'bronze'],
values='sum', height=300, width=200)
fig.update_layout(margin=dict(l=20, r=20, t=30, b=0),)
st.plotly_chart(fig, use_container_width=True)
Output

How to ignore or clip negative values in altair charts from the chart code itself?

I want to NOT show the negative value in the bar chart. The main idea is to NOT have that y-axis offset(in the actual problem its a facet), so any way to achieve this is fine - maybe clipping - just not at data level, preferably from the chart itself.
I thought of using alt.Scale but the domain requires you to specify a max limit and the issue is that I do not know that first hand, and I cannot find a way to programmatically specify max over the values.
You can use the following demo chart -
import pandas as pd
import altair as alt
dd = pd.DataFrame({'a': [0,1,2,3,4,5], 'b': [10,14, -5, 15, 0, 5]})
a = alt.Chart().mark_bar().encode(
x='a',
y=alt.Y('b:Q')
)
b = alt.Chart().mark_line().transform_window(
rolling_mean = 'mean(b)',
frame=[-2, 0]).encode(
x='a',
y='rolling_mean:Q'
)
alt.layer(a, b, data=dd)
There are only two ways I know of to hide data on a chart. First, you can set an explicit scale domain and set clip=True for the relevant marks:
import pandas as pd
import altair as alt
dd = pd.DataFrame({'a': [0,1,2,3,4,5], 'b': [10,14, -5, 15, 0, 5]})
a = alt.Chart().mark_bar(clip=True).encode(
x='a',
y=alt.Y('b:Q', scale=alt.Scale(domain=[0, 16]))
)
b = alt.Chart().mark_line().transform_window(
rolling_mean = 'mean(b)',
frame=[-2, 0]).encode(
x='a',
y='rolling_mean:Q'
)
alt.layer(a, b, data=dd)
Second, you can apply a filter transform to your data to remove rows from your dataset:
import pandas as pd
import altair as alt
dd = pd.DataFrame({'a': [0,1,2,3,4,5], 'b': [10,14, -5, 15, 0, 5]})
a = alt.Chart().mark_bar().encode(
x='a',
y=alt.Y('b:Q', scale=alt.Scale(domain=[0, 16]))
)
b = alt.Chart().mark_line().transform_window(
rolling_mean = 'mean(b)',
frame=[-2, 0]).encode(
x='a',
y='rolling_mean:Q'
)
alt.layer(a, b, data=dd).transform_filter(alt.datum.b > 0)
Note that difference: because this transform was applied at the top level, it removes rows for both sub-panels. If you instead apply the filter for only one of the subcharts, the rows will only be removed from that layer:
import pandas as pd
import altair as alt
dd = pd.DataFrame({'a': [0,1,2,3,4,5], 'b': [10,14, -5, 15, 0, 5]})
a = alt.Chart().transform_filter(
alt.datum.b > 0
).mark_bar().encode(
x='a',
y=alt.Y('b:Q', scale=alt.Scale(domain=[0, 16]))
)
b = alt.Chart().mark_line().transform_window(
rolling_mean = 'mean(b)',
frame=[-2, 0]).encode(
x='a',
y='rolling_mean:Q'
)
alt.layer(a, b, data=dd)
One way to do it seems to use transform_filter as follows -
.transform_filter(alt.datum.b >= 0 )

How can I annotate a Grouped Broken Barh Chart Python Matplotlib

I have searched to exhaustion trying to annotate my grouped broken barH chart. I would like to have the "Event" from my dataframe annotated in each broken bar section. The examples I have found online manually enter the events x,y positions, AND, are not grouped broken bar examples.
the end goal is to have these events display on-hover, but I believe I wont have an issue if I can just get the events to display.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import matplotlib.ticker as ticker
import io
pd.plotting.register_matplotlib_converters()
inp = u""" T29,11/4/2019 0:00,11/4/2019 0:00,off,none
T29,11/4/2019 0:00,11/5/2019 0:00,off,eventa
T29,11/5/2019 0:00,11/6/2019 0:00,on,none
T35,11/4/2019 0:00,11/5/2019 0:00,off,eventb
T35,11/5/2019 0:00,11/6/2019 0:00,paused,eventa
T43,11/4/2019 0:00,11/4/2019 4:01,on,none
T43,11/4/2019 4:01,11/4/2019 12:06,off,none
T43,11/4/2019 12:06,11/5/2019 8:07,on,eventc
T43,11/5/2019 8:07,11/5/2019 10:12,paused,eventd
T43,11/5/2019 10:12,11/5/2019 16:15,on,none
T43,11/5/2019 18:12,11/5/2019 20:15,off,none
"""
mydateparser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y %H:%M")
df = pd.read_csv(io.StringIO(inp), header=0, encoding = "ISO-8859-1", parse_dates=['StartTime', 'FinishTime'], date_parser=mydateparser, names=["Name", "StartTime", "FinishTime", "Status", "Event"])
color = {"on": "g", "paused": "yellow", "off": "black"}
df["Diff"] = df.FinishTime - df.StartTime
minDate = (datetime.datetime.toordinal(min(df.StartTime)))
maxDate = (datetime.datetime.toordinal(max(df.FinishTime)))
days = mdates.DayLocator()
Mcount = 0
fig, ax = plt.subplots(figsize=(6, 3), edgecolor="black", linewidth=1)
labels = []
for i, task in enumerate(df.groupby("Name")):
Mcount += 1
labels.append(task[0])
for r in task[1].groupby("Status"):
data = r[1][["StartTime", "Diff"]]
ax.broken_barh(data.values, (i - 0.4, 0.8), edgecolor="black", alpha=1, linewidth=1,
color=color[r[0]])
ax.set_ylim(bottom=-0.8, top=Mcount)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
ax.set_ylabel("Names", rotation=90, fontdict={'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 14})
ax.set_xlim(left=minDate, right=maxDate)
ax.set_xlabel("Date", fontdict={'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 14})
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d-%Y'))
ax.tick_params(which='major', axis='x', rotation=0, length=11, color='black')
ax.xaxis.set_major_locator(days)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%H:%M'))
ax.tick_params(which='minor', rotation=0, labelsize=8, length=4, color='red', size=2)
ax.xaxis.set_minor_locator(ticker.MultipleLocator(.50))
plt.show()
Hello and welcome to StackOverflow. IIUC, you can append a for loop to your enumerate statement to add text to the axes.
for i, task in enumerate(df.groupby("Name")):
Mcount += 1
labels.append(task[0])
for r in task[1].groupby("Status"):
data = r[1][["StartTime", "Diff"]]
ax.broken_barh(data.values,
(i - 0.4, 0.8),
edgecolor="black",
alpha=1,
linewidth=1,
color=color[r[0]]
)
for x1, x2 in data.values:
ax.text(x=x1 + x2/2,
y=i,
s=r[1]["Event"].values[0],
ha='center',
va='center',
color='white',
)
Modified from the docs.
Output:
You can, of course, modify the text formatting.
The text requires an x location, a y location, and a string. The hacky indexing was the quickest way I could pull the event info out of your dataframe.

Interactive Plot of Pandas Data-frame Color coding based on a group from a Column

I have an example pandas dataframe as follows:
day id cnt
2 catx 4
2 kagm 3
2 dyrt 5
3 catx 3
3 kagm 3
3 dyrt 4
5 catx 2
5 kagm 2
5 dyrt 2
I want to plot the scatter data cnt (y) vs day(x), where the points will be labeled (colored/legend) based on the id column.
Now this is pretty simple in seaborn/matplotlib which I know can be plotted and the plot can be saved to a file.
However, I am looking to have an interactive plot using plotly/bokeh/d3/mp3ld etc and finally, put that plot into an url (of my choice or maybe an account based as in plotly). My goal is also to have hover function, which will show me the value of the points when I take the cursor over a specific cursor point.
I have tried bokeh/plotly with cufflinks using ColumnDataSource and everything to try out to get the plots. However, have failed to get anything which I am looking for. Can I get some help in this direction from the experts? Thanks in anticipation.
This code plots the data the way you requested. I created a new dataframe for every category in your dataframe so the interactive legend also works. An array with hex color strings is generated with the length of the number of unique categories and added to the dataframe to give every category it's own color.
#!/usr/bin/python3
import pandas as pd
from bokeh.models import ColumnDataSource
from bokeh.palettes import all_palettes
from bokeh.plotting import figure, output_file, show
data = {'day': [2, 2, 2, 3, 3, 3, 5, 5, 5], 'id': ['catx', 'kagm', 'dyrt', 'catx', 'kagm', 'dyrt', 'catx', 'kagm', 'dyrt'], 'cnt': [4, 3, 5, 3, 3, 4, 2, 2, 2]}
df = pd.DataFrame.from_dict(data)
output_file('plot.html')
tooltips = [
("day", "#day"),
("id", "#$name"),
("count", "#cnt")]
p = figure(tooltips=tooltips, plot_width=800, plot_height=800)
sources = []
colors = all_palettes['Viridis'][len(set(df['id'].tolist()))]
pd.options.mode.chained_assignment = None #Supress false positive warning
for ID, color in zip(set(df['id'].tolist()), colors):
dfSubset = df.loc[df['id'] == ID]
dfSubset['color'] = color
sources.append(ColumnDataSource(dfSubset))
p.circle(x = 'day', y = 'cnt', legend = 'id', color = 'color', name = 'id', alpha = 0.5, size = 15, source = sources[-1])
p.legend.click_policy="hide"
show(p)

Append to dataframe with for loop. Python3

I'm trying to loop through a list(y) and output by appending a row for each item to a dataframe.
y=[datetime.datetime(2017, 3, 29), datetime.datetime(2017, 3, 30), datetime.datetime(2017, 3, 31)]
Desired Output:
Index Mean Last
2017-03-29 1.5 .76
2017-03-30 2.3 .4
2017-03-31 1.2 1
Here is the first and last part of the code I currently have:
import pandas as pd
import datetime
df5=pd.DataFrame(columns=['Mean','Last'],index=index)
for item0 in y:
.........
.........
df=df.rename(columns = {0:'Mean'})
df4=pd.concat([df, df3], axis=1)
print (df4)
df5.append(df4)
print (df5)
My code only puts one row into the dataframe like as opposed to a row for each item in y:
Index Mean Last
2017-03-29 1.5 .76
Try:
y = [datetime(2017, 3, 29), datetime(2017, 3, 30),datetime(2017, 3, 31)]
m = [1.5,2.3,1.2]
l = [0.76, .4, 1]
df = pd.DataFrame([],columns=['time','mean','last'])
for y0, m0, l0 in zip(y,m,l):
data = {'time':y0,'mean':m0,'last':l0}
df = df.append(data, ignore_index=True)
and if you want y to be the index:
df.index = df.time
There are a few ways to skin this, and it's hard to know which approach makes the most sense with the limited info given. But one way is to start with a dataframe that has only the index, iterate through the dataframe by row and populate the values from some other process. Here's an example of that approach:
import datetime
import numpy as np
import pandas as pd
y=[datetime.datetime(2017, 3, 29), datetime.datetime(2017, 3, 30), datetime.datetime(2017, 3, 31)]
main_df = pd.DataFrame(y, columns=['Index'])
#pop in the additional columns you want, but leave them blank
main_df['Mean'] = None
main_df['Last'] = None
#set the index
main_df.set_index(['Index'], inplace=True)
that gives us the following:
Mean Last
Index
2017-03-29 None None
2017-03-30 None None
2017-03-31 None None
Now let's loop and plug in some made up random values:
## loop through main_df and add values
for (index, row) in main_df.iterrows():
main_df.ix[index].Mean = np.random.rand()
main_df.ix[index].Last = np.random.rand()
this results in the following dataframe which has the None values filled:
Mean Last
Index
2017-03-29 0.174714 0.718738
2017-03-30 0.983188 0.648549
2017-03-31 0.07809 0.47031

Resources