Plot crosstab results using All row as benchmark lines - python-3.x

I have this sample dataframe:
test = pd.DataFrame({'cluster':['1','1','1','1','2','2','2','2','2','3','3','3'],
'type':['a','b','c','a','a','b','c','c','a','b','c','a']})
I use crosstab to produce a new dataframe and plot results:
pd.crosstab(test.cluster,test.type,normalize='index',margins=True).plot(kind='bar')
I would like to plot the row All as dotted horizontal benchmark lines of the same colour corresponding to each type to improve interpretation of the plot. Will appreciate help of this community!

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
test = pd.DataFrame(
{'cluster': ['1', '1', '1', '1', '2', '2', '2', '2', '2', '3', '3', '3'],
'type': ['a', 'b', 'c', 'a', 'a', 'b', 'c', 'c', 'a', 'b', 'c', 'a']})
tab = pd.crosstab(test.cluster, test.type, normalize='index', margins=True)
fig, ax = plt.subplots()
# find the default colors
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
# make a bar plot using all rows but the last
tab.iloc[:-1].plot(ax=ax, kind='bar', color=colors)
# draw the horizontal dotted lines
for y, c in zip(tab.loc['All'], colors):
ax.axhline(y=y, color=c, linestyle=':', alpha=0.5)
plt.show()

Related

How do I replace all occurrences of '+' with '.5' in a dataframe?

I have a dataframe below:
data = {'Name': ['A', 'B', 'C', 'D'],
'Lower': ['+', '2', '2+', '3'],
'Upper': ['2','3+','4+','5']}
df= pd.DataFrame(data)
The expected output should be:
data = {'Name': ['A', 'B', 'C', 'D'],
'Lower': ['.5', '2', '2.5', '3'],
'Upper': ['2','3.5','4.5','5']}
I have tried using the code below but it only replaces + and not 2+, 3+, 4+
df.replace('+','.5', regex=False)
I also tried using str.replace but the rest of the values become NaN:
df['Lower'].str.replace('+', '.5')
you can override the value by looping, but it's not the fastest solution
import pandas as pd
data = {'Name': ['A', 'B', 'C', 'D'],
'Lower': ['+', '2', '2+', '3'],
'Upper': ['2','3+','4+','5']}
lower = []
upper = []
newdata = {'Name': ['A', 'B', 'C', 'D'],
'Lower': lower,
'Upper': upper}
for i in data['Lower']:
if "+" in i:
lower.append(i.replace("+", ".5"))
else:
lower.append(i)
for j in data['Upper']:
if "+" in j:
upper.append(j.replace("+", ".5"))
else:
upper.append(j)
df= pd.DataFrame(newdata)
print(df)

Python function for plotting multiple figures

The following function creates rows = 1 and columns = 9 plots.
def plot_percentiles(df_list, site_names=['a','b','c', 'd', 'e', 'f', 'i', 'j', 'k'],
xlabel=r"PA [$m^2 m^{-3}$]",ylim=(0,50),xlim=(0,0.6)):
figure, ax = plt.subplots(1,9, figsize=[10,3], squeeze=True)
figure.tight_layout()
for i, df in enumerate(df_list):
ax[i].fill_betweenx(x1=df["10th percentile"], x2=df["90th percentile"], y=df["Height"],
color="darkgreen", alpha=.5, linewidth=0)
ax[i].fill_betweenx(x1=df["25th percentile"], x2=df["75th percentile"], y=df["Height"],
color="darkgreen", alpha=.5, linewidth=0)
ax[i].plot(df["Median"], df["Height"], color = "darkgreen", linewidth=1)
ax[i].set_ylabel("Height [m]", fontsize=10)
ax[i].set_xlabel(xlabel, fontsize=10)
ax[i].set(ylim=ylim, xlim=xlim)
ax[i].set_title(site_names[i], fontsize=12)
ax[i].set_facecolor('white')
plt.show()
How do I change it to create rows = 3, columns = 3 plots? Simply changing figure, ax = plt.subplots(3,3, figsize=[10,3], squeeze=True) doesn't work.

Pandas error TypeError: bad operand type for unary ~: 'float'

I have inherited this piece of code
dummy_data1 = {
'id': ['1', '2', '3', '4', '5'],
'Feature1': ['A', 'C', 'E', 'G', 'I'],
'Feature2': ['Mouse', 'dog', 'house and parrot', '23', np.NaN],
'dates': ['12/12/2020','12/12/2020','12/12/2020','12/12/2020','12/12/2020']}
df1 = pd.DataFrame(dummy_data1, columns = ['id', 'Feature1', 'Feature2', 'dates'])
df1 = df1.assign(
Feature2=lambda df: df.Feature2.where(
~df.Feature2.str.isnumeric(),
pd.to_numeric(df.Feature2, errors="coerce").astype("Int64"),
)
)
print(df1)
I know that this is because of the np.NAN value. What does the code do? My understanding is that it tries to convert the String to Int, if it is of type integer. Also please tell me how to overcome this issue.
You can try via pd.to_numeric() and then fill NaN's:
df['Feature2']=pd.to_numeric(df['Feature2'], errors="coerce").fillna(df['Feature2'])
OR
go with the where() condition by filling those NaN's with fillna() in your condition ~df.Feature2.str.isnumeric():
df['Feature2']=df['Feature2'].where(~df.Feature2.str.isnumeric().fillna(True),
pd.to_numeric(df.Feature2, errors="coerce").astype("Int64")
)

Label and color glyph in bokeh

I am trying out bokeh. It's quite fun so far. But I am not totally getting the hang of it. My goal is to make a simple but interactive scatter chart.
I have three main issues:
I want to label the scatter plot with names
I want the scatter to be colored in accordance to colors
I would love widgets where I can decide if the colors and names are displayed.
Here is what I have done so far. I tried to use LabelSet but I am stuck. Any help is greatly appreciated!
# interactive widget bokeh figure
from bokeh.io import curdoc
from bokeh.layouts import row, widgetbox
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import Slider, TextInput
from bokeh.plotting import figure
from bokeh.models import Range1d, LabelSet, Label
import numpy as np
# data
x = [-4, 3, 2, 4, 10, 11, -2, 6]
y = [-3, 2, 2, 9, 11, 12, -5, 6]
names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
colors =['r', 'y', 'y', 'r', 'g', 'g', 'g', 'g']
p = figure(plot_height=400, plot_width=400, title="a little interactive chart",
tools="crosshair,pan,reset,save,wheel_zoom",
x_range=[-10, 10], y_range=[-10, 10])
labels = LabelSet(x='x', y='y', text='names', level='glyph',
x_offset=5, y_offset=5)
p.add_layout(labels)
p.circle(x, y, fill_color="red", line_color="red", size=6)
# Set up widgets
text = TextInput(title="title", value='a little interavtive chart')
# Set up callbacks
def update_title(attrname, old, new):
p.title.text = text.value
text.on_change('value', update_title)
# # Set up layouts and add to document
inputs = widgetbox(text, names)
curdoc().add_root(row(inputs, p, width=800))
curdoc().title = "Sliders"
Typically you use LabelSet by configuring it with the same data source as some glyph renderer. I find whenever sharing column data sources, its best to just also create them explicitly. Here is an updated version of your code that renders:
# interactive widget bokeh figure
from bokeh.io import curdoc
from bokeh.layouts import row, widgetbox
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label
from bokeh.models.widgets import Slider, TextInput
from bokeh.plotting import figure
# data
x = [-4, 3, 2, 4, 10, 11, -2, 6]
y = [-3, 2, 2, 9, 11, 12, -5, 6]
names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
colors =['r', 'y', 'y', 'r', 'g', 'g', 'g', 'g']
# create a CDS by hand
source = ColumnDataSource(data=dict(x=x, y=y, names=names, colors=colors))
p = figure(plot_height=400, plot_width=400, title="a little interactive chart",
tools="crosshair,pan,reset,save,wheel_zoom",
x_range=[-10, 10], y_range=[-10, 10])
# pass the CDS here, and column names (not the arrays themselves)
p.circle('x', 'y', fill_color="red", line_color="red", size=6, source=source)
# pass the CDS here too
labels = LabelSet(x='x', y='y', text='names', level='glyph',
x_offset=5, y_offset=5, source=source)
p.add_layout(labels)
# Set up widgets
text = TextInput(title="title", value='a little interavtive chart')
# Set up callbacks
def update_title(attrname, old, new):
p.title.text = text.value
text.on_change('value', update_title)
# # Set up layouts and add to document
inputs = widgetbox(text)
curdoc().add_root(row(inputs, p, width=800))
curdoc().title = "Sliders"
I also removed names from the widgetbox because widget boxes can only contain widget models. Maybe you intend to use the names in a Select widget or something?

How to add column next to Seaborn heat map

Given the code below, which produces a heat map, how can I get column "D" (the total column)
to display as a column to the right of the heat map with no color, just aligned total values per cell? I'm also trying to move the labels to the top. I don't mind that the labels on the left are horizontal as this does not occur with my actual data.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
df = pd.DataFrame(
{'A' : ['A', 'A', 'B', 'B','C', 'C', 'D', 'D'],
'B' : ['A', 'B', 'A', 'B','A', 'B', 'A', 'B'],
'C' : [2, 4, 5, 2, 0, 3, 9, 1],
'D' : [6, 6, 7, 7, 3, 3, 10, 10]})
df=df.pivot('A','B','C')
fig, ax = plt.subplots(1, 1, figsize =(4,6))
sns.heatmap(df, annot=True, linewidths=0, cbar=False)
plt.show()
Here's the desired result:
Thanks in advance!
I think the cleanest way (although probably not the shortest), would be to plot Total as one of the columns, and then access colors of the facets of the heatmap and change some of them to white.
The element that is responsible for color on heatmap is matplotlib.collections.QuadMesh. It contains all facecolors used for each facet of the heatmap, from left to right, bottom to top.
You can modify some colors and pass them back to QuadMesh before you plt.show().
There is a slight problem that seaborn changes text color of some of the annotations to make them visible on dark background, and they become invisible when you change to white color. So for now I set color of all text to black, you will need to figure out what is best for your plots.
Finally, to put x axis ticks and label on top, use:
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
The final version of the code:
import matplotlib.pyplot as plt
from matplotlib.collections import QuadMesh
from matplotlib.text import Text
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline
df = pd.DataFrame(
{'A' : ['A', 'A', 'B', 'B','C', 'C', 'D', 'D'],
'B' : ['A', 'B', 'A', 'B','A', 'B', 'A', 'B'],
'C' : [2, 4, 5, 2, 0, 3, 9, 1],
'D' : [6, 6, 7, 7, 3, 3, 10, 10]})
df=df.pivot('A','B','C')
# create "Total" column
df['Total'] = df['A'] + df['B']
fig, ax = plt.subplots(1, 1, figsize =(4,6))
sns.heatmap(df, annot=True, linewidths=0, cbar=False)
# find your QuadMesh object and get array of colors
quadmesh = ax.findobj(QuadMesh)[0]
facecolors = quadmesh.get_facecolors()
# make colors of the last column white
facecolors[np.arange(2,12,3)] = np.array([1,1,1,1])
# set modified colors
quadmesh.set_facecolors = facecolors
# set color of all text to black
for i in ax.findobj(Text):
i.set_color('black')
# move x ticks and label to the top
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
plt.show()
P.S. I am on Python 2.7, some syntax adjustments might be required, though I cannot think of any.

Resources