Python Bokeh donut chart category, subcategory mean calculation - python-3.x

I am using the following code to draw a bokeh donut chart to visualize the mean prices across different categories and subcategories.
d = Donut(train.groupby(['main_cat','sub_cat']).price.mean(), hover_text='mean',width=500,height=500)
show(d)
For sub_cat, the values are calculated correctly, but for main_cat, instead of showing the mean for main_cat, it is showing the sum of the means of the sub_cat values under that particular main_cat. What change, in either the Bokeh code or the Python code, should be made to correctly show the mean values for main_cat?
Your support is highly appreciated.

There probably is not a way. Donut was part of the old bokeh.charts API that was deprecated and subsequently removed from Bokeh last year. In particular, any problems, issues, or missing features will never receive any additional work. It is abandoned and unmaintained, and should not be used. If you want to use Bokeh to display Donut charts, you can use the annular_wedge glyph to display the donut pieces explicitly:
from math import pi
import pandas as pd
from bokeh.io import output_file, show
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum
x = { 'United States': 157, 'United Kingdom': 93, 'Japan': 89, 'China': 63,
'Germany': 44, 'India': 42, 'Italy': 40, 'Australia': 35,
'Brazil': 32, 'France': 31, 'Taiwan': 31, 'Spain': 29 }
data = pd.Series(x).reset_index(name='value').rename(columns={'index':'country'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
data['color'] = Category20c[len(x)]
p = figure(plot_height=350)
p.annular_wedge(x=0, y=1, inner_radius=0.2, outer_radius=0.4,
start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
line_color="white", fill_color='color', legend='country', source=data)
show(p)

Related

Plotly Dash scatter plot: pointNumber is assigned to multiple points in hover data

I ran into an issue when using Plotly and Dash for retrieving hover data via hovering the cursor over points in a scatter plot.
The hover data retrieved from the Dash app seems to contain the same pointNumber and pointIndex for multiple points in the same plot. This makes it impossible to display the correct information associated to a given instance when hovering over the respective point.
Here is a simplified example which can be run in a Jupyter notebook. In the end I will want to display images on hovering.
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
from jupyter_dash import JupyterDash
from dash import dcc, html, Input, Output, no_update
import plotly.express as px
# Loading iris data to pandas dataframe
data = load_iris()
images = data.data
labels = data.target
df = pd.DataFrame(images[:, :2], columns=["feat1", "feat2"])
df["label"] = labels
# Color for each class
color_map = {0: "setosa",
1: "versicolor",
2: "virginica"}
colors = [color_map[l] for l in labels]
df["color"] = colors
pd.set_option("display.max_rows", None, "display.max_columns", None)
print(df)
# Setup plotly scatter plot
fig = px.scatter(df, x="feat1", y="feat2", color="color")
fig.update_traces(hoverinfo="none",
hovertemplate=None)
# Setup Dash
app = JupyterDash(__name__)
app.layout = html.Div(className="container",
children=[dcc.Graph(id="graph-5", figure=fig, clear_on_unhover=True),
dcc.Tooltip(id="graph-tooltip-5", direction="bottom")])
@app.callback(Output("graph-tooltip-5", "show"),
Output("graph-tooltip-5", "bbox"),
Output("graph-tooltip-5", "children"),
Input("graph-5", "hoverData"))
def display_hover(hoverData):
if hoverData is None:
return False, no_update, no_update
print(hoverData)
hover_data = hoverData["points"][0]
bbox = hover_data["bbox"]
num = hover_data["pointNumber"]
children = [html.Div([html.Img(style={"height": "50px",
"width": "50px",
"display": "block",
"margin": "0 auto"}),
html.P("Feat1: {}".format(str(df.loc[num]["feat1"]))),
html.P("Feat2: {}".format(str(df.loc[num]["feat2"])))])]
return True, bbox, children
if __name__ == "__main__":
app.run_server(mode="inline", debug=True)
The problem can be observed for example with the following two instances retrieved via print(df):
index feat1 feat2 label color
31 5.4 3.4 0 setosa
131 7.9 3.8 2 virginica
Both are assigned the same pointNumber and pointIndex retrieved via print(HoverData):
{'points': [{'curveNumber': 2, 'pointNumber': 31, 'pointIndex': 31,
'x': 7.9, 'y': 3.8, 'bbox': {'x0': 1235.5, 'x1': 1241.5, 'y0': 152.13,
'y1': 158.13}}]}
{'points': [{'curveNumber': 0, 'pointNumber': 31,
'pointIndex': 31, 'x': 5.4, 'y': 3.4, 'bbox': {'x0': 481.33, 'x1':
487.33, 'y0': 197.38, 'y1': 203.38}}]}
This is the visualization when hovering over the two instances. The hovering information is wrong for the image on the right side.
Interestingly, the issue resolves when using
fig = px.scatter(df, x="feat1", y="feat2", color="label")
However, this will cause the legend to be displayed in a continuous manner and disable the possibility to selectively visualize instances associated to specific classes in the HTML.
Is this a bug or am I overlooking something?
Any help is much appreciated!
It turned out that I incorrectly expected pointNumber and pointIndex to be unique. The point numbers and indices are renumbered for each class as soon as a non-numeric column is used as color parameter in px.scatter(). Points in the scatterplot can be uniquely identified by combining curveNumber and one of pointNumber and pointIndex.
A potential solution is to generate separate indices for each class and add them to the dataframe:
curve_indices = np.array([np.arange(0, num_samples) for num_samples in np.unique(class_annot, return_counts=True)[1]], dtype="object")
curve_indices = np.concatenate(curve_indices).ravel()
df["curve_index"] = curve_indices
In the callback function the correct indices in the dataframe for each instance can then be identified using
df_index = df[(df.label == curve) & (df.curve_index == num)].index[0]

My Bar Plot is not showing bars for all the data values

I have a DataFrame that contains two features namely LotFrontage and LotArea.
I want to plot a bar graph to show the relation between them.
My code is:
import matplotlib.pyplot as plt
visual_df=pd.DataFrame()
visual_df['area']=df_encoded['LotArea']
visual_df['frontage']=df_encoded['LotFrontage']
visual_df.dropna(inplace=True)
plt.figure(figsize=(15,10))
plt.bar(visual_df['area'],visual_df['frontage'])
plt.show()
The column LotFrontage is in Float datatype.
What is wrong with my code and How can I correct it?
To see a relationship between two features, a scatter plot is usually much more informative than a bar plot. To draw a scatter plot via matplotlib: plt.scatter(visual_df['area'], visual_df['frontage']). You can also invoke pandas scatter plot, which automatically adds axis labels: df.plot(kind='scatter', x='area', y='frontage').
For a lot of statistical purposes, seaborn can be handy. sns.regplot not only creates the scatter plot but automatically also tries to fit the data with a linear regression and shows a confidence interval.
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
area = [8450, 9600, 11250, 9550, 14260, 14115, 10084, 6120, 7420, 11200, 11924, 10652, 6120, 10791, 13695, 7560, 14215, 7449, 9742, 4224, 14230, 7200]
frontage = [65, 80, 68, 60, 84, 85, 75, 51, 50, 70, 85, 91, 51, 72, 68, 70, 101, 57, 75, 44, 110, 60]
df = pd.DataFrame({'area': area, 'frontage': frontage})
sns.regplot(x='area', y='frontage', data=df)
plt.show()
PS: The main problem with the intended bar plot is that the x-values lie very far apart. Moreover, the default width is one and very narrow bars can get too narrow to see in the plot. Adding an explicit edge color can make them visible:
plt.bar(visual_df['area'], visual_df['frontage'], ec='blue')
You could set a larger width, but then some bars would start to overlap.
Alternatively, pandas barplot would treat the x-axis as categorical, showing all x-values next to each other, as if they were strings. The bars are drawn in the order of the dataframe, so you might want to sort first:
df.sort_values('area').plot(kind='bar', x='area', y='frontage')
plt.tight_layout()

grouped bar chart in pyplot not rendering

Simple problem I'm sure, but after doing tons of hunting around I can't find any reason this shouldn't be working. While trying to plot a grouped bar chart of 2010 and 2014 population data side by side, only 2014 is visible and the side for 2010 is blank. What is my mistake here?
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import numpy as np
myFrame = pd.read_csv('top12cities.csv', sep = ',', engine = 'python')
fig = plt.figure(figsize=(10,6))
ind = np.arange(len(myFrame['City']))
width = .35
p1 = plt.bar(ind, myFrame['2014 estimate'], width)
p2 = plt.bar(ind + .35, myFrame['2010 Census'], width, color='y')
plt.legend([p1[0],p2[0]], ['2014', '2010'], loc = 'best')
plt.xticks(ind + width /2,myFrame['City'],rotation = 90)
plt.xlabel("Name")
plt.ylabel('population')
plt.savefig('popbar.png')
plt.show()
Your code ran fine on my machine with a test csv with the schema
City,2010 Census,2014 estimate
New York, 34, 123
L.A., 14, 89
San Fran, 14, 30
It may be worth trying to check if your csv file has issues of some kind, like non-numeric entries in the problematic column. I got this:

Can't make dates appear on x-axis in pyplot

So I've been trying to plot some data. I have got the data to fetch from a database and placed it all correctly into the variable text_. This is the snippet of the code:
import sqlite3
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from dateutil.parser import parse
fig, ax = plt.subplots()
# Twin the x-axis twice to make independent y-axes.
axes = [ax, ax.twinx(), ax.twinx()]
# Make some space on the right side for the extra y-axis.
fig.subplots_adjust(right=0.75)
# Move the last y-axis spine over to the right by 20% of the width of the axes
axes[-1].spines['right'].set_position(('axes', 1.2))
# To make the border of the right-most axis visible, we need to turn the frame on. This hides the other plots, however, so we need to turn its fill off.
axes[-1].set_frame_on(True)
axes[-1].patch.set_visible(False)
# And finally we get to plot things...
text_ = [('01/08/2017', 6.5, 143, 88, 60.2, 3), ('02/08/2017', 7.0, 146, 90, 60.2, 4),
('03/08/2017', 6.7, 142, 85, 60.2, 5), ('04/08/2017', 6.9, 144, 86, 60.1, 6),
('05/08/2017', 6.8, 144, 88, 60.2, 7), ('06/08/2017', 6.7, 147, 89, 60.2, 8)]
colors = ('Green', 'Red', 'Blue')
label = ('Blood Sugar Level (mmol/L)', 'Systolic Blood Pressure (mm Hg)', 'Diastolic Blood Pressure (mm Hg)')
y_axisG = [text_[0][1], text_[1][1], text_[2][1], text_[3][1], text_[4][1], text_[5][1]] #Glucose data
y_axisS = [text_[0][2], text_[1][2], text_[2][2], text_[3][2], text_[4][2], text_[5][2]] # Systolic Blood Pressure data
y_axisD = [text_[0][3], text_[1][3], text_[2][3], text_[3][3], text_[4][3], text_[5][3]] # Diastolic Blood Pressure data
AllyData = [y_axisG, y_axisS, y_axisD] #list of the lists of data
dates = [text_[0][0], text_[1][0], text_[2][0], text_[3][0], text_[4][0], text_[5][0]] # the dates as strings
x_axis = [(parse(x, dayfirst=True)) for x in dates] #converting the dates to datetime format for the graph
Blimits = [5.5, 130, 70] #lower limits of the axis
Tlimits = [8, 160, 100] #upper limits of the axis
for ax, color, label, AllyData, Blimits, Tlimits in zip(axes, colors, label, AllyData, Blimits, Tlimits):
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y')) #format's the date
plt.gca().xaxis.set_major_locator(mdates.DayLocator())
data = AllyData
ax.plot(data, color=color) #plots all the y-axis'
ax.set_ylim([Blimits, Tlimits]) #limits
ax.set_ylabel(label, color=color) #y-axis labels
ax.tick_params(axis='y', colors=color)
axes[0].set_xlabel('Date', labelpad=20)
plt.gca().set_title("Last 6 Month's Readings",weight='bold',fontsize=15)
plt.show()
The code currently makes this graph:
Graph with no x-values
I understand the problem is probably in the ax.plot part but I'm not sure what exactly. I tried putting that line of code as ax.plot(data, x_axis, color=color); however, this made the whole graph all messed up and the dates didn't show up on the x-axis like I wanted them to.
Is there something I've missed?
If this has been answered elsewhere, please can you show me how to implement that into my code by editing my code?
Thanks a ton
Apparently x_axis is never actually used in the code. Instead of
ax.plot(data, color=color)
which plots the data against its indices, you would want to plot the data against the dates stored in x_axis.
ax.plot(x_axis, data, color=color)
Finally, adding plt.gcf().autofmt_xdate() just before plt.show will rotate the dates nicely, such that they don't overlap.

Matplotlib Color y-tick Labels via Loop

Given the following data frame:
import pandas as pd
import numpy as np
df=pd.DataFrame({'A':['A','B','C','D','E','F','G','H','I','J','K','L','M','N'],
'B':[20,25,39,43,32,17,40, 40, 34, 56, 76, 23, 54, 34]})
I'd like to create a bubble chart where each y-tick label is the same color as its respective dot. The code below works great if I only had say 4 rows of data and 4 colors in my color list. However, for some reason, when I have more than 9 or so rows of data (and colors in my color list), it only takes the first 9 elements of colors in the l.set_color(i) line. Any thoughts as to why this occurs? Is it a limitation of zip when iterating? Related to the data frame?
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
labels=df.A[::-1]
vals=df.B[::-1]
ind=np.arange(len(labels))
colors1=['r','g','b','c','y','y','y','g','b','c','y','y','y','g']
fig, ax = plt.subplots(1, 1, figsize = (6,4))
for i in ind:
plt.plot(vals[i],i,marker='o',markeredgecolor='none', markersize=17, alpha=.5, linestyle='none', color=colors1[i])
ax.tick_params(axis='x',which='both',bottom='on',top='off',color='grey',labelcolor='grey')
ax.tick_params(axis='y',which='both',left='off',right='off',color='grey',labelcolor='grey')
ax.spines['top'].set_visible(False);ax.spines['right'].set_visible(False);
ax.spines['bottom'].set_visible(False);ax.spines['left'].set_visible(False)
ax.set_xlim([0,50])
ax.set_ylim([min(ind)-1,max(ind)+1])
fontcols=colors1[::-1]
for l,i in zip(ax.yaxis.get_ticklabels(),fontcols):
l.set_color(i)
l.set_fontsize(11)
print(l,i) #This shows that only 9 members are being colored for some reason
plt.yticks(ind,labels,fontsize=14)
plt.show()
Thanks in advance!
You just need to set the yticks before you try and set the colours. As it is, matplotlib creates 9 ticks by default, you set their colours, then you tell it you want 14 ticks after. With just a little reordering, it all works:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import numpy as np
df=pd.DataFrame({'A':['A','B','C','D','E','F','G','H','I','J','K','L','M','N'],
'B':[20,25,39,43,32,17,40, 40, 34, 56, 76, 23, 54, 34]})
labels=df.A[::-1]
vals=df.B[::-1]
ind=np.arange(len(labels))
colors1=['r','g','b','c','y','y','y','g','b','c','y','y','y','g']
fig, ax = plt.subplots(1, 1, figsize = (6,4))
for i in ind:
plt.plot(vals[i],i,marker='o',markeredgecolor='none', markersize=17, alpha=.5, linestyle='none', color=colors1[i])
ax.tick_params(axis='x',which='both',bottom='on',top='off',color='grey',labelcolor='grey')
ax.tick_params(axis='y',which='both',left='off',right='off',color='grey',labelcolor='grey')
ax.spines['top'].set_visible(False);ax.spines['right'].set_visible(False);
ax.spines['bottom'].set_visible(False);ax.spines['left'].set_visible(False)
ax.set_xlim([0,80]) # I increased this to fit all your data in
ax.set_ylim([min(ind)-1,max(ind)+1])
fontcols=colors1 # NOTE: you don't need to reverse this
plt.yticks(ind,labels,fontsize=14)
for l,i in zip(ax.yaxis.get_ticklabels(),fontcols):
l.set_color(i)
l.set_fontsize(11)
print(l,i)
plt.show()
Also note, you don't need to reverse the colour list before setting the tick colours

Resources