Matplotlib X-ticks not distributing correctly - python-3.x

I have a plot as shown below that has over 1000 x-axis points. I'm trying to scale the x-axis into 3 values, the min, mid & max value instead of having 1000 labels.
Despite my efforts denoted within the hashtags, all 3 values are written onto the same tick (on top of each other) or simply only 1 tick is randomly placed along the x-axis.
import matplotlib.pyplot as plt
from matlplotlib.pyplot import figure
figure (num = None, figsize=(20,10), dpi=80, facecolor='w', edgecolor='k')
ax =plt.gca()
data.plot(kind='bar', x='colA', y='colB', ax=ax)
######
plt.xticks(np.arrange(0,3, step 1)
**ALSO TRIED**
plt.xticks = ([1,2,3], ["a","b","c"])
######
plt.show()
How can I distribute the min,mid and max value evenly across the X-axis?

If 'colA' is numerical:
x_min = min(data['colA'])
x_max = max(data['colA'])
x_mid = (x_min + x_max) / 2
# use regular division if the numbers are floats, use integer division in case all numbers are integers
plt.xticks([x_min, x_mid, x_max], ["a","b","c"])
# plt.xticks([x_min, x_mid, x_max]) # leave out the labels if the default labels are OK
If, on the contrary, 'colA' is categorical (so, some strings), they are numbered internally as 0, 1, 2, ... up till the number of strings minus one:
x_min = 0
x_max = len(data['colA']) - 1
x_mid = x_max // 2 # integer division
plt.xticks([x_min, x_mid, x_max])

You could try:
min_ = min(data['colA'])
max_ = max(data['colA'])
mid_ = (min_ + max_) / 2.
ax.set_xticks([min_, mid_, max_])
ax.set_xticklabels(['min', 'mid', 'max'])

Related

Bokeh colorbar, assign a tick to each color

I'm trying to plot an heatmap of a matrix containing some counts (called mat in my code, then df after change the structure to use it with Bokeh). The structure is like this:
X
element 1
element 2
element 3
category 1
0
6
4
category 2
1
7
3
category 3
5
2
10
category 4
0
1
4
Now with my code I'm using df.value.unique() both for the color mapper and the ticks, but in the heatmap the colorbar's ticks doesn't correspond to the colors:
How can I make the ticks coincide each one to one color? I'm quite sure I have to use the CategoricalColorMapper but with that I get only a white screen. Thank you for the help.
Here's my code:
mat = pd.read_csv("tests/count_50.dat", sep="\t", index_col=0)
mat.index.name = 'MGI_id'
mat.columns.name = 'phen_sys'
#set data as float numbers
mat=mat.astype(float)
#Create a custom palette and add a specific mapper to map color with values
df = mat.stack(dropna=False).rename("value").reset_index()
pal=bokeh.palettes.brewer['YlGnBu'][len(df.value.unique())]
mapper = LinearColorMapper(palette=pal, low=df.value.min(), high=df.value.max(), nan_color = 'gray')
#Define a figure
p = figure(
plot_width=1280,
plot_height=800,
title="Heatmap",
x_range=list(df.MGI_id.drop_duplicates()),
y_range=list(df.phen_sys.drop_duplicates()[::-1]),
tooltips=[('Phenotype system','#phen_sys'),('Gene','#MGI_id'),('Phenotypes','#value')],
x_axis_location="above",
output_backend="webgl")
#Create rectangles for heatmap
p.rect(
x="MGI_id",
y="phen_sys",
width=1,
height=1,
source=ColumnDataSource(df),
fill_color=transform('value', mapper))
p.xaxis.major_label_orientation = 45
#Add legend
t = df.value.unique()
t.sort()
color_bar = ColorBar(
color_mapper=mapper,
ticker=FixedTicker(ticks=t, desired_num_ticks=len(df.value.unique())),
label_standoff=6,
border_line_color=None)
p.add_layout(color_bar, 'right')
show(p)
I found a solution:
I create a factor list by ordering the values and then converting both the dataframe values and the factors. At that point I created a CategoricalColorMapper instead of the linear one and the plot now is correct:
Your list of values goes from 0 to 10, so ColorBar will go up to 10. You can change mapper 'high' value to '9':
mapper = LinearColorMapper(palette=colors, low=0, high=9, nan_color = 'gray')
Or a ColorBar that goes from 1 to 10:
mapper = LinearColorMapper(palette=colors, low=1, high=10, nan_color = 'gray')

How to remove repeating and empty or unmarked values on subplot of x-axis

I'm developing a set of graphs to paint some Pandas DataFrame values. For that I'm using various pandas, numpy and matplotlib modules and functions using the following code:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
data = {'Name': ['immoControlCmd', 'BrkTerrMde', 'GlblClkYr', 'HsaStat', 'TesterPhysicalResGWM', 'FapLc','FirstRowBuckleDriver', 'GlblClkDay'],
'Value': [0, 5, 0, 4, 0, 1, 1, 1],
'Id_Par': [0, 0, 3, 3, 3, 3, 0, 0]
}
signals_df = pd.DataFrame(data)
def plot_signals(signals_df):
# Count signals by par
signals_df['Count'] = signals_df.groupby('Id_Par').cumcount().add(1).mask(signals_df['Id_Par'].eq(0), 0)
# Subtract Par values from the index column
signals_df['Sub'] = signals_df.index - signals_df['Count']
id_par_prev = signals_df['Id_Par'].unique()
id_par = np.delete(id_par_prev, 0)
signals_df['Prev'] = [1 if x in id_par else 0 for x in signals_df['Id_Par']]
signals_df['Final'] = signals_df['Prev'] + signals_df['Sub']
# signals_df['Finall'] = signals_df['Final'].unique()
# print(signals_df['Finall'])
# Convert and set Subtract to index
signals_df.set_index('Final', inplace=True)
# pos_x = len(signals_df.index.unique()) - 1
# print(pos_x)
# Get individual names and variables for the chart
names_list = [name for name in signals_df['Name'].unique()]
num_names_list = len(names_list)
num_axis_x = len(signals_df["Name"])
# Creation Graphics
fig, ax = plt.subplots(nrows=num_names_list, figsize=(10, 10), sharex=True)
plt.xticks(np.arange(0, num_axis_x), color='SteelBlue', fontweight='bold')
for pos, (a_, name) in enumerate(zip(ax, names_list)):
# Get data
data = signals_df[signals_df["Name"] == name]["Value"]
# Get values axis-x and axis-y
x_ = np.hstack([-1, data.index.values, len(signals_df) - 1])
# print(data.index.values)
y_ = np.hstack([0, data.values, data.iloc[-1]])
# Plotting the data by position
ax[pos].plot(x_, y_, drawstyle='steps-post', marker='*', markersize=8, color='k', linewidth=2)
ax[pos].set_ylabel(name, fontsize=8, fontweight='bold', color='SteelBlue', rotation=30, labelpad=35)
ax[pos].yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f'))
ax[pos].yaxis.set_tick_params(labelsize=6)
ax[pos].grid(alpha=0.4, color='SteelBlue')
plt.show()
plot_signals(signals_df)
What I want is to remove the points or positions of the x-axis where nothing is painted or they are not marked on the graph, but leave the values ​​and names as in the image at the end; Seen from Pandas it would be the "Final" column that, before painting the subplots, assigned it as an index and it is where some of the values ​​in this column are repeated; would be to remove the values ​​enclosed in the red box from the graph, but leave the values ​​and names as in the image at the end:
Name Value Id_Par Count Sub Prev
Final
0 immoControlCmd 0 0 0 0 0
1 BrkTerrMde 5 0 0 1 0
2 GlblClkYr 0 3 1 1 1
2 HsaStat 4 3 2 1 1
2 TesterPhysicalResGWM 0 3 3 1 1
2 FapLc 1 3 4 1 1
6 FirstRowBuckleDriver 1 0 0 6 0
7 GlblClkDay 1 0 0 7 0
I've been trying to bring the unique values ​​of the last column, which would be the value that the x-axis should be, but since the dataframe is of another size or dimension, I get an error: ValueError: Length of values ​​(5) does not match length of index (8), and then I have to resize my chart, but in this case I don't understand how to do it:
signals_df['Final'] = signals_df['Prev'] + signals_df['Sub']
signals_df['Finall'] = signals_df['Final'].unique()
print(signals_df['Finall'])
I've also tried to bring the size of the unique index, previously assigned to apply a subtraction to data.index.values ​​of the variable x_, but it does not bring me what I want because it is gathering all the values ​​and subtracting them in bulk and not separately , as is data.index.values:
signals_df.set_index('Final', inplace=True)
pos_x = len(signals_df.index.unique()) - 1
...
..
.
x_ = np.hstack([-1, data.index.values-pos-x, len(signals_df) - 1])
Is there a Pandas and/or Matplotlib function that allows me? or Could someone give me a suggestion that will help me better understand how to do it? what I expect to achieve would be the plot below:
I really appreciate your help, any comments help.
I've Python version: 3.6.5, Pandas version: 1.1.5 and Matplotlib version: 3.3.2
One possible way to do this is if you make your x-axis values into strings, which means that matplotlib will make a "categorical" plot. See examples of that here.
For your case, because you have subplots which would have different values, and they are not always in the right order, we need to do a bit of trickery first to make sure the ticks appear in the correct order. For that, we can use the approach from this answer, where they plot something that uses all of the x values in the correct order, and then remove it.
To gather all the xtick values together, you can do something like this, where you create a list of the values, reduce it to the unique values using a set, then sort those values, and convert to strings using a list comprehension and str():
# First make a list of all the xticks we want
xvals = [-1,]
for name in names_list:
xvals.append(signals_df[signals_df["Name"] == name]["Value"].index.values[0])
xvals.append(len(signals_df)-1)
# Reduce to only unique values, sorted, and then convert to strings
xvals = [str(i) for i in sorted(set(xvals))]
Once you have those, you can make a dummy plot, and then remove it, like so (this is to fix the tick positions in the correct order). NOTE that this needs to be inside your plotting loop for matplotlib versions 3.3.4 and earlier:
# To get the ticks in the right order on all subplots, we need to make
# a dummy plot here and then remove it
dummy, = ax[0].plot(xvals, np.zeros_like(xvals))
dummy.remove()
Finally, when you actually plot the real data inside the loop, you just need to convert x_ to strings as you plot them:
ax[pos].plot(x_.astype('str'), y_, drawstyle='steps-post', marker='*', markersize=8, color='k', linewidth=2)
Note the only other change I made was to not explicitly set the xtick positions (which you did, with plt.xticks), but you can still use that command to set the font colour and weight
plt.xticks(color='SteelBlue', fontweight='bold')
And this is the output:
For completeness, here I have put it all together in your script:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import matplotlib
print(matplotlib.__version__)
data = {'Name': ['immoControlCmd', 'BrkTerrMde', 'GlblClkYr', 'HsaStat', 'TesterPhysicalResGWM', 'FapLc',
'FirstRowBuckleDriver', 'GlblClkDay'],
'Value': [0, 5, 0, 4, 0, 1, 1, 1],
'Id_Par': [0, 0, 3, 3, 3, 3, 0, 0]
}
signals_df = pd.DataFrame(data)
def plot_signals(signals_df):
# Count signals by par
signals_df['Count'] = signals_df.groupby('Id_Par').cumcount().add(1).mask(signals_df['Id_Par'].eq(0), 0)
# Subtract Par values from the index column
signals_df['Sub'] = signals_df.index - signals_df['Count']
id_par_prev = signals_df['Id_Par'].unique()
id_par = np.delete(id_par_prev, 0)
signals_df['Prev'] = [1 if x in id_par else 0 for x in signals_df['Id_Par']]
signals_df['Final'] = signals_df['Prev'] + signals_df['Sub']
# signals_df['Finall'] = signals_df['Final'].unique()
# print(signals_df['Finall'])
# Convert and set Subtract to index
signals_df.set_index('Final', inplace=True)
# pos_x = len(signals_df.index.unique()) - 1
# print(pos_x)
# Get individual names and variables for the chart
names_list = [name for name in signals_df['Name'].unique()]
num_names_list = len(names_list)
num_axis_x = len(signals_df["Name"])
# Creation Graphics
fig, ax = plt.subplots(nrows=num_names_list, figsize=(10, 10), sharex=True)
# No longer any need to define where the ticks go, but still set the colour and weight here
plt.xticks(color='SteelBlue', fontweight='bold')
# First make a list of all the xticks we want
xvals = [-1, ]
for name in names_list:
xvals.append(signals_df[signals_df["Name"] == name]["Value"].index.values[0])
xvals.append(len(signals_df) - 1)
# Reduce to only unique values, sorted, and then convert to strings
xvals = [str(i) for i in sorted(set(xvals))]
for pos, (a_, name) in enumerate(zip(ax, names_list)):
# To get the ticks in the right order on all subplots,
# we need to make a dummy plot here and then remove it
dummy, = ax[pos].plot(xvals, np.zeros_like(xvals))
dummy.remove()
# Get data
data = signals_df[signals_df["Name"] == name]["Value"]
# Get values axis-x and axis-y
x_ = np.hstack([-1, data.index.values, len(signals_df) - 1])
y_ = np.hstack([0, data.values, data.iloc[-1]])
# Plotting the data by position
# NOTE: here we convert x_ to strings as we plot, to make sure they are plotted as catagorical values
ax[pos].plot(x_.astype('str'), y_, drawstyle='steps-post', marker='*', markersize=8, color='k', linewidth=2)
ax[pos].set_ylabel(name, fontsize=8, fontweight='bold', color='SteelBlue', rotation=30, labelpad=35)
ax[pos].yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f'))
ax[pos].yaxis.set_tick_params(labelsize=6)
ax[pos].grid(alpha=0.4, color='SteelBlue')
plt.show()
plot_signals(signals_df)

Sort simmilarity matrix according to plot colors

I have this similarity matrix plot of some documents. I want to sort the values of the matrix, which is a numpynd array, to group colors, while maintaining their relative position (diagonal yellow line), and labels as well.
path = "C:\\Users\\user\\Desktop\\texts\\dataset"
text_files = os.listdir(path)
#print (text_files)
tfidf_vectorizer = TfidfVectorizer()
documents = [open(f, encoding="utf-8").read() for f in text_files if f.endswith('.txt')]
sparse_matrix = tfidf_vectorizer.fit_transform(documents)
labels = []
for f in text_files:
if f.endswith('.txt'):
labels.append(f)
pairwise_similarity = sparse_matrix * sparse_matrix.T
pairwise_similarity_array = pairwise_similarity.toarray()
fig, ax = plt.subplots(figsize=(20,20))
cax = ax.matshow(pairwise_similarity_array, interpolation='spline16')
ax.grid(True)
plt.title('News articles similarity matrix')
plt.xticks(range(23), labels, rotation=90);
plt.yticks(range(23), labels);
fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
plt.show()
Here is one possibility.
The idea is to use the information in the similarity matrix and put elements next to each other if they are similar. If two items are similar they should also be similar with respect to other elements ie have similar colors.
I start with the element which has the most in common with all other elements (this choice is a bit arbitrary) [a] and as next element I choose from the remaining elements the one which is closest to the current [b].
import numpy as np
import matplotlib.pyplot as plt
def create_dummy_sim_mat(n):
sm = np.random.random((n, n))
sm = (sm + sm.T) / 2
sm[range(n), range(n)] = 1
return sm
def argsort_sim_mat(sm):
idx = [np.argmax(np.sum(sm, axis=1))] # a
for i in range(1, len(sm)):
sm_i = sm[idx[-1]].copy()
sm_i[idx] = -1
idx.append(np.argmax(sm_i)) # b
return np.array(idx)
n = 10
sim_mat = create_dummy_sim_mat(n=n)
idx = argsort_sim_mat(sim_mat)
sim_mat2 = sim_mat[idx, :][:, idx] # apply reordering for rows and columns
# Plot results
fig, ax = plt.subplots(1, 2)
ax[0].imshow(sim_mat)
ax[1].imshow(sim_mat2)
def ticks(_ax, ti, la):
_ax.set_xticks(ti)
_ax.set_yticks(ti)
_ax.set_xticklabels(la)
_ax.set_yticklabels(la)
ticks(_ax=ax[0], ti=range(n), la=range(n))
ticks(_ax=ax[1], ti=range(n), la=idx)
After meTchaikovsky's answer I also tested my idea on a clustered similarity matrix (see first image) this method works but is not perfect (see second image).
Because I use the similarity between two elements as approximation to their similarity to all other elements, it is quite clear why this does not work perfectly.
So instead of using the initial similarity to sort the elements one could calculate a second order similarity matrix which measures how similar the similarities are (sorry).
This measure describes better what you are interested in. If two rows / columns have similar colors they should be close to each other. The algorithm to sort the matrix is the same as before
def add_cluster(sm, c=3):
idx_cluster = np.array_split(np.random.permutation(np.arange(len(sm))), c)
for ic in idx_cluster:
cluster_noise = np.random.uniform(0.9, 1.0, (len(ic),)*2)
sm[ic[np.newaxis, :], ic[:, np.newaxis]] = cluster_noise
def get_sim_mat2(sm):
return 1 / (np.linalg.norm(sm[:, np.newaxis] - sm[np.newaxis], axis=-1) + 1/n)
sim_mat = create_dummy_sim_mat(n=100)
add_cluster(sim_mat, c=4)
sim_mat2 = get_sim_mat2(sim_mat)
idx = argsort_sim_mat(sim_mat)
idx2 = argsort_sim_mat(sim_mat2)
sim_mat_sorted = sim_mat[idx, :][:, idx]
sim_mat_sorted2 = sim_mat[idx2, :][:, idx2]
# Plot results
fig, ax = plt.subplots(1, 3)
ax[0].imshow(sim_mat)
ax[1].imshow(sim_mat_sorted)
ax[2].imshow(sim_mat_sorted2)
The results with this second method are quite good (see third image)
but I guess there exist cases where this approach also fails, so I would be happy about feedback.
Edit
I tried to explain it and did also link the ideas to the code with [a] and [b], but obviously I did not do a good job, so here is a second more verbose explanation.
You have n elements and a n x n similarity matrix sm where each cell (i, j) describes how similar element i is to element j. The goal is to order the rows / columns in such a way that one can see existing patterns in the similarity matrix. My idea to achieve this is really simple.
You start with an empty list and add elements one by one. The criterion for the next element is the similarity to the current element. If element i was added in the last step, I chose the element argmax(sm[i, :]) as next, ignoring the elements already added to the list. I ignore the elements by setting the values of those elements to -1.
You can use the function ticks to reorder the labels:
labels = np.array(labels) # make labels an numpy array, to index it with a list
ticks(_ax=ax[0], ti=range(n), la=labels[idx])
#scleronomic's solution is very elegant, but it also has one shortage, which is we cannot set the number of clusters in the sorted correlation matrix. Assume we are working with a set of variables, in which some of them are weakly correlated
import string
import numpy as np
import pandas as pd
n_variables = 20
n_clusters = 10
n_samples = 100
np.random.seed(100)
names = list(string.ascii_lowercase)[:n_variables]
belongs_to_cluster = np.random.randint(0,n_clusters,n_variables)
latent = np.random.randn(n_clusters,n_samples)
variables = np.random.rand(n_variables,n_samples)
for ind in range(n_clusters):
mask = belongs_to_cluster == ind
# weakening the correlation
if ind % 2 == 0:variables[mask] += latent[ind]*0.1
variables[mask] += latent[ind]
df = pd.DataFrame({key:val for key,val in zip(names,variables)})
corr_mat = np.array(df.corr())
As you can see, there are 10 clusters of variables by construction, however, variables within clusters that has an even index are weakly correlated. If we only want to see roughly 5 clusters in the sorted correlation matrix, maybe we need to find another way.
Based on this post, which is the accepted answer to the question "Clustering a correlation matrix", to sort a correlation matrix into blocks, what we need to find are blocks, where correlations within blocks are high and correlations between blocks are low. However, the solution provided by this accepted answer works best when we know how many blocks are there in the first place, and more importantly, the sizes of the underlying blocks are the same, or at least similar. Therefore, I improved the solution with a new function sort_corr_mat
def sort_corr_mat(corr_mat,clusters_guess):
def _swap_rows(corr_mat, var1, var2):
rs = corr_mat.copy()
rs[var2, :],rs[var1, :]= corr_mat[var1, :],corr_mat[var2, :]
cs = rs.copy()
cs[:, var2],cs[:, var1] = rs[:, var1],rs[:, var2]
return cs
# analysis
max_iter = 500
best_score,current_score,best_count = -1e8,-1e8,0
num_minimua_to_visit = 20
best_corr = corr_mat
best_ordering = np.arange(n_variables)
for i in range(max_iter):
for row1 in range(n_variables):
for row2 in range(n_variables):
if row1 == row2: continue
option_ordering = best_ordering.copy()
option_ordering[row1],option_ordering[row2] = best_ordering[row2],best_ordering[row1]
option_corr = _swap_rows(best_corr,row1,row2)
option_score = score(option_corr,n_variables,clusters_guess)
if option_score > best_score:
best_corr = option_corr
best_ordering = option_ordering
best_score = option_score
if best_score > current_score:
best_count += 1
current_corr = best_corr
current_ordering = best_ordering
current_score = best_score
if best_count >= num_minimua_to_visit:
return best_corr#,best_ordering
return best_corr#,best_ordering
With this function and the corr_mat constructed in the first place, I compared the result obtained with my function (on the right) with that obtained with #scleronomic's solution (in the middle)
sim_mat_sorted = corr_mat[argsort_sim_mat(corr_mat), :][:, argsort_sim_mat(corr_mat)]
corr_mat_sorted = sort_corr_mat(corr_mat,clusters_guess=5)
# Plot results
fig, ax = plt.subplots(1,3,figsize=(18,6))
ax[0].imshow(corr_mat)
ax[1].imshow(sim_mat_sorted)
ax[2].imshow(corr_mat_sorted)
Clearly, #scleronomic's solution works much better and faster, but my solution offers more control to the pattern of the output.

Adding minor tick marks to a histogram

I am working through this:
https://medium.com/diogo-menezes-borges/introduction-to-statistics-for-data-science-6c246ed2468d
About 3/4 of the way through there is a histogram, but the author does not supply the code used to generate it.
So I decided to give it a go...
I have everything working, but I would like to add minor ticks to my plot.
X-axis only, spaced 200 units apart (matching the bin width used in my code).
In particular, I would like to add minor ticks in the style from the last example from here:
https://matplotlib.org/3.1.0/gallery/ticks_and_spines/major_minor_demo.html
I have tried several times but I just can't get that exact 'style' to work on my plot.
Here is my working code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
print('NumPy: {}'.format(np.__version__))
print('Pandas: {}'.format(pd.__version__))
print('\033[1;31m' + '--------------' + '\033[0m') # Bold red
display_settings = {
'max_columns': 15,
'max_colwidth': 60,
'expand_frame_repr': False, # Wrap to multiple pages
'max_rows': 50,
'precision': 6,
'show_dimensions': False
}
# pd.options.display.float_format = '{:,.2f}'.format
for op, value in display_settings.items():
pd.set_option("display.{}".format(op), value)
file = "e:\\python\\pandas\\medium\\sets.csv"
lego = pd.read_csv(file, encoding="utf-8")
print(lego.shape, '\n')
print(lego.info(), '\n')
print(lego.head(), '\n')
print(lego.isnull().sum(), '\n')
dfs = [lego]
names = ['lego']
def NaN_percent(_df, column_name):
# empty_values = row_count - _df[column_name].count()
empty_values = _df[column_name].isnull().sum()
return (100.0 * empty_values)/row_count
c = 0
print('Columns with missing values expressed as a percentage.')
for df in dfs:
print('\033[1;31m' + ' ' + names[c] + '\033[0m')
row_count = df.shape[0]
for i in list(df):
x = NaN_percent(df, i)
if x > 0:
print(' ' + i + ': ' + str(x.round(4)) + '%')
c += 1
print()
# What is the average number of parts in the sets of legos?
print(lego['num_parts'].mean(), '\n')
# What is the median number of parts in the sets of legos?
print(lego['num_parts'].median(), '\n')
print(lego['num_parts'].max(), '\n')
# Create Bins for Data Ranges
bins = []
for i in range(lego['num_parts'].min(), 6000, 200):
bins.append(i + 1)
# Use 'right' to determine which bin overlapping values fall into.
cuts = pd.cut(lego['num_parts'], bins=bins, right=False)
# Count values in each bin.
print(cuts.value_counts(), '\n')
plt.hist(lego['num_parts'], color='red', edgecolor='black', bins=bins)
plt.title('Histogram of Number of parts')
plt.xlabel('Bin')
plt.ylabel('Number of values per bin')
plt.axvline(x=162.2624, color='blue')
plt.axvline(x=45.0, color='green', linestyle='--')
# https://matplotlib.org/gallery/text_labels_and_annotations/custom_legends.html
legend_elements = [Line2D([0], [0], color='blue', linewidth=2, linestyle='-'),
Line2D([0], [1], color='green', linewidth=2, linestyle='--')
]
labels = ['mean: 162.2624', 'median: 45.0']
plt.legend(legend_elements, labels)
plt.show()
You can just add:
ax = plt.gca()
ax.xaxis.set_minor_locator(AutoMinorLocator())
ax.tick_params(which='minor', length=4, color='r')
See this post to get a better idea about the difference between plt, ax and fig. In broad terms, plt refers to the pyplot library of matplotlib. fig is one "plot" that can consist of one or more subplots. ax refers to one subplot and the x and y-axis defined for them, including the measuring units, tick marks, tick labels etc.. Many function in matplotlib are often called as plt.hist, but in the underlying code they are drawing on the "current axes". These axes can be obtained via plt.gca() or "get current axes". It is not always clear which functions can be called via plt. and which only exist via ax.. Also, sometimes the get slightly different names. You'll need to look in the documentation or search StackOverflow which form is needed in each specific case.

Data and X axis labels not align

Trying to plot X axis (Event) values on their respective x Axis. Y axis is relative to Time (of the day) when and how long the event lasted. The first label and data plotted are correct. However, the second set of data appears to skip over the major x axis tick and is placed afterwards but before the next major x axis tick. This is repeated for each additional x Axis value plotted. The data does not show a problem with which X axis it should appear on.
Defined the data (source) and can plot the issue with about 50 lines of code.
from bokeh.io import output_file
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show
from bokeh.models.formatters import NumeralTickFormatter
import pandas as pd
import math
output_file("events.html", mode="inline")
x1 = []
y1 = []
x2 = []
y2 = []
colorList = []
shortNames = []
nameAndId = ["Event1", 0]
x1.append(nameAndId)
y1.append(33470)
x2.append(nameAndId)
y2.append(33492)
colorList.append("red")
shortNames.append("Evt1")
nameAndId = ["Event2", 1]
x1.append(nameAndId)
y1.append(34116)
x2.append(nameAndId)
y2.append(34151)
colorList.append("green")
shortNames.append("Evt2")
xAxisLabels = ["Event1", "Event2"]
data = {"x1": x1, "y1": y1, "x2": x2, "y2": y2, "color": colorList,\
"shortName": shortNames}
eventDF = pd.DataFrame(data=data,
columns=("x1", "y1", "x2", "y2", "color",\
"shortName"))
source = ColumnDataSource(eventDF)
yRange = [34151, 33470]
p = figure(plot_width=700, plot_height=750, x_range=xAxisLabels,\
y_range=yRange, output_backend="webgl")
p.xaxis.major_label_orientation = math.pi / -2
p.segment(x0="x1",y0="y1",x1="x2",y1="y2", source=source, color="color"\
line_width=12)
p.yaxis[0].formatter = NumeralTickFormatter(format="00:00:00")
p.xaxis.axis_label = "Events"
labels = LabelSet(x="x2",y="y2", text="shortName", text_font_size="8pt"\
text_color="black", level="glyph", x_offset=-6,\
y_offset=-5, render_mode="canvas", angle=270,\
angle_units="deg", source=source)
p.add_layout(labels)
show(p)
I'm thinking this is something simple I've over-looked like a xAxis formatter. I've tried to define one but none seem to work for my use case. The data doesn't seem to be associated to the xAxisLabel. I Expect Event 1 to show on the first X axis tick with Event 2 on the second X axis tick. Event 1 is correct but for each event afterwards, every major X axis tick is skipped with the data residing between tick marks.
The issue in your code is that the actual value for the x-coordinate you are supplying is:
nameAndId = ["Event2", 1]
This kind of list with a category name and a number in a list is understood by Bokeh as a categorical offset. You are explicitly telling Bokeh to position the glyph a distance of 1 (in "synthetic" coordinates) away from the location of "Event2". The reason things "work" for the Event1 case is that the offset in that case is 0:
nameAndId = ["Event1", 0]
I'm not sure what you are trying to accomplish by passing these lists with the second numerical value, so I can't really offer any additional suggestion except to say that it should probably not be passed on to Bokeh.

Resources