Python: Plot histograms with customized bins - python-3.x

I am using matplotlib.pyplot to make a histogram. Due to the distribution of the data, I want to set up the bins manually. The details are as follows:
Any value = 0 in one bin;
Any value > 60 in the last bin;
Any value > 0 and <= 60 are in between the bins described above and the bin size is 5.
Could you please give me some help? Thank you.

I'm not sure what you mean by "the bin size is 5". You can either plot a histogram by specifying the bins with a sequence:
import matplotlib.pyplot as plt
# Sample values; substitute your own data.
data = [0, 0, 1, 2, 3, 4, 5, 6, 35, 60, 61, 82, -5]
# Explicit edges: one narrow bin starting at 0, one wide bin up to 60 and a
# final bin that reaches the largest value in the data.
bin_edges = [0, 0.5, 60, max(data)]
plt.hist(data, bins=bin_edges)
plt.show()
But the bin size will match the corresponding interval, meaning -in this example- that the "0-case" will be barely visible:
(Note that 60 is moved to the last bin when specifying bins as a sequence, changing the sequence to [0, 0.5, 59.5, max(data)] would fix that)
What you (probably) need is first to categorize your data and then plot a bar chart of the categories:
import matplotlib.pyplot as plt
import pandas as pd
# Sample values; substitute your own data.
data = [0, 0, 1, 2, 3, 4, 5, 6, 35, 60, 61, 82, -5]
# Single-column frame holding the raw values.
df = pd.DataFrame({'data': data})
def find_cat(x):
    """Return the category label for a single value.

    Returns ``"0"`` for exactly zero, ``"> 60"`` for values above 60 and
    ``"> 0 and <= 60"`` for everything in between.  Values that match none
    of these (negative numbers) get ``None``.
    """
    if x == 0:
        return "0"
    if x > 60:
        return "> 60"
    if x > 0:
        return "> 0 and <= 60"
    # Negative values belong to no category.
    return None
# Label every row, count the rows per label, then draw one bar per label.
df['category'] = df['data'].apply(find_cat)
category_counts = df.groupby('category', as_index=False).count()
category_counts.plot.bar(x='category', y='data', rot=0, width=0.8)
plt.show()
Output:

building off Tranbi's answer, you could specify the bin edges as detailed in the link they shared.
import matplotlib.pyplot as plt
import pandas as pd
data = [0, 0, 1, 2, 3, 4, 5, 6, 35, 60, 61, 82, -6]  # your data here
df = pd.DataFrame({'data': data})
# Edges every 5 units from -5 to 65: the first bin collects the zeros and the
# last one stands in for "everything above 60".
bin_edges = list(range(-5, 70, 5))
# Nudge every edge up slightly so that a value sitting exactly on an edge
# (0, 5, ..., 60) is counted in the bin below it rather than the bin above.
bin_edges_offset = [edge + 0.000001 for edge in bin_edges]
# hist() ignores values outside the outermost edges, so pull anything larger
# than the last edge (e.g. 82) back into the last bin.  Values below the
# first edge (e.g. -6) are still left out.
clipped = df['data'].clip(upper=bin_edges[-1])
plt.figure()
plt.hist(clipped, bins=bin_edges_offset)
plt.show()
histogram

IIUC you want a classic histogram for values between 0 (not included) and 60 (included), with two extra bins on the sides for 0 and >60.
In that case I would recommend plotting the 3 regions separately:
import matplotlib.pyplot as plt
data = [0, 0, 1, 2, 3, 4, 5, 6, 35, 60, 61, 82, -3]  # your data here

# Heights of the two side bars: exact zeros on the left, values above 60 on
# the right.
zero_count = data.count(0)
above_60_count = len([x for x in data if x > 60])

# One row of three panels sharing the y axis; the middle panel is 12 times
# wider than the side panels and the panels touch each other.
fig, axes = plt.subplots(1, 3, sharey=True, width_ratios=[1, 12, 1])
fig.subplots_adjust(wspace=0)
zero_ax, hist_ax, over_ax = axes

# Left panel: a single bar between -5 and 0 for the zeros.
zero_ax.bar(-5, zero_count, width=5, align='edge')
zero_ax.set_xlim((-5, 0))
zero_ax.xaxis.set_visible(False)
zero_ax.spines['right'].set_visible(False)

# Middle panel: histogram over (0, 60] in 12 bins; the small offset on the
# range keeps 0 out and 60 in.
hist_ax.hist(data, bins=12, range=(0.0001, 60.0001))
hist_ax.set_xlim((0, 60))
hist_ax.yaxis.set_visible(False)
for side in ('left', 'right'):
    hist_ax.spines[side].set_visible(False)

# Right panel: a single bar between 60 and 65 for the values above 60.
over_ax.bar(60, above_60_count, width=5, align='edge')
over_ax.set_xlim((60, 65))
over_ax.xaxis.set_visible(False)
over_ax.yaxis.set_visible(False)
over_ax.spines['left'].set_visible(False)

plt.show()
Output:
Edit: If you wanna plot probability density, I would edit the data and simply use hist:
import matplotlib.pyplot as plt
data = [0, 0, 1, 2, 3, 4, 5, 6, 35, 60, 61, 82, -3]  # your data here
# Drop negative values and collapse everything above 60 onto 61, so that all
# large values end up together in one histogram bin.
data2 = [61 if el > 60 else el for el in data if el >= 0]
# Fourteen equal-width bins over (-4.99, 65.01), normalised to a density.
hist_range = (-4.99, 65.01)
plt.hist(data2, bins=14, range=hist_range, density=True)
plt.show()

Related

Numpy choice based on another array

I want to select some elements from an array main_array, restricted to the indexes at which another array satisfies a condition (i.e. where a boolean mask is True). For example, y should contain [14,15,16] in arbitrary order:
import numpy as np
# Values to sample from.
main_array = np.array([11,12,13,14,15,16])
# Per-position selector; only positions whose selector value is > 2 should be eligible.
selector = np.array([0,1,2,3,3,3])
x = np.random.choice(main_array, 3, replace=False) # This works
# np.where(condition) with a single argument returns a tuple of index arrays,
# not a 1-D vector of probabilities, which is what `p` expects.
y = np.random.choice(main_array, 3, replace=False, p=np.where(selector>2)) # This fails
However, I get ValueError: 'p' must be 1-dimensional
What is the correct way to limit selection to indexes based on another array?
One way to do it is in two steps: first filter the array, then sample from the filtered result:
import numpy as np
main_array = np.array([11, 12, 13, 14, 15, 16])
selector = np.array([0, 1, 2, 3, 3, 3])

# Unrestricted draw of three distinct elements, for comparison.
x = np.random.choice(main_array, 3, replace=False)

# Keep only the elements whose selector value exceeds 2, then draw all of
# them without replacement, which returns them in random order.
eligible = selector > 2
z = main_array[eligible]
y = np.random.choice(z, z.size, replace=False)

for label, values in (("x", x), ("z", z), ("y", y)):
    print(f"{label}={values}")
The output is
x=[16 14 13]
z=[14 15 16]
y=[16 15 14]
Another way to make it is to put the probabilities equal to zero where the mask doesn't apply:
import numpy as np
main_array = np.array([11, 12, 13, 14, 15, 16])
selector = np.array([0, 1, 2, 3, 3, 3])

# Unrestricted draw of three distinct elements, for comparison.
x = np.random.choice(main_array, 3, replace=False)

# Weight 1 where the selector exceeds 2 and 0 elsewhere, normalised so the
# weights sum to 1; zero-weight elements can never be drawn.
p = (selector > 2) * 1
weights = p / p.sum()
y = np.random.choice(main_array, 3, replace=False, p=weights)
print(y)

How to remove empty x-axis coordinates in Matplotlib

I'm developing in Python using the pandas, numpy and matplotlib modules, to paint various subplots of a dataframe, using the following code:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
# Sample input: three parallel columns with one entry per row.
data = {
    'Name': [
        'Status', 'Status', 'HMI', 'Allst', 'Drvr', 'CurrTUBand', 'RUSource',
        'RUReqstrPriority', 'RUReqstrSystem', 'RUResReqstStat', 'CurrTUBand',
        'DSP', 'SetDSP', 'SetDSP', 'DSP', 'RUSource', 'RUReqstrPriority',
        'RUReqstrSystem', 'RUResReqstStat', 'Status', 'Delay', 'Status', 'Delay',
        'HMI', 'Status', 'Status', 'HMI', 'DSP',
    ],
    'Value': [
        4, 4, 2, 1, 1, 1, 0, 7, 0, 4,
        1, 1, 3, 0, 3, 0, 7, 0, 4, 1,
        0, 1, 0, 1, 4, 4, 2, 3,
    ],
    'Id_Par': [
        0, 0, 0, 0, 0, 0, 10, 10, 10, 10,
        10, 0, 0, 22, 22, 28, 28, 28, 28, 0,
        0, 38, 38, 0, 0, 0, 0, 0,
    ],
}
signals_df = pd.DataFrame(data)
def plot_signals(signals_df):
    """Draw one step plot per distinct signal name on a shared x-axis.

    ``signals_df`` must provide the columns ``Name``, ``Value`` and
    ``Id_Par``.  The frame is modified in place: the helper columns
    ``Count``, ``Sub``, ``Prev`` and ``Final`` are added and ``Final``
    becomes the index, so pass a copy if the original must be kept.
    The x positions are converted to strings so that matplotlib treats the
    axis as categorical.
    """
    # Count signals by parallel
    signals_df['Count'] = signals_df.groupby('Id_Par').cumcount().add(1).mask(signals_df['Id_Par'].eq(0), 0)
    # Subtract Parallel values from the index column
    signals_df['Sub'] = signals_df.index - signals_df['Count']
    id_par_prev = signals_df['Id_Par'].unique()
    # Drop the first unique Id_Par value (0 in the sample data) so that only
    # the remaining ids are flagged in 'Prev'.
    id_par = np.delete(id_par_prev, 0)
    signals_df['Prev'] = [1 if x in id_par else 0 for x in signals_df['Id_Par']]
    signals_df['Final'] = signals_df['Prev'] + signals_df['Sub']
    # Convert and set Subtract to index
    signals_df.set_index('Final', inplace=True)
    # Get individual names and variables for the chart
    names_list = list(signals_df['Name'].unique())
    num_names_list = len(names_list)
    # Creation Graphics
    fig, ax = plt.subplots(nrows=num_names_list, figsize=(10, 10), sharex=True)
    plt.xticks(color='SteelBlue', fontweight='bold')
    # Matplotlib's categorical feature to convert x-axis values to string
    # NOTE: only the first index of every name is collected here, so later
    # occurrences of a repeated name are missing from x_values.
    x_values = [-1, ]
    for name in names_list:
        x_values.append(signals_df[signals_df["Name"] == name]["Value"].index.values[0])
    x_values.append(len(signals_df) - 1)
    x_values = [str(i) for i in sorted(set(x_values))]
    print(x_values)
    for axis, name in zip(ax, names_list):
        # Creating a dummy plot and then remove it, so that the categorical
        # x positions are registered in sorted order before the real data
        dummy, = axis.plot(x_values, np.zeros_like(x_values))
        dummy.remove()
        # Get data
        data = signals_df[signals_df["Name"] == name]["Value"]
        # Get values axis-x and axis-y, padded with a start point at -1 and
        # an end point at the last row that repeats the final value
        x_ = np.hstack([-1, data.index.values, len(signals_df) - 1])
        y_ = np.hstack([0, data.values, data.iloc[-1]])
        # Plotting the data by position
        axis.plot(x_.astype('str'), y_, drawstyle='steps-post', marker='*', markersize=8, color='k', linewidth=2)
        axis.set_ylabel(name, fontsize=8, fontweight='bold', color='SteelBlue', rotation=30, labelpad=35)
        axis.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f'))
        axis.yaxis.set_tick_params(labelsize=6)
        axis.grid(alpha=0.4, color='SteelBlue')
        # Labeling the markers with CAN-Values; the padded end point is never
        # labelled because step i labels point i - 1
        for i in range(len(y_)):
            j = max(i - 1, 0)
            xy = [x_[j].astype('str'), y_[j]]
            axis.text(x=xy[0], y=xy[1], s=str(xy[1]), color='k', fontweight='bold', fontsize=12)
    plt.show()


plot_signals(signals_df)
I'm having trouble when names get repeated, using Matplotlib's categorical feature and converting x-axis values to string; taking into consideration the focus of the answer; this is what is bringing me:
I have been trying to change the pandas conditions, since it is the condition that I am using in this line: x_values.append(signals_df[signals_df["Name"] == name]["Value"].index.values[0]) and when I print the variable x_values it brings me the wrong indices: ['-1', '0', '2', '3', '4', '5', '6', '11', '12', '20', '27'] and I can't make it work well.
What I expect to achieve is a graph like the following:
The yellow shading marks the jumps that the plot must make on the x-axis and that are not being drawn on the y-axis. Thank you very much to anyone who can help me; any comments help.
I leave this answer for possible searches later for someone with the same topic. I found my error, the way I was handling the for loop was not correct, I replaced it and modified it as follows:
# Matplotlib's categorical feature and to convert x-axis values to string
# Take every distinct index value of the frame (not just the first one per
# name), keep the leading -1 start position, and sort before stringifying.
x_values = [-1, ]
x_values += list(set(signals_df.index))
x_values = [str(i) for i in sorted(x_values)]
This now allows to bring up the graph as below:

How to stop chart from printing in this cell but the next cell with really short code

This code produces a chart:
import numpy as np
import matplotlib.pyplot as plt
N = 5
# Mean score and standard deviation per group, for each gender.
menMeans = (20, 35, 30, 35, 27)
menStd = (2, 3, 4, 1, 2)
womenMeans = (25, 32, 34, 20, 25)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence
group_labels = ('G1', 'G2', 'G3', 'G4', 'G5')

# The women's bars are stacked on top of the men's bars.
p1 = plt.bar(ind, menMeans, width, yerr=menStd)
p2 = plt.bar(ind, womenMeans, width, bottom=menMeans, yerr=womenStd)

plt.ylabel('Scores')
plt.title('Scores by group and gender')
plt.xticks(ind, group_labels)
plt.yticks(np.arange(0, 81, 10))
plt.legend((p1[0], p2[0]), ('Men', 'Women'))
Jupyter Notebook automatically displays the chart even though I didn't call plt.show(). I don't want the chart to appear in the same cell as the code, but in the next cell, by running a really short piece of code such as plt.show(), in order to keep the cell as concise as possible.
Just enclose all your plot-related statements inside a function called plot_and_show(). Then you can call the function when you are ready.
import matplotlib.pyplot as plt
import numpy as np
N = 5
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
menStd = (2, 3, 4, 1, 2)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence


def plot_and_show():
    """Draw the stacked bar chart and display it.

    Nothing is drawn until this function is called, so the cell that defines
    it produces no output and a later cell can render the chart with a
    single ``plot_and_show()`` call.
    """
    p1 = plt.bar(ind, menMeans, width, yerr=menStd)
    # The women's bars are stacked on top of the men's bars.
    p2 = plt.bar(ind, womenMeans, width, bottom=menMeans, yerr=womenStd)
    plt.ylabel('Scores')
    plt.title('Scores by group and gender')
    plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5'))
    plt.yticks(np.arange(0, 81, 10))
    plt.legend((p1[0], p2[0]), ('Men', 'Women'))
    # Show explicitly so the function also works outside a notebook.
    plt.show()


plot_and_show()

How to find the index position of items in a pandas list which satisfy a certain condition?

How can I find the index position of items in a list which satisfy a certain condition?
Like suppose, I have a list like:
myList = [0, 100, 335, 240, 300, 450, 80, 500, 200]
And the condition is to find out the position of all elements within myList which lie between 0 and 300 (both inclusive).
I am expecting the output as:
output = [0, 1, 3, 4, 6, 8]
How can I do this in pandas?
Also, how to find out the index of the maximum element in the subset of elements which satisfy the condition? Like, in the above case, out of the elements which satisfy the given condition 300 is the maximum and its index is 4. So, need to retrieve its index.
I have been trying many ways but not getting the desired result. Please help, I am new to the programming world.
You can try this,
>>> import pandas as pd
>>> df = pd.DataFrame({'a': [0, 100, 335, 240, 300, 450, 80, 500, 200]})
>>> index = list(df[(df.a >= 0) & (df.a <= 300)].index)
>>> df.loc[index,].idxmax()
a 4
dtype: int64
or using the list,
>>> l = [0, 100, 335, 240, 300, 450, 80, 500, 200]
>>> index = [(i, v) for i, v in enumerate(l) if v >= 0 and v <= 300]
>>> [t[0] for t in index]
[0, 1, 3, 4, 6, 8]
>>> sorted(index, key=lambda x: x[1])[-1][0]
4
As Grzegorz Skibinski says, if we use numpy to get rid of many computations,
>>> import numpy as np
>>> l = [0, 100, 335, 240, 300, 450, 80, 500, 200]
>>> index = np.array([[i, v] for i, v in enumerate(l) if v >= 0 and v <= 300])
>>> index[:,0]
array([0, 1, 3, 4, 6, 8])
>>> index[index.argmax(0)[1]][0]
4
You can use numpy for that purpose:
import numpy as np
myList = np.array([0, 100, 335, 240, 300, 450, 80, 500, 200])

# Positions of the elements that lie in [0, 300].
in_range = (myList >= 0) & (myList <= 300)
res = np.where(in_range)[0]
print(res)

# Position, in the original array, of the largest element among those.
best = myList[res].argmax()
res2 = res[best]
print(res2)
Output:
[0 1 3 4 6 8]
4
[Program finished]
This is what between does in pandas:
import pandas as pd

myList = [0, 100, 335, 240, 300, 450, 80, 500, 200]
s = pd.Series(myList)
# between() includes both bounds by default, so 0 and 300 both qualify; the
# boolean mask then picks the matching index labels.
positions = s.index[s.between(0, 300)]
print(positions)
Output:
Int64Index([0, 1, 3, 4, 6, 8], dtype='int64')

how to plot a single line with different types of line dash using bokeh?

I am trying to plot the line for a set of points. Currently, I have set of points as Column names X, Y and Type in the form of a data frame. Whenever the type is 1, I would like to plot the points as dashed and whenever the type is 2, I would like to plot the points as a solid line.
Currently, I am using for loop to iterate over all points and plot each point using plt.dash. However, this is slowing down my run time since I want to plot more than 40000 points.
So, is there an easy way to plot the line over all points with different line dash types?
You could realize it by drawing multiple line segments like this
(Bokeh v1.1.0)
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, Range1d, LinearAxis
# Dash style per type, following the question: type 1 is dashed, type 2 solid.
line_style = {1: 'dashed', 2: 'solid'}
# 'name' holds the type of each point.  Points where the type changes appear
# twice (once per type) so that consecutive segments join up.
data = {'name': [1, 1, 1, 2, 2, 2, 1, 1, 1, 1],
        'counter': [1, 2, 3, 3, 4, 5, 5, 6, 7, 8],
        'score': [150, 150, 150, 150, 150, 150, 150, 150, 150, 150],
        'age': [20, 21, 22, 22, 23, 24, 24, 25, 26, 27]}
df = pd.DataFrame(data)
plot = figure(y_range=(100, 200))
plot.extra_y_ranges = {"Age": Range1d(19, 28)}
plot.add_layout(LinearAxis(y_range_name="Age"), 'right')
# A new segment starts whenever the type differs from the previous row, so
# each run of equal types is drawn with a single line() call.
segment_id = (df['name'] != df['name'].shift()).cumsum()
for _, g in df.groupby(segment_id):
    source = ColumnDataSource(g)
    plot.line(x='counter', y='score', line_dash=line_style[g['name'].iloc[0]], source=source)
    plot.circle(x='counter', y='age', color="blue", size=10, y_range_name="Age", source=source)
show(plot)

Resources