How can I add a normal distribution curve to multiple histograms? - python-3.x

With the following code I create four histograms:
import numpy as np
import pandas as pd

# Four columns of 100 normally distributed samples each; the tuple gives the
# mean of each column (1, 2, 3 and 4), the scale is left at NumPy's default.
column_means = (1, 2, 3, 4)
samples = np.random.normal(loc=column_means, size=(100, 4))
data = pd.DataFrame(samples)

# Draw one 10-bin histogram per column.
data.hist(bins=10)
I want the histograms to look like this:
I know how to make it one graph at a time, see here
But how can I do it for multiple histograms without specifying each single one? Ideally I could use 'pd.scatter_matrix'.

Plot each histogram separately and fit a normal distribution to each one, as in the example you linked, or take a look at the hist API example here. Essentially, what should be done is:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

# Stand-in for your own data: one column per histogram (here four columns of
# 100 normal samples centred on 1, 2, 3 and 4, as in the question).
your_data_here = np.random.normal((1, 2, 3, 4), size=(100, 4))

# 2x2 grid of axes; `axes.flat` walks them in row-major order, i.e. the same
# order as add_subplot(221), (222), (223), (224).
fig, axes = plt.subplots(2, 2)

# np.asarray(...).T yields one column at a time and also accepts a DataFrame.
for ax, column in zip(axes.flat, np.asarray(your_data_here).T):
    # Maximum-likelihood estimate of the normal parameters for this column.
    mu, sigma = norm.fit(column)

    # density=True normalises the bars to unit area so the PDF can be drawn
    # on the same scale (the old `normed` keyword was removed in
    # Matplotlib 3.1).
    n, bins, patches = ax.hist(column, 50, density=True,
                               facecolor='green', alpha=0.75)

    # Evaluate the fitted PDF at the bin centres. scipy.stats.norm.pdf
    # replaces matplotlib.mlab.normpdf, which was removed in Matplotlib 3.1.
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    y = norm.pdf(bincenters, mu, sigma)
    ax.plot(bincenters, y, 'r--', linewidth=1)

plt.show()

Related

How to plot vertical stacked graph from different text files?

I have 5 txt files containing data that show the effect of increasing heat on my samples. I want to plot them as a vertically stacked graph: the final figure should consist of 5 vertically stacked charts sharing the same x-axis, with each line in its own chart to reveal the differences between them.
I wrote this code:
import glob

import matplotlib.axes._axes as axes
import matplotlib.pyplot as plt
import pandas as pd

input_files = glob.glob('01-input/RR_*.txt')

# NOTE(review): the original indentation was lost; it is assumed that the
# loop body ends after plt.plot(...) and the layout/label calls run once.
for path in input_files:
    # Tab-separated file without a header row; the first two columns are
    # used as x and y.
    table = pd.read_csv(path, header=None, delimiter="\t").values
    x, y = table[:, 0], table[:, 1]

    # The same subplot position (2, 1, 1) is selected on every iteration.
    plt.subplot(2, 1, 1)
    plt.plot(x, y, linewidth=2, linestyle=':')

plt.tight_layout()
plt.xlabel('x-axis')
plt.ylabel('y-axis')
But the result is only one graph containing all the lines:
I want to get the following chart:
import matplotlib.pyplot as plt
import numpy as np

# Dummy data standing in for the input files: one y-series per panel, all
# evaluated on the same x grid.
x = np.linspace(0, 2700, 50)
all_data = [np.sin(x), np.cos(x), x**0.3, x**0.4, x**0.5]

n = len(all_data)
n_rows = n
n_cols = 1

# sharex=True gives the panels one common x-axis, as asked in the question.
# squeeze=False keeps the result a 2-D array even when there is only a single
# series, so the indexing below never breaks.
fig, ax = plt.subplots(n_rows, n_cols, sharex=True, squeeze=False)
panels = ax[:, 0]  # the single column of axes, top to bottom

for panel, y in zip(panels, all_data):
    panel.plot(x, y, linewidth=2, linestyle=':')
    panel.set_ylabel('y-axis')
    # To give each panel its own label, use a list with one entry per
    # series (its length must equal `n`), for example:
    # my_labels = ['y1', 'y2', 'y3', 'y4', 'y5']
    # and iterate with zip(panels, all_data, my_labels).

# Label the x-axis once, on the bottom panel.
panels[-1].set_xlabel('x-axis')
plt.tight_layout()
plt.show()

Is there a library that will help me fit data easily? I found fitter and I will provide the code, but it shows some errors

So, here is my code:
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from fitter import Fitter, get_common_distributions

df = pd.read_csv("project3.csv")

# NOTE(review): these hand-written edges are never used -- `bins` is rebound
# below to the edges returned by plt.hist(..., bins=10).
bins = [282.33, 594.33, 906.33, 1281.33, 15030.33, 1842.33, 2154.33, 2466.33, 2778.33, 3090.33, 3402.33]

# Colour palette for the figure.
facecolor = '#EAEAEA'
color_bars = '#3475D0'
txt_color1 = '#252525'
txt_color2 = '#004C74'

fig, ax = plt.subplots(1, figsize=(16, 6), facecolor=facecolor)
ax.set_facecolor(facecolor)
n, bins, patches = plt.hist(df.City1, color=color_bars, bins=10)

# Minor grid lines drawn in the background colour.
plt.gca().xaxis.set_minor_locator(AutoMinorLocator(2))
plt.grid(which='minor', color=facecolor, lw=0.5)

# One tick per bar: placed at the bin centre and labelled "low-high".
edge_pairs = list(zip(bins[:-1], bins[1:]))
xticks = [(upper + lower) / 2 for lower, upper in edge_pairs]
xticks_labels = [f"{lower:.0f}-{upper:.0f}" for lower, upper in edge_pairs]
plt.xticks(xticks, labels=xticks_labels, c=txt_color1, fontsize=13)

# Strip tick marks, the y ticks and all four spines.
ax.tick_params(axis='x', which='both', length=0)
plt.yticks([])
for side in ('bottom', 'left', 'right', 'top'):
    ax.spines[side].set_visible(False)

# Write the count above every non-empty bar.
for center, count in zip(xticks, n):
    if count > 0:
        plt.text(center, count + 5, int(count), ha='center', fontsize=16, c=txt_color1)

plt.title('Histogram of rainfall in City1\n', loc='right', fontsize=20, c=txt_color1)
plt.xlabel('\nCentimeters of rainfall', c=txt_color2, fontsize=14)
plt.ylabel('Frequency of occurrence', c=txt_color2, fontsize=14)
plt.tight_layout()
#plt.savefig('City1_Raw.png', facecolor=facecolor)
plt.show()

# Fit the common distributions to the same column and show the four best
# fits (ranked by sum of squared errors) in a second figure.
city1 = df['City1'].values
f = Fitter(city1, distributions=get_common_distributions())
f.fit()
fig = f.plot_pdf(names=None, Nbest=4, lw=1, method='sumsquare_error')
plt.show()
print(f.get_best(method='sumsquare_error'))
The issue is with the plots it shows. The first histogram it generates is
Next I get another graph with best fitted distributions which is
Then an output statement
{'chi2': {'df': 10.692966790090342, 'loc': 16.690849400411103, 'scale': 118.71595997157786}}
Process finished with exit code 0
I have a couple of questions. Why is chi2, the best fitted distribution not plotted on the graph?
How do I plot these distributions on top of the histograms and not separately? The hist() function in fitter library can do that but there I don't get to control the bins and so I end up getting like 100 bins with some flat looking data.
How do I solve this issue? I need to plot the best fit curve on the histogram that looks like image1. Can I use any other module/package to get the work done in similar way? This uses least squares fit but I am OK with least likelihood or log likelihood too.
Simple way of plotting things on top of each other (using some properties of the Fitter class)
import matplotlib.pyplot as plt
from fitter import Fitter, get_common_distributions
from scipy import stats

# Synthetic sample standing in for the real data.
numberofpoints = 50000
samples = stats.norm.rvs(loc=1090, scale=500, size=numberofpoints)

fig, ax = plt.subplots(1, figsize=(16, 6))
# density=True puts the bars on the same scale as the fitted PDFs; the number
# of bins is chosen here, independently of Fitter.
n, bins, patches = ax.hist(samples, bins=30, density=True)

f = Fitter(samples, distributions=get_common_distributions())
f.fit()

# Rank the fits by sum of squared errors through the public `df_errors`
# table (instead of the private `_fitted_errors` dict) and keep the best 4.
best_names = f.df_errors.sort_values(by='sumsquare_error').index[:4]

for dist in best_names:
    # A distribution whose fit failed has no entry in fitted_pdf; skip it.
    if dist in f.fitted_pdf:
        ax.plot(f.x, f.fitted_pdf[dist], label=dist)

# The legend makes it visible which curve belongs to which distribution.
ax.legend()
plt.show()
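Note that this overlay relies on the histogram being normalized (density=True). For a histogram of raw counts, the fitted PDFs would have to be rescaled (by the number of samples times the bin width) to match the bar heights.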

Bar missing while plotting using Matplotlib's Twinx

I'm using matplotlib.axes.Axes.twinx to have a shared x-axis in matplotlib for both plots. I don't know why only 12 bars are plotted instead of 13.
Link of Data set
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

dataFrame = pd.read_csv("NEM.csv", sep=',')
dataFrame['ratio'] = dataFrame['Expert'] / dataFrame['Novice']
xticklabels = dataFrame['Task'].tolist()

fig, ax1 = plt.subplots(figsize=(9, 6))
ax1.set_title('N-E Analysis')

# The two line series go on the primary axes.
for column, colour, order in (('Novice', 'dodgerblue', 100),
                              ('Expert', 'darkorange', 200)):
    ax1.plot(column, data=dataFrame, marker='', color=colour, linewidth=2,
             label=column, zorder=order)
# plt.ylim acts on the current axes -- presumably ax1 at this point.
plt.ylim(0, 120)

# The ratio bars go on a secondary y-axis that shares the x-axis with ax1.
ax2 = ax1.twinx()
ax2.bar('Task', 'ratio', data=dataFrame, color='gray', width=0.35, label='NE', zorder=0)

# Hide every spine except the bottom one on both axes.
for axis in (ax1, ax2):
    for side in ('top', 'right', 'left'):
        axis.spines[side].set_visible(False)

ax1.set_xticklabels(xticklabels, rotation=45, ha="right")
ax1.yaxis.grid()
ax1.tick_params(left='off', bottom='off')
ax2.tick_params(right='off')
# Presumably applies to ax2, the axes created last.
plt.ylim(0, 12)

# Combined legend: bar entry first, then the two lines.
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
p = ax1.legend(h2 + h1, l2 + l1, loc=2, frameon=False)

fig.tight_layout()
plt.show()
When plotting, it is good practice to state explicitly how many bars or points you are going to plot. For instance, you can create an x-axis this way:
# One integer position per row of the 'Task' column.
x_axis = np.arange(len(dataFrame['Task']))
then:
# Plot against the explicit integer positions; `...` stands for the styling
# keyword arguments.
ax1.plot(x_axis, dataFrame['Novice'].tolist(), ...)
after that you rename the xticklabels like this:
# Put a tick at every position, then label the ticks with the task names.
ax1.set_xticks(x_axis)
ax1.set_xticklabels(dataFrame['Task'].tolist())
Do the same with the bar graph:
# The column is named 'ratio' (lower case) where it is created in the
# question; `...` stands for the styling keyword arguments.
ax2.bar(x_axis, dataFrame['ratio'].tolist(), ...)
This should do the trick.
Hope it helps.

How to show horizontal lines at tips of error bar plot using matplotlib?

I can generate an error-bar plot using the code below. The graph produced by the code shows vertical lines that represent the errors in y. I would like to have horizontal lines at the tips of these errors ("error bars") and am not sure how to do so.
import numpy as np
import matplotlib.pyplot as plt

# Integer x values 1..10, y = 2**x, and an error of 10*sqrt(y) on each point.
x = np.arange(1, 11)
y = 2**x
yerr = 10 * np.sqrt(y)

fig, ax = plt.subplots()
ax.errorbar(x, y, yerr=yerr, solid_capstyle='projecting')
ax.grid(linestyle=':', alpha=0.5)

plt.show()
plt.close(fig)
The code generates the figure below. I've played with the solid_capstyle kwarg. Is there a specific kwarg that does what I am trying to do?
And as an example of what I'd like, the figure below:
In case it's relevant, I am using matplotlib 2.2.2
The argument you are looking for is capsize= in ax.errorbar(). The default is None so the length of the cap will default to the value of matplotlib.rcParams["errorbar.capsize"]. The number you give will be the length of the cap in points:
import numpy as np
import matplotlib.pyplot as plt

# Same data as in the question.
x = np.linspace(1, 10, num=10, dtype=int)
y = np.power(2, x)
yerr = np.sqrt(y) * 10

fig, ax = plt.subplots()
# capsize is the length of the error-bar caps in points.
errorbar_style = {'solid_capstyle': 'projecting', 'capsize': 5}
ax.errorbar(x, y, yerr, **errorbar_style)
ax.grid(alpha=0.5, linestyle=':')
plt.show()

MatPlotLib + GeoPandas: Plot Multiple Layers, Control Figsize

Given the shape file available here: I know I can produce the basic map that I need with county labels and even some points on the map (see below). The issue I'm having is that I cannot seem to control the size of the figure with figsize.
Here's what I have:
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
figsize=5,5
fig = plt.figure(figsize=(figsize),dpi=300)
shpfileshpfile=r'Y:\HQ\TH\Groups\NR\PSPD\Input\US_Counties\cb_2015_us_county_20m.shp'
c=gpd.read_file(shpfile)
c=c.loc[c['GEOID'].isin(['26161','26093','26049','26091','26075','26125','26163','26099','26115','26065'])]
c['coords'] = c['geometry'].apply(lambda x: x.representative_point().coords[:])
c['coords'] = [coords[0] for coords in c['coords']]
ax=c.plot()
#Control some attributes regarding the axis (for the plot above)
ax.spines['top'].set_visible(False);ax.spines['bottom'].set_visible(False);ax.spines['left'].set_visible(False);ax.spines['right'].set_visible(False)
ax.tick_params(axis='y',which='both',left='off',right='off',color='none',labelcolor='none')
ax.tick_params(axis='x',which='both',top='off',bottom='off',color='none',labelcolor='none')
for idx, row in c.iterrows():
ax.annotate(s=row['NAME'], xy=row['coords'],
horizontalalignment='center')
lat2=[42.5,42.3]
lon2=[-84,-83.5]
#Add another plot...
ax.plot(lon2,lat2,alpha=1,marker='o',linestyle='none',markeredgecolor='none',markersize=15,color='white')
plt.show()
As you can see, I opted to call the plots by the axis name because I need to control attributes of the axis, such as tick_params. I'm not sure if there is a better approach. This seems like a "no-brainer" but I can't seem to figure out why I can't control the figure size.
Thanks in advance!
I just had to do the following:
1. Use fig, ax = plt.subplots(1, 1, figsize = (figsize))
2. Use the ax=ax argument in c.plot()
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
figsize=5,5
#fig = plt.figure(figsize=(figsize),dpi=300)
#ax = fig.add_subplot(111)
fig, ax = plt.subplots(1, 1, figsize = (figsize))
shpfileshpfile=r'Y:\HQ\TH\Groups\NR\PSPD\Input\US_Counties\cb_2015_us_county_20m.shp'
c=gpd.read_file(shpfile)
c=c.loc[c['GEOID'].isin(['26161','26093','26049','26091','26075','26125','26163','26099','26115','26065'])]
c['coords'] = c['geometry'].apply(lambda x: x.representative_point().coords[:])
c['coords'] = [coords[0] for coords in c['coords']]
c.plot(ax=ax)
ax.spines['top'].set_visible(False);ax.spines['bottom'].set_visible(False);ax.spines['left'].set_visible(False);ax.spines['right'].set_visible(False)
ax.tick_params(axis='y',which='both',left='off',right='off',color='none',labelcolor='none')
ax.tick_params(axis='x',which='both',top='off',bottom='off',color='none',labelcolor='none')
for idx, row in c.iterrows():
ax.annotate(s=row['NAME'], xy=row['coords'],
horizontalalignment='center')
lat2=[42.5,42.3]
lon2=[-84,-83.5]
ax.plot(lon2,lat2,alpha=1,marker='o',linestyle='none',markeredgecolor='none',markersize=15,color='white')

Resources