stats linregress slope and intercept wrong

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
xx = np.load('./x.npy')
yy = np.load('./y.npy')
fig, ax = plt.subplots()
fig.set_size_inches(16, 8)
labels = ['C1', 'C2']
colors = ['r', 'b']
for idx in range(2):
    df = pd.DataFrame({'x': xx, 'y': yy[idx]})
    ax.set(xlim=(np.min(df.x), np.max(df.x)),
           ylim=(np.min(df.y), np.max(df.y)))
    p = sns.regplot('x',
                    'y',
                    df,
                    scatter=False,
                    order=2,
                    ax=ax,
                    label=labels[idx],
                    color=colors[idx])
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        x=p.get_lines()[0].get_xdata(),
        y=p.get_lines()[0].get_ydata())
    formula = str(slope) + ' x\N{SUPERSCRIPT TWO} ' + str(intercept)
    print('formula: ', formula)
I am trying to calculate the slope and intercept of the sns.regplot fit line, and it gives me:
formula: 82.53958162912909 x² 130.19916935648575
formula: 82.53958162912909 x² 130.19916935648575
which:
Is wrong, as you can see from the plot: for an x value of 6, we expect a y value of around 600.
Has the same slope and intercept for both lines, where we would expect a small difference.
You can find the x, y files here

I don't know why you are getting the data from the Line2D object, even though you already have the data in xx and yy, but anyway:
When you calculate the regression in the loop, you are passing the data of the same line (get_lines()[0]) at each iteration. I guess you meant to write:
slope, intercept, r_value, p_value, std_err = stats.linregress(
    x=xx,
    y=yy[idx])
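Note also that regplot was called with order=2, so the curve it draws is quadratic, while linregress only ever fits a straight line. If the goal is the coefficients of that quadratic, np.polyfit is a closer match — a minimal sketch, assuming xx and yy are the arrays loaded above:
import numpy as np
# Quadratic fit (deg=2), matching the order=2 regplot call, one per curve.
for idx in range(2):
    a, b, c = np.polyfit(xx, yy[idx], deg=2)
    print('formula: {} x\N{SUPERSCRIPT TWO} + {} x + {}'.format(a, b, c))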

Related

How to plot vertical stacked graph from different text files?

I have 5 txt files containing data that show the effect of increasing heat on my samples, and I want to plot them in a vertically stacked graph, where the final figure is 5 vertically stacked charts sharing the same X-axis, with each line in a separate chart to reveal the differences between them.
I wrote this code:
import glob
import pandas as pd
import matplotlib.pyplot as plt
input_files = glob.glob('01-input/RR_*.txt')
for file in input_files:
    data = pd.read_csv(file, header=None, delimiter="\t").values
    x = data[:, 0]
    y = data[:, 1]
    plt.subplot(2, 1, 1)
    plt.plot(x, y, linewidth=2, linestyle=':')
plt.tight_layout()
plt.xlabel('x-axis')
plt.ylabel('y-axis')
But the result is only one graph containing all the lines:
I want to get the following chart:
import matplotlib.pyplot as plt
import numpy as np
# just some dummy data
x = np.linspace(0, 2700, 50)
all_data = [np.sin(x), np.cos(x), x**0.3, x**0.4, x**0.5]
n = len(all_data)
n_rows = n
n_cols = 1
fig, ax = plt.subplots(n_rows, n_cols)  # each element in "ax" is an Axes
for i, y in enumerate(all_data):
    ax[i].plot(x, y, linewidth=2, linestyle=':')
    ax[i].set_ylabel('y-axis')
    # You can also use a list of y-labels. Example:
    # my_labels = ['y1', 'y2', 'y3', 'y4', 'y5']
    # ax[i].set_ylabel(my_labels[i])
    # The length of "my_labels" must be "n" too
plt.xlabel('x-axis')  # add xlabel to the last axes
plt.tight_layout()
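Since the final figure should share the same X-axis, a small variation on the sketch above lets matplotlib link the axes directly via the sharex option:
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, 2700, 50)
all_data = [np.sin(x), np.cos(x), x**0.3, x**0.4, x**0.5]
# sharex=True links the X-axis across the stacked subplots,
# so only the bottom subplot shows the tick labels.
fig, ax = plt.subplots(len(all_data), 1, sharex=True)
for i, y in enumerate(all_data):
    ax[i].plot(x, y, linewidth=2, linestyle=':')
    ax[i].set_ylabel('y-axis')
ax[-1].set_xlabel('x-axis')
plt.tight_layout()
plt.show()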

InvalidArgumentError: Graph execution error

I am trying to apply LCM (linear coregionalization model), as a Gaussian process, to a CSV file dataset.
This dataset includes two inputs (FracYear, Auxiliar) and two outputs (VV,VH).
import gpflow as gpflow
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline
import seaborn as sns
np.random.seed(1)
def plot_gp(x, mu, var, color='k'):
    plt.plot(x, mu, color=color, lw=2)
    plt.plot(x, mu + 2*np.sqrt(var), '--', color=color)
    plt.plot(x, mu - 2*np.sqrt(var), '--', color=color)
def plot(m):
    xtest = np.linspace(0, 1, 300)[:, None]
    line, = plt.plot(X1, Y1, 'x', mew=2)
    mu, var = m.predict_f(np.hstack((xtest, np.zeros_like(xtest))))
    plot_gp(xtest, mu, var, line.get_color())
    line, = plt.plot(X2, Y2, 'x', mew=2)
    mu, var = m.predict_f(np.hstack((xtest, np.ones_like(xtest))))
    plot_gp(xtest, mu, var, line.get_color())
d = pd.read_csv('C://Users//Rick//Documents//UNI//PROJ//invento.csv', delimiter=',', header=None, skiprows=1, names=['FracYear', 'VH', 'VV'])
# Replace missing values with NaN
d.replace(-200.0, np.nan, inplace=True)
# Data preparation
# We start by generating some training data to fit the model with. For this example, we choose the following two correlated functions for our outputs:
# make a dataset with two outputs, correlated, heavy-tail noise. One has more noise than the other.
df = pd.DataFrame(data=d)
X1 = df['FracYear'] = pd.to_numeric(df['FracYear'])
X2 = df['Auxiliar'] = pd.to_numeric(df['Auxiliar'])
Y1 = df['VH'] = pd.to_numeric(df['VH'])
Y2 = df['VV'] = pd.to_numeric(df['VV'])
plt.plot(X1, Y1, 'x', mew=2)
plt.plot(X2, Y2, 'x', mew=2)
plt.show()
# Base Matern kernel
k1 = gpflow.kernels.Matern32(active_dims=[0])
# Build the coreg kernel
coreg = gpflow.kernels.Coregion(output_dim=2, rank=1, active_dims=[1])
kern = k1 * coreg
# Build Likelihood
lik = gpflow.likelihoods.SwitchedLikelihood([
    gpflow.likelihoods.StudentT(), gpflow.likelihoods.StudentT()
])
# Augment the input with ones or zeros to indicate the required output dimension
X_augmented = np.vstack((np.hstack((X1, np.zeros_like(X1))), np.hstack((X2, np.ones_like(X2)))))
# Augment the Y data with ones or zeros that specify a likelihood from the list of likelihoods
Y_augmented = np.vstack((np.hstack((Y1, np.zeros_like(Y1))), np.hstack((Y2, np.ones_like(Y2)))))
# now build the GP model as usual
m = gpflow.models.VGP((X_augmented, Y_augmented), kernel=kern, likelihood=lik)
# fit the covariance function parameters
#gpflow.train.ScipyOptimizer().minimize(m, maxiter=1000)
from gpflow.ci_utils import ci_niter
maxiter = ci_niter(10000)
gpflow.optimizers.Scipy().minimize(
    m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B",
)
## Fit and plot
xtest = np.hstack([np.linspace(0, 1, 100)]*3)[:,None]
mu1, var1 = m.predict_f(np.hstack((xtest, np.zeros_like(xtest))))
mu2, var2 = m.predict_f(np.hstack((xtest, np.ones_like(xtest))))
plt.plot(X1, Y1, 'x', mew=2, color='r')
plt.plot(X2, Y2, 'x', mew=2, color='b')
plt.plot(np.linspace(0, 1, 100), np.reshape(mu2, [100,3]))
plt.plot(np.linspace(0, 1, 100), mu1, 'r')
plt.plot(np.linspace(0, 1, 100), mu2, 'b')
plt.show()
Please note that I am using Colab. On the other hand, I am having issues installing TensorFlow in Spyder; having said this, I don't know which would be better: Colab, Spyder, or Jupyter.
My question is about the error raised by the call to "gpflow.optimizers.Scipy().minimize".
The error is very long. It starts with the title of this thread, and ends like this: "Node: 'GatherV2_2'
indices[0] = 2019 is not in [0, 2)
[[{{node GatherV2_2}}]] [Op:__inference__tf_eval_9447]"
If anyone has any idea about this, please let me know. Also, if any of you has LCM code that works with CSV files, I would be interested in taking a look at it.
Thanks!!
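One thing worth checking, offered only as a guess from the shapes involved: np.hstack on 1-D pandas Series concatenates them end to end instead of adding a second column, so the Coregion kernel at active_dims=[1] may end up reading raw data values (such as a year like 2019) where it expects output indices in [0, 2). Reshaping everything to column vectors before augmenting gives the two-column layout the model expects — a minimal sketch, assuming X1, X2, Y1, Y2 are the Series defined above:
import numpy as np
# Make each input/output a column vector of shape (N, 1) before augmenting.
X1c = np.asarray(X1, dtype=float).reshape(-1, 1)
X2c = np.asarray(X2, dtype=float).reshape(-1, 1)
Y1c = np.asarray(Y1, dtype=float).reshape(-1, 1)
Y2c = np.asarray(Y2, dtype=float).reshape(-1, 1)
# Column 0 carries the value; column 1 carries an integer index (0 or 1)
# that selects the output for the Coregion kernel and the likelihood
# in the SwitchedLikelihood list.
X_augmented = np.vstack((np.hstack((X1c, np.zeros_like(X1c))),
                         np.hstack((X2c, np.ones_like(X2c)))))
Y_augmented = np.vstack((np.hstack((Y1c, np.zeros_like(Y1c))),
                         np.hstack((Y2c, np.ones_like(Y2c)))))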

Is there a library that will help me fit data easily? I found fitter and I will provide the code, but it shows some errors

So, here is my code:
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from fitter import Fitter, get_common_distributions
df = pd.read_csv("project3.csv")
bins = [282.33, 594.33, 906.33, 1281.33, 15030.33, 1842.33, 2154.33, 2466.33, 2778.33, 3090.33, 3402.33]
#declaring
facecolor = '#EAEAEA'
color_bars = '#3475D0'
txt_color1 = '#252525'
txt_color2 = '#004C74'
fig, ax = plt.subplots(1, figsize=(16, 6), facecolor=facecolor)
ax.set_facecolor(facecolor)
n, bins, patches = plt.hist(df.City1, color=color_bars, bins=10)
#grid
minor_locator = AutoMinorLocator(2)
plt.gca().xaxis.set_minor_locator(minor_locator)
plt.grid(which='minor', color=facecolor, lw = 0.5)
xticks = [(bins[idx+1] + value)/2 for idx, value in enumerate(bins[:-1])]
xticks_labels = [ "{:.0f}-{:.0f}".format(value, bins[idx+1]) for idx, value in enumerate(bins[:-1])]
plt.xticks(xticks, labels=xticks_labels, c=txt_color1, fontsize=13)
#beautify
ax.tick_params(axis='x', which='both',length=0)
plt.yticks([])
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
for idx, value in enumerate(n):
    if value > 0:
        plt.text(xticks[idx], value+5, int(value), ha='center', fontsize=16, c=txt_color1)
plt.title('Histogram of rainfall in City1\n', loc = 'right', fontsize = 20, c=txt_color1)
plt.xlabel('\nCentimeters of rainfall', c=txt_color2, fontsize=14)
plt.ylabel('Frequency of occurrence', c=txt_color2, fontsize=14)
plt.tight_layout()
#plt.savefig('City1_Raw.png', facecolor=facecolor)
plt.show()
city1 = df['City1'].values
f = Fitter(city1, distributions=get_common_distributions())
f.fit()
fig = f.plot_pdf(names=None, Nbest=4, lw=1, method='sumsquare_error')
plt.show()
print(f.get_best(method = 'sumsquare_error'))
The issue is with the plots it shows: first the histogram, then a separate graph with the best-fitted distributions, and then this output:
{'chi2': {'df': 10.692966790090342, 'loc': 16.690849400411103, 'scale': 118.71595997157786}}
Process finished with exit code 0
I have a couple of questions. Why is chi2, the best-fitted distribution, not plotted on the graph?
How do I plot these distributions on top of the histogram and not separately? The hist() function in the fitter library can do that, but there I don't get to control the bins, so I end up with something like 100 bins and flat-looking data.
How do I solve this issue? I need to plot the best-fit curve on the histogram that looks like image1. Can I use any other module/package to get the work done in a similar way? This uses a least-squares fit, but I am OK with maximum likelihood or log-likelihood too.
A simple way of plotting things on top of each other (using some properties of the Fitter class):
import matplotlib.pyplot as plt
from fitter import Fitter, get_common_distributions
from scipy import stats
numberofpoints = 50000
df = stats.norm.rvs(loc=1090, scale=500, size=numberofpoints)
fig, ax = plt.subplots(1, figsize=(16, 6))
n, bins, patches = ax.hist(df, bins=30, density=True)
f = Fitter(df, distributions=get_common_distributions())
f.fit()
errorlist = sorted(
    [
        [f._fitted_errors[dist], dist]
        for dist in get_common_distributions()
    ]
)[:4]
for err, dist in errorlist:
    ax.plot(f.x, f.fitted_pdf[dist])
plt.show()
Because the histogram here is normalized (density=True), the fitted pdfs line up with it directly; for a raw-count histogram one would need to rescale the pdfs.
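That rescaling is straightforward, since a pdf integrates to 1: multiplying it by the sample size and the bin width puts it on the scale of raw counts. A minimal sketch, reusing the names from the snippet above:
# Overlay the fitted pdfs on an unnormalized (raw-count) histogram.
n, bins, patches = ax.hist(df, bins=30)  # no density=True this time
bin_width = bins[1] - bins[0]
for err, dist in errorlist:
    ax.plot(f.x, f.fitted_pdf[dist] * len(df) * bin_width)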

How to find the point in which a regression line will intersect the OY axis?

I have a file in which I provide some data, the x and y values. My program draws the regression line of those points, but what I need now is to find the value on the OY axis that my line would intersect if it were extended.
What my program does now:
I need to simply make the line longer, intersect it with the OY axis, and find the exact coordinates of that point.
My code so far:
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
data = pd.read_csv('data.csv') # load data set
X = data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = data.iloc[:, 1].values.reshape(-1, 1) # -1 means: infer the number of rows, but keep 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()
My code requires a file called "data.csv" which contains the coordinates of the given values. My example has the values:
5,0.8
10,0.7
15,0.66
20,0.493
25,0.5
30,0.21
Did you want something like this? You can use the intercept_ attribute of your LinearRegression object to get the y-intercept at x equal to zero:
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from io import StringIO
from sklearn.linear_model import LinearRegression
txtfile = StringIO("""5,0.8
10,0.7
15,0.66
20,0.493
25,0.5
30,0.21""")
data = pd.read_csv(txtfile, header=None) # load data set
X = data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = data.iloc[:, 1].values.reshape(-1, 1) # -1 means: infer the number of rows, but keep 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.plot([0, X[0]], [linear_regressor.intercept_, Y_pred[0]], c="green", linestyle='--')
ax = plt.gcf().gca()
ax.spines['left'].set_position('zero')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()
Output:
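To read off the exact coordinates of the intersection point, the fitted intercept can simply be printed (intercept_ is an array here because Y was fitted as a column vector):
# The regression line crosses the OY axis at x = 0, y = intercept_.
print('intersection point:', (0, linear_regressor.intercept_[0]))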

Specifying the color increments of a heat-map in Python

Is there a way to specify, in Seaborn or Matplotlib, the color increments of a heat-map color scale? For instance, for a data-frame that contains normalized values between 0 and 1, can I specify 100 discrete color increments so that each value is distinguished from the others?
Thank you in advance
There are two principal approaches to discretizing a heatmap into n colors:
Supply the data rounded to the n values.
Use a discrete colormap.
The following code shows those two options.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
x, y = np.meshgrid(range(15),range(6))
v = np.random.rand(len(x.flatten()))
df = pd.DataFrame({"x":x.flatten(), "y":y.flatten(),"value":v})
df = df.pivot(index="y", columns="x", values="value")
n = 4
fig, (ax0, ax, ax2) = plt.subplots(nrows=3)
### original
im0 = ax0.imshow(df.values, cmap="viridis", vmin=0, vmax=1)
ax0.set_title("original")
### Discretize array
arr = np.floor(df.values * n)/n
im = ax.imshow(arr, cmap="viridis", vmin=0, vmax=1)
ax.set_title("discretize values")
### Discretize colormap
cmap = plt.get_cmap("viridis", n)
im2 = ax2.imshow(df.values, cmap=cmap, vmin=0, vmax=1 )
ax2.set_title("discretize colormap")
#colorbars
fig.colorbar(im0, ax=ax0)
fig.colorbar(im, ax=ax)
fig.colorbar(im2, ax=ax2, ticks=np.arange(0, 1, 1/n))
plt.tight_layout()
plt.show()
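Since the question also mentions Seaborn: the same discrete-colormap idea carries over to sns.heatmap, which accepts any matplotlib colormap. A minimal sketch, with the 100 increments from the question and some dummy data:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# A viridis colormap resampled to 100 discrete steps; values in [0, 1]
# then fall into 100 distinguishable color bins.
df = pd.DataFrame(np.random.rand(6, 15))
sns.heatmap(df, cmap=plt.get_cmap("viridis", 100), vmin=0, vmax=1)
plt.show()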
