Index error: index 14328 is out of bounds for axis 0 with size 2 - python-3.x

%pylab inline
import numpy as np
import pandas as pd
import random
import time
import scipy
import sklearn.feature_extraction
import pickle
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
bedsizes = {'None': 0,
            'Rest All': 1}
invbedsizes = {v: k for k, v in bedsizes.items()}
model = joblib.load('model_bed_size.pkl')
vocab = pickle.load(open('dictionary', 'rb'))
var=pd.read_csv('Train_variables.csv')
dtest = pd.read_csv('/home/ubuntu/test_null_new.csv', usecols= ("Bed_size","title","short_description","long_description","primary_shelf.all_paths_str","attributes.all_shelves.0","attributes.all_shelves.1","attributes.all_shelves.2","attributes.all_shelves.3","attributes.all_shelves.4","attributes.type.0","attributes.type.1","attributes.type.2","item_id","last_updated_at"),encoding='ISO-8859-1')
lentest = len(dtest)
vocab=vocab["Vocabulary"].to_dict()
Xall = []
i=1
for col in var['Variable']:
    vectorizer = CountVectorizer(min_df=1, vocabulary=vocab[i], token_pattern='\\b\\w+\\b')
    Xall.append(vectorizer.transform(dtest[col].astype(str)))
    i = i + 1
    print(col, 'Done', shape(Xall[-1]))
Xspall = scipy.sparse.hstack(Xall)
X_test_final = scipy.sparse.csr_matrix(Xspall)
print (shape(X_test_final))
ypred = model.decision_function(X_test_final)
ypredc = model.classes_[np.argmax(ypred, axis = 0)]
ypredcon = (np.max(ypred, axis = 1) + 2.) / 8.
ypredcon[ypredcon < 0.] = 0.
ypredcon[ypredcon > 1.] = 1.
dfinal = pd.DataFrame()
dfinal['item_id '] = dtest['item_id']
dfinal['Predictions'] = ypredc
dfinal['Predictions'].replace(invbedsizes, inplace = True)
dfinal['confidence_score'] = ypredcon
The above code gives an IndexError saying that index 14328 is out of bounds for axis 0 with size 2.
The error occurs at this line:
ypredc = model.classes_[np.argmax(ypred, axis = 0)]
Can anyone help me with this?

Without knowing much about the variables in your code, the error indicates that at
ypred = model.decision_function(X_test_final)
ypredc = model.classes_[np.argmax(ypred, axis = 0)]
error: index 14328 is out of bounds for axis 0 with size 2
model.classes_ has one or more dimensions, and the first is size 2; in other words, 2 rows/classes, and possibly many columns.
ypred is probably quite large, and np.argmax(ypred, ...) is the index of its largest value (along axis 0), i.e. 14328.
Maybe the correct use is model.classes_[:, np.argmax...].
You need to look at the shapes of ypred and model.classes_, and possibly other variables in this area.
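Building on that, a minimal sketch of how the shapes could be checked and the indexing done per sample, assuming model is a scikit-learn classifier such as the LinearSVC imported above (decision_function returns shape (n_samples, n_classes) for multiclass models and (n_samples,) for binary ones):
import numpy as np

ypred = model.decision_function(X_test_final)
print(ypred.shape, model.classes_.shape)      # inspect before indexing

if ypred.ndim == 2:
    # Multiclass: one score per class per sample, so take the argmax per row.
    ypredc = model.classes_[np.argmax(ypred, axis=1)]
else:
    # Binary: one score per sample; positive scores correspond to classes_[1].
    ypredc = model.classes_[(ypred > 0).astype(int)]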

Related

Generating and evaluating 200 Normals and reducing them

I am trying to estimate a normal density using a quadratic approximation in tensorflow (code 4.14 from McElreath's Statistical Rethinking).
The code I have so far is:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma= tf.linspace(start=4.0, stop=9.0, num=200)
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(df.height))
This fails because df.height has shape (352,), whereas I am creating (200,) points for my normal distribution to be evaluated on.
However
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(2))
and
tf.reduce_sum(tfd.Normal(loc=mu[0], scale=sigma[0]).log_prob(df.height))
both work.
I need to create a (200, 352) tensor - one Normal for each mu, sigma on my grid, and then evaluate it with my sample data - df. The question I have is: how do I do this?
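A minimal broadcasting sketch, assuming mu, sigma, and df as defined in the code above: giving the (200,) grid vectors a trailing axis lets log_prob broadcast against the 352 observations and produce the (200, 352) tensor directly.
import tensorflow as tf
from tensorflow_probability import distributions as tfd

heights = tf.cast(df.height.values, tf.float32)            # shape (352,)

# Batch shape (200, 1): one Normal per (mu, sigma) pair on the grid.
grid_normal = tfd.Normal(loc=mu[:, tf.newaxis], scale=sigma[:, tf.newaxis])

# log_prob broadcasts the (352,) observations against the (200, 1) batch,
# giving the desired (200, 352) tensor; summing over axis 1 yields the
# log-likelihood of the whole sample at each grid point.
log_lik = tf.reduce_sum(grid_normal.log_prob(heights), axis=1)   # shape (200,)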
I think TFP's joint distribution is a nice way to express this:
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
def mk_joint(nobs):
    return tfd.JointDistributionNamed(dict(
        mu=tfd.Normal(178, 20),
        sigma=tfd.Uniform(0, 50),
        height=lambda mu, sigma: tfd.Sample(tfd.Normal(loc=mu, scale=sigma), nobs)
    ))
joint = mk_joint(len(df))
joint.sample()
print(f'joint event shape: {joint.event_shape}')
lp = joint.log_prob(dict(mu=mu[:,tf.newaxis], sigma=sigma, height=df.height))
import matplotlib.pyplot as plt
plt.imshow(lp)
plt.xlabel('sigma')
plt.xticks(np.arange(len(sigma))[::10], sigma[::10].numpy().round(2), rotation=90)
plt.ylabel('mu')
plt.yticks(np.arange(len(mu))[::10], mu[::10].numpy().round(2))
plt.show()
=>
joint event shape: {'sigma': TensorShape([]), 'mu': TensorShape([]), 'height': TensorShape([352])}
So, I figured out that one way to do it would be to create a (200, 200, 352) grid and then reshape, and the rest of the calculations follow straightforwardly.
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
means, variances, _ = tf.meshgrid(mu, sigma, np.zeros((352,)).astype(np.float32))
means = tf.reshape(means, [40000, 352])
variances = tf.reshape(variances, [40000, 352])
normal = tfd.Normal(loc=means, scale=variances)
log_lik = tf.reduce_sum(normal.log_prob(df.height), axis=1)
logprob_mu = tfd.Normal(178.0, 20.0).log_prob(means)
logprob_sigma = tfd.Uniform(low=0.0, high=50.0).log_prob(variances)
log_joint_prod = log_lik + logprob_mu[:, 0] + logprob_sigma[:, 0]
joint_prob_tf = tf.exp(log_joint_prod - tf.reduce_max(log_joint_prod))
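A possible follow-up step, assuming the 200-point grids above: the flattened (40000,) result can be reshaped back to the grid for plotting or normalization.
# With tf.meshgrid's default 'xy' indexing, rows of the reshaped grid
# correspond to sigma values and columns to mu values.
posterior_grid = tf.reshape(joint_prob_tf, [200, 200])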

I am using Python to implement linear regression on some dataset, but at this step I keep getting this error

I wrote this linear regression code and it is now giving me an error in the iterate_weights function:
index 200 is out of bounds for axis 0 with size 200
I don't know what is wrong. Also, when I update my weights they come out the same as the random values I chose above. I am using a Jupyter notebook.
Are there any mistakes?
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
#importing dataset
data = pd.read_csv('F:\WOC\linearreg.csv')
print(data.shape)
data.head()
data_arr = np.genfromtxt("F:\WOC\linearreg.csv", delimiter=",", skip_header=1)
print(data_arr)
# In[3]:
#collecting x and y
x_train = data_arr[:,1:4]
y_train = data_arr[:,4:5]
print(x_train)
print(y_train)
# In[4]:
weights_shape = y_train.shape
print(weights_shape)
r,c = x_train.shape
print(r,c)
w = np.random.randn(c,1)
w_num = len(w)
print(w)
# In[5]:
h = np.dot(x_train,w)
def cost_function():
    print(h)
    j = (1/2*r)*((h-y_train)**2)
    print('j', j)
cost_function()
# In[6]:
def iterate_weights():
    L = 0.01
    iterations = 1000
    for iterations_proceed in range(1, 1001):
        for i in range(w_num):
            for m in range(1, 201):
                w[i,0] = w[i,0] - L*((1/r)*(sum(h-y_train)*(x_train[m,i])))
    print(w)
iterate_weights()
# In[7]:
h = np.dot(x_train,w)
def cost_function1():
    j = np.sum((1/2*r)*((h-y_train)**2))
    print(j)
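For reference, a minimal sketch of why the hard-coded range trips the reported error: with 200 rows, the valid row indices run 0 through 199, so range(1, 201) eventually asks for row 200.
import numpy as np

x_train = np.zeros((200, 3))        # 200 rows, as reported in the error

print(x_train[199, 0])              # fine: the last valid row index is 199
# print(x_train[200, 0])            # IndexError: index 200 is out of bounds for axis 0 with size 200

# Iterating with range(len(x_train)), i.e. 0 .. 199, stays inside the array,
# unlike the hard-coded range(1, 201) whose last value is 200.
for m in range(len(x_train)):
    _ = x_train[m, 0]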

When should I use matrix.astype and when matrix.view?

I have a matrix of int32 integers (50k x 50k or so) which I need to convert to float32. I can do that with
# Preparation for the example
import numpy as np
n = 50_000
matrix = np.random.randint(0, 10, (n, n), dtype='int32')
# Way 1:
matrix = matrix.astype(np.float32, copy=False)
# Way 2:
matrix = matrix.view(np.float32)
When should I use which one? Is there a speed-disadvantage in later use of a view compared to a "real" numpy array?
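For context, a quick sketch of how the two calls differ on a tiny array: astype converts the numeric values into a new float32 array, while view reinterprets the existing int32 bytes as float32 without changing them.
import numpy as np

a = np.array([1, 2, 3], dtype=np.int32)

print(a.astype(np.float32))   # [1. 2. 3.]  -- values converted, new memory
print(a.view(np.float32))     # tiny denormals around 1.4e-45 -- same bytes, reinterpreted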
What I tried
Execution time analysis (creation, not later access)
import numpy as np
import timeit
def create_boxplot(duration_list, showfliers=False):
    import seaborn as sns
    import matplotlib.pyplot as plt
    import operator
    plt.figure(num=None, figsize=(8, 4), dpi=300, facecolor="w", edgecolor="k")
    sns.set(style="whitegrid")
    sorted_keys, sorted_vals = zip(
        *sorted(duration_list.items(), key=operator.itemgetter(1))
    )
    flierprops = dict(markerfacecolor="0.75", markersize=1, linestyle="none")
    ax = sns.boxplot(
        data=sorted_vals,
        width=0.3,
        orient="h",
        flierprops=flierprops,
        showfliers=showfliers,
    )
    ax.set(xlabel="Time in ms", ylabel="")
    plt.yticks(plt.yticks()[0], sorted_keys)
    plt.tight_layout()
    plt.savefig("output.png")
n = 5_000
matrix = np.random.randint(0, 2, (n, n), dtype='int32')
print(matrix.dtype)
matrix = matrix.view(np.float32)
print(matrix.dtype)
timeit_d = {}
timeit_d["repeat"] = 500
timeit_d["number"] = 3
timeit_d["setup"] = "import numpy as np; n=5_000; matrix = np.random.randint(0, 2, (n, n), dtype='int32')"
duration_list = {}
# Way 1
durations = timeit.repeat(
    "matrix2 = matrix.view(np.float32)",
    setup=timeit_d["setup"],
    repeat=timeit_d["repeat"],
    number=timeit_d["number"],
)
duration_list["view"] = durations
print("Done views")
# Way 2
durations = timeit.repeat(
    "matrix2 = matrix.astype(np.float32)",
    setup=timeit_d["setup"],
    repeat=timeit_d["repeat"],
    number=timeit_d["number"],
)
duration_list["astype"] = durations
print("Done astype")
# Visualize
create_boxplot(duration_list)
Clearly views are way faster than astype.
Memory analysis
$ valgrind --tool=massif python3 foobar.py
$ massif-visualizer massif.out.view
Clearly shows that the view option uses WAY less memory.

Solving simple ODE using scipy odeint gives straight line at 0

I am trying to solve a simple ODE:
dN/dt = N*(rho(t)-beta)/lambda
Rho is a function of time and I've generated it using linspace. The code works for other equations, but here it somehow gives a flat straight line at 0 (you can see it in the graph below). Any guidance on how to correct it?
import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
def model2(N, t, rho):
    beta_val = 0.0065
    lambda_val = 0.00002
    k = (rho - beta_val) / lambda_val
    dNdt = k*N
    print(rho)
    return dNdt
# initial condition
N0 = [0]
# number of time points
n = 200
# time points
t = np.linspace(0,200,n)
rho = np.linspace(6,9,n)
#rho =np.array([6,6.1,6.2,6.3,6.4,6.5,6.6,6.7,6.8,6.9,7.0,7.1,7.2,7.3,7.4,7.5,7.6,7.7,7.8,7.9]) # Array of constants
# store solution
NSol = np.empty_like(t)
# record initial conditions
NSol[0] = N0[0]
# solve ODE
for i in range(1,n):
    # span for next time step
    tspan = [t[i-1], t[i]]
    # solve for next step
    N = odeint(model2, N0, tspan, args=(rho[i],))
    print(N)
    # store solution for plotting
    NSol[i] = N[0][0]
    # next initial condition
    #z0 = N0[0]
# plot results
plt.plot(t,rho,'g:',label='rho(t)')
plt.plot(t,NSol,'b-',label='NSol(t)')
plt.ylabel('values')
plt.xlabel('time')
plt.legend(loc='best')
plt.show()
This is the graph I get after running this code
I modified your code (and the coefficients) to make it work.
When coefficients also depend on t, they have to be Python functions called by the derivative function. (Note, too, that the original run starts from N0 = [0]; since dN/dt = k*N, a solution that starts at zero stays at zero.)
import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
# Define
def model2(N, t, rho):
    beta_val = 0.0065
    lambda_val = 0.02
    k = (rho(t) - beta_val) / lambda_val
    dNdt = k*N
    return dNdt
def rho(t):
    return .001 + .003/20*t
# Solve
tspan = np.linspace(0, 20, 10)
N0 = .01
N = odeint(model2, N0 , tspan, args=(rho,))
# Plot
plt.plot(tspan, N, label='NSol(t)');
plt.ylabel('N');
plt.xlabel('time'); plt.legend(loc='best');
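One point worth noting about this version: because rho is passed in as a callable, odeint is called once over the whole tspan grid instead of being re-invoked step by step in a loop as in the original code.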

Wrong intercept in Spark linear regression

I am starting with Spark linear regression. I am trying to fit a line to a linear dataset, but it seems that the intercept is not adjusting correctly, or perhaps I am missing something.
With intercept=False:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=False)
This seems normal. But when I use intercept=True:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=True)
The model that I get in the last case is exactly:
(weights=[0.0353471289751], intercept=1.0005127185289888)
I have tried different datasets, step sizes, and iteration counts, but the model always converges with an intercept of about 1.
EDIT - This is the code I am using:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkContext
sc = SparkContext("local", "regression")
# Generate data
SIZE = 300
SLOPE = 0.1
BASE = -30
NOISE = 10
x = np.arange(SIZE)
delta = np.random.uniform(-NOISE,NOISE, size=(SIZE,))
y = BASE + SLOPE*x + delta
data = zip(range(len(y)), y) # zip with index
dataRDD = sc.parallelize(data)
# Normalize data
# mean = np.mean(data)
# std = np.std(data)
# dataRDD = dataRDD.map(lambda r: (r[0], (float(r[1])-mean)/std))
labeledData = dataRDD.map(lambda r: LabeledPoint(float(r[1]), [float(r[0])]))
# Create linear model
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=1000, step=0.0002, intercept=True, convergenceTol=0.000001)
print linear_model
true_vs_predicted = labeledData.map(lambda p: (p.label, linear_model.predict(p.features))).collect()
# PLOT
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid()
y_real = [x[0] for x in true_vs_predicted]
y_pred = [x[1] for x in true_vs_predicted]
plt.plot(range(len(y_real)), y_real, 'o', markersize=5, c='b')
plt.plot(range(len(y_pred)), y_pred, 'o', markersize=5, c='r')
plt.show()
This is because the number of iterations and the step size are both too small. As a result, the training process ends before reaching the optimum.
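A sketch of the kind of adjustment the answer points to, reusing dataRDD and the imports from the code above and adding back the feature scaling that was commented out in the question; the step size and iteration count are guesses and would need tuning:
# Scale the single feature so a larger step is safe, then give SGD enough
# iterations to move the intercept away from its starting value.
xs = dataRDD.map(lambda r: float(r[0]))
x_mean, x_std = xs.mean(), xs.stdev()
scaledData = dataRDD.map(lambda r: LabeledPoint(float(r[1]), [(float(r[0]) - x_mean) / x_std]))
linear_model = LinearRegressionWithSGD.train(scaledData, iterations=5000, step=0.1, intercept=True, convergenceTol=1e-6)
print(linear_model)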
