ValueError: Incompatible dimension for X and Y matrices in cosine similarity - python-3.x

I'm trying to find the cosine similarity between two sets of documents in Python 3.x, so I wrote the following code:
count_vectorizer = CountVectorizer(stop_words=stopwords)
sparse_matrix = count_vectorizer.fit_transform(formatted0)
doc_term_matrix = sparse_matrix.todense()
sparse_matrix = count_vectorizer.fit_transform(formatted)
doc_term_matrix1 = sparse_matrix.todense()
z=cosine_similarity(doc_term_matrix,doc_term_matrix1)
The length of doc_term_matrix is 29982 and the length of doc_term_matrix1 is 346, but I'm getting the following error message:
/opt/conda/lib/python3.9/site-packages/sklearn/utils/validation.py:593: FutureWarning: np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html
warnings.warn(
/opt/conda/lib/python3.9/site-packages/sklearn/utils/validation.py:593: FutureWarning: np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html
warnings.warn(
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_888/79579735.py in <module>
----> 1 z419=cosineSimilarity(splittedCosine419,doc_term_matrix)
2 z419
/tmp/ipykernel_888/2223236548.py in cosineSimilarity(splitted_german, doc_term_matrix)
8 sparse_matrix = count_vectorizer.fit_transform(formatted)
9 doc_term_matrix1 = sparse_matrix.todense()
---> 10 z=cosine_similarity(doc_term_matrix1,doc_term_matrix)
11 return z
/opt/conda/lib/python3.9/site-packages/sklearn/metrics/pairwise.py in cosine_similarity(X, Y, dense_output)
1249 # to avoid recursive import
1250
-> 1251 X, Y = check_pairwise_arrays(X, Y)
1252
1253 X_normalized = normalize(X, copy=True)
/opt/conda/lib/python3.9/site-packages/sklearn/metrics/pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype, accept_sparse, force_all_finite, copy)
179 )
180 elif X.shape[1] != Y.shape[1]:
--> 181 raise ValueError(
182 "Incompatible dimension for X and Y matrices: "
183 "X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1])
ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 1027 while Y.shape[1] == 10346
Can you suggest steps to resolve this issue?

Related

I keep getting "TypeError: only integer scalar arrays can be converted to a scalar index" while using custom-defined metric in KNeighborsClassifier

I am using a custom-defined metric in SKlearn's KNeighborsClassifier. Here's my code:
def chi_squared(x,y):
return np.divide(np.square(np.subtract(x,y)), np.sum(x,y))
The above function is an implementation of the chi-squared distance function. I have used NumPy functions because, according to the scikit-learn docs, a metric function takes two one-dimensional numpy arrays.
I have passed the chi_squared function as an argument to KNeighborsClassifier().
knn = KNeighborsClassifier(algorithm='ball_tree', metric=chi_squared)
However, I keep getting the following error:
TypeError Traceback (most recent call last)
<ipython-input-29-d2a365ebb538> in <module>
4
5 knn = KNeighborsClassifier(algorithm='ball_tree', metric=chi_squared)
----> 6 knn.fit(X_train, Y_train)
7 predictions = knn.predict(X_test)
8 print(accuracy_score(Y_test, predictions))
~/.local/lib/python3.8/site-packages/sklearn/neighbors/_classification.py in fit(self, X, y)
177 The fitted k-nearest neighbors classifier.
178 """
--> 179 return self._fit(X, y)
180
181 def predict(self, X):
~/.local/lib/python3.8/site-packages/sklearn/neighbors/_base.py in _fit(self, X, y)
497
498 if self._fit_method == 'ball_tree':
--> 499 self._tree = BallTree(X, self.leaf_size,
500 metric=self.effective_metric_,
501 **self.effective_metric_params_)
sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._ball_tree.BinaryTree.__init__()
sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._ball_tree.BinaryTree._recursive_build()
sklearn/neighbors/_ball_tree.pyx in sklearn.neighbors._ball_tree.init_node()
sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._ball_tree.BinaryTree.rdist()
sklearn/neighbors/_dist_metrics.pyx in sklearn.neighbors._dist_metrics.DistanceMetric.rdist()
sklearn/neighbors/_dist_metrics.pyx in sklearn.neighbors._dist_metrics.PyFuncDistance.dist()
sklearn/neighbors/_dist_metrics.pyx in sklearn.neighbors._dist_metrics.PyFuncDistance._dist()
<ipython-input-29-d2a365ebb538> in chi_squared(x, y)
1 def chi_squared(x,y):
----> 2 return np.divide(np.square(np.subtract(x,y)), np.sum(x,y))
3
4
5 knn = KNeighborsClassifier(algorithm='ball_tree', metric=chi_squared)
<__array_function__ internals> in sum(*args, **kwargs)
~/.local/lib/python3.8/site-packages/numpy/core/fromnumeric.py in sum(a, axis, dtype, out, keepdims, initial, where)
2239 return res
2240
-> 2241 return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
2242 initial=initial, where=where)
2243
~/.local/lib/python3.8/site-packages/numpy/core/fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
85 return reduction(axis=axis, out=out, **passkwargs)
86
---> 87 return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
88
89
TypeError: only integer scalar arrays can be converted to a scalar index
I can reproduce your error message with:
In [173]: x=np.arange(3); y=np.array([2,3,4])
In [174]: np.sum(x,y)
Traceback (most recent call last):
File "<ipython-input-174-1a1a267ebd82>", line 1, in <module>
np.sum(x,y)
File "<__array_function__ internals>", line 5, in sum
File "/usr/local/lib/python3.8/dist-packages/numpy/core/fromnumeric.py", line 2247, in sum
return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
File "/usr/local/lib/python3.8/dist-packages/numpy/core/fromnumeric.py", line 87, in _wrapreduction
return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
TypeError: only integer scalar arrays can be converted to a scalar index
Correct use(s) of np.sum:
In [175]: np.sum(x)
Out[175]: 3
In [177]: np.sum(np.arange(6).reshape(2,3), axis=0)
Out[177]: array([3, 5, 7])
In [178]: np.sum(np.arange(6).reshape(2,3), 0)
Out[178]: array([3, 5, 7])
(re)read the np.sum docs if necessary!
Using np.add instead of np.sum:
In [179]: np.add(x,y)
Out[179]: array([2, 4, 6])
In [180]: x+y
Out[180]: array([2, 4, 6])
The following should be equivalent:
np.divide(np.square(np.subtract(x,y)), np.add(x,y))
(x-y)**2/(x+y)

Specifying the columns using strings is only supported for pandas DataFrames

I want to one-hot encode several columns and have tried several solutions, including simple one-hot encoding, ColumnTransformer, make_column_transformer, Pipeline, and get_dummies, but each time I get a different error.
x = dataset.iloc[:, :11].values
y = dataset.iloc[:, 11].values
""" data encoding """
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# oe = OrdinalEncoder()
# x = oe.fit_transform(x)
non_cat = ["Make", "Model", "Vehicle", "Transmission", "Fuel"]
onehot_cat = ColumnTransformer([
("categorical", OrdinalEncoder(), non_cat),
("onehot_categorical", OneHotEncoder(), non_cat)],
remainder= "passthrough")
x = onehot_cat.fit_transform(x)
error:
[['ACURA' 'ILX' 'COMPACT' ... 6.7 8.5 33]
['ACURA' 'ILX' 'COMPACT' ... 7.7 9.6 29]
['ACURA' 'ILX HYBRID' 'COMPACT' ... 5.8 5.9 48]
...
['VOLVO' 'XC60 T6 AWD' 'SUV - SMALL' ... 8.6 10.3 27]
['VOLVO' 'XC90 T5 AWD' 'SUV - STANDARD' ... 8.3 9.9 29]
['VOLVO' 'XC90 T6 AWD' 'SUV - STANDARD' ... 8.7 10.7 26]]
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
424 try:
--> 425 all_columns = X.columns
426 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-4-4008371c305f> in <module>
24 ("onehot_categorical", OneHotEncoder(), non_cat)],
25 remainder= "passthrough")
---> 26 x = onehot_cat.fit_transform(x)
27
28 print('OneHotEncode = ', x.shape)
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
527 self._validate_transformers()
528 self._validate_column_callables(X)
--> 529 self._validate_remainder(X)
530
531 result = self._fit_transform(X, y, _fit_transform_one)
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_remainder(self, X)
325 cols = []
326 for columns in self._columns:
--> 327 cols.extend(_get_column_indices(X, columns))
328
329 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
425 all_columns = X.columns
426 except AttributeError:
--> 427 raise ValueError("Specifying the columns using strings is only "
428 "supported for pandas DataFrames")
429 if isinstance(key, str):
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
I got a similar error when trying to make a prediction using a model. It was expecting a DataFrame, but I was sending a numpy array instead. So I changed it from:
prediction = monitor_model.predict(s_df.to_numpy())
to:
prediction = monitor_model.predict(s_df)

How to define piecewise function in Python using numpy?

The following is the function I want to implement in Python. I am getting type errors when defining the function. I tried defining it using the numpy.piecewise function and also using plain if/elif statements. I want to be able to evaluate this function at different points, as well as in expressions like f(X-1), etc.
This is my code:
from numpy import piecewise
from scipy import *
from sympy.abc import x
from sympy.utilities.lambdify import lambdify, implemented_function
from sympy import Function
from sympy import *
h = 0.5
a = -1
n = 2
x = Symbol('x')
expr = piecewise((0, x-a <= -2*h), ((1/6)*(2*h+(x-a))**3, -2*h<=x-a<=-h), (2*h**3/3-0.5*(x-a)**2*(2*h+(x-a)), -h<= x-a<= 0), (2*(h**3/3)-0.5*(x-a)**2*(2*h+(x-a)), 0<=x-a<=2*h), ((1/6)*(2*h-(x-a))**3, h<=x-a<=2*h), (0, x-a<=2*h))
p = lambdify((x, a,b,h), expr)
def basis(x,a,b, h):
if x <= a-2*h:
return 0;
elif (x<=a-h) or (x >=2*h):
return (1/6)*(2*h+(x-a))**3
elif (x-a<= 0) or (x-a >= -h):
return (2*h**3/3-0.5*(x-a)**2*(2*h+(x-a)));
elif (x<=2*h+a) or (x >= 0):
return (2*(h**3/3)-0.5*(x-a)**2*(2*h+(x-a)));
elif (x<=a+2*h) or (x >= h):
return (1/6)*(2*h-(x-a))**3;
elif x-a<=2*h:
return 0
basis(x, -1,0.5,0)
Both ways I get this :
raise TypeError("cannot determine truth value of Relational")
TypeError: cannot determine truth value of Relational
You can use sympy's lambdify function to generate the numpy piecewise function. This is a simpler example but shows the general idea:
In [15]: from sympy import symbols, Piecewise
In [16]: x, a = symbols('x, a')
In [17]: expr = Piecewise((x, x>a), (0, True))
In [18]: expr
Out[18]:
⎧x for a < x
⎨
⎩0 otherwise
In [19]: from sympy import lambdify
In [20]: fun = lambdify((x, a), expr)
In [21]: fun([1, 3], [4, 2])
Out[21]: array([0., 3.])
In [22]: import inspect
In [23]: print(inspect.getsource(fun))
def _lambdifygenerated(x, a):
return (select([less(a, x),True], [x,0], default=nan))
Sorry about the length of this answer, but I think you need to see the full debugging process. I had to look at the tracebacks and test small pieces of your code to identify the exact problem. I've seen a lot of the numpy ambiguity error, but not this sympy relational error.
===
Let's look at the whole traceback, not just one line of it. At the very least we need to identify which line of your code is producing the problem.
In [4]: expr = np.piecewise((0, x-a <= -2*h), ((1/6)*(2*h+(x-a))**3, -2*h<=x-a<
...: =-h), (2*h**3/3-0.5*(x-a)**2*(2*h+(x-a)), -h<= x-a<= 0), (2*(h**3/3)-0.5
...: *(x-a)**2*(2*h+(x-a)), 0<=x-a<=2*h), ((1/6)*(2*h-(x-a))**3, h<=x-a<=2*h)
...: , (0, x-a<=2*h))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-893bb4b36321> in <module>
----> 1 expr = np.piecewise((0, x-a <= -2*h), ((1/6)*(2*h+(x-a))**3, -2*h<=x-a<=-h), (2*h**3/3-0.5*(x-a)**2*(2*h+(x-a)), -h<= x-a<= 0), (2*(h**3/3)-0.5*(x-a)**2*(2*h+(x-a)), 0<=x-a<=2*h), ((1/6)*(2*h-(x-a))**3, h<=x-a<=2*h), (0, x-a<=2*h))
/usr/local/lib/python3.8/dist-packages/sympy/core/relational.py in __nonzero__(self)
382
383 def __nonzero__(self):
--> 384 raise TypeError("cannot determine truth value of Relational")
385
386 __bool__ = __nonzero__
TypeError: cannot determine truth value of Relational
While np.piecewise is a numpy function, because x is a sympy.Symbol, the equations are sympy expressions. numpy and sympy are not well integrated. Some things work, many others don't.
Did you try a small expression? Good programming practice is to start with small pieces, making sure those work first.
Let's try something smaller:
In [8]: expr = np.piecewise((0, x-a <= -2*h),
...: ((1/6)*(2*h+(x-a))**3, -2*h<=x-a<=-h))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-8-37ff62e49efb> in <module>
1 expr = np.piecewise((0, x-a <= -2*h),
----> 2 ((1/6)*(2*h+(x-a))**3, -2*h<=x-a<=-h))
/usr/local/lib/python3.8/dist-packages/sympy/core/relational.py in __nonzero__(self)
382
383 def __nonzero__(self):
--> 384 raise TypeError("cannot determine truth value of Relational")
385
386 __bool__ = __nonzero__
TypeError: cannot determine truth value of Relational
and smaller pieces:
In [10]: (0, x-a <= -2*h)
Out[10]: (0, x + 1 ≤ -1.0)
In [11]: ((1/6)*(2*h+(x-a))**3, -2*h<=x-a<=-h)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-7bd9f95d077d> in <module>
----> 1 ((1/6)*(2*h+(x-a))**3, -2*h<=x-a<=-h)
/usr/local/lib/python3.8/dist-packages/sympy/core/relational.py in __nonzero__(self)
382
383 def __nonzero__(self):
--> 384 raise TypeError("cannot determine truth value of Relational")
385
386 __bool__ = __nonzero__
TypeError: cannot determine truth value of Relational
In [12]: (1/6)*(2*h+(x-a))**3
Out[12]:
3
1.33333333333333⋅(0.5⋅x + 1)
But:
In [13]: -2*h<=x-a<=-h
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-13-5ffb419cd443> in <module>
----> 1 -2*h<=x-a<=-h
/usr/local/lib/python3.8/dist-packages/sympy/core/relational.py in __nonzero__(self)
382
383 def __nonzero__(self):
--> 384 raise TypeError("cannot determine truth value of Relational")
385
386 __bool__ = __nonzero__
TypeError: cannot determine truth value of Relational
Simplify further:
In [14]: 0 < x < 3
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-14-59ba4ce00627> in <module>
----> 1 0 < x < 3
/usr/local/lib/python3.8/dist-packages/sympy/core/relational.py in __nonzero__(self)
382
383 def __nonzero__(self):
--> 384 raise TypeError("cannot determine truth value of Relational")
385
386 __bool__ = __nonzero__
TypeError: cannot determine truth value of Relational
While a < b < c is allowed for regular Python variables and scalars, it does not work for numpy arrays, and evidently doesn't work for sympy variables either.
So the immediate problem has nothing to do with numpy. You are using invalid sympy expressions!
===
Your basis function reveals an aspect of the same problem. Again we need to look at the FULL traceback, and then test portions to identify the exact problem expression.
In [16]: basis(x, -1,0.5,0)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-b328f95b3c79> in <module>
----> 1 basis(x, -1,0.5,0)
<ipython-input-15-c6436540e3f3> in basis(x, a, b, h)
1 def basis(x,a,b, h):
----> 2 if x <= a-2*h:
3 return 0;
4 elif (x<=a-h) or (x >=2*h):
5 return (1/6)*(2*h+(x-a))**3
/usr/local/lib/python3.8/dist-packages/sympy/core/relational.py in __nonzero__(self)
382
383 def __nonzero__(self):
--> 384 raise TypeError("cannot determine truth value of Relational")
385
386 __bool__ = __nonzero__
TypeError: cannot determine truth value of Relational
This expression is a sympy relational:
In [17]: x <= -1
Out[17]: x ≤ -1
But we can't use such a relational in a Python if statement.
In [18]: if x <= -1: pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-18-b56148a48367> in <module>
----> 1 if x <= -1: pass
/usr/local/lib/python3.8/dist-packages/sympy/core/relational.py in __nonzero__(self)
382
383 def __nonzero__(self):
--> 384 raise TypeError("cannot determine truth value of Relational")
385
386 __bool__ = __nonzero__
TypeError: cannot determine truth value of Relational
Python's if is a simple True/False switch; its argument must evaluate to one or the other. The error is telling us that a sympy.Relational cannot be used that way. 0 < x < 1 is a variation on that basic Python if (it evaluates 0<x and x<1 and then combines the two results with and).
A variation on this that we often see in numpy (and pandas) is:
In [20]: 0 < np.array([0,1,2])
Out[20]: array([False, True, True])
In [21]: 0 < np.array([0,1,2])<1
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-bc1039cec1fc> in <module>
----> 1 0 < np.array([0,1,2])<1
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
The numpy expression has multiple True/False values, and can't be used in a Python expression that requires a simple True/False.
edit
Correctly expanding the two sided tests:
In [23]: expr = np.piecewise((0, x-a <= -2*h),
...: ((1/6)*(2*h+(x-a))**3, (-2*h<=x-a)&(x-a<=-h)),
...: (2*h**3/3-0.5*(x-a)**2*(2*h+(x-a)), (-h<= x-a)&(x-a<= 0)),
...: (2*(h**3/3)-0.5*(x-a)**2*(2*h+(x-a)), (0<=x-a)&(x-a<=2*h)),
...: ((1/6)*(2*h-(x-a))**3, (h<=x-a)&(x-a<=2*h)), (0, x-a<=2*h))
In [24]: expr
Out[24]:
array([-0.5*(x + 1)**2*(x + 2.0) + 0.0833333333333333,
-0.5*(x + 1)**2*(x + 2.0) + 0.0833333333333333], dtype=object)
In [26]: p = lambdify((x,), expr)
x is the only sympy symbol in expr.
Looking at the resulting function:
In [27]: print(p.__doc__)
Created with lambdify. Signature:
func(x)
Expression:
[-0.5*(x + 1)**2*(x + 2.0) + 0.0833333333333333 -0.5*(x + 1)**2*(x + 2.0)...
Source code:
def _lambdifygenerated(x):
return ([-0.5*(x + 1)**2*(x + 2.0) + 0.0833333333333333, -0.5*(x + 1)**2*(x + 2.0) + 0.0833333333333333])

tfidf first time, using it on a Pandas series that has a list per entry

Data looks like this :
data_clean2.head(3)
text target
0 [deed, reason, earthquak, may, allah, forgiv, u] 1
1 [forest, fire, near, la, rong, sask, canada] 1
2 [resid, ask, shelter, place, notifi, offic, evacu, shelter, place, order, expect] 1
I got this by stemming and lemmatizing the sentence and tokenizing before that. ( Hope that is right).
Now I want to use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
It gives me the following error :
AttributeError Traceback (most recent call last)
<ipython-input-140-6f68d1115c5f> in <module>
1 vectorizer = TfidfVectorizer()
----> 2 vectors = vectorizer.fit_transform(data_clean2['text'])
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1650 """
1651 self._check_params()
-> 1652 X = super().fit_transform(raw_documents)
1653 self._tfidf.fit(X)
1654 # X is already a transformed view of raw_documents so
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1056
1057 vocabulary, X = self._count_vocab(raw_documents,
-> 1058 self.fixed_vocabulary_)
1059
1060 if self.binary:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
968 for doc in raw_documents:
969 feature_counter = {}
--> 970 for feature in analyze(doc):
971 try:
972 feature_idx = vocabulary[feature]
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
350 tokenize)
351 return lambda doc: self._word_ngrams(
--> 352 tokenize(preprocess(self.decode(doc))), stop_words)
353
354 else:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'list' object has no attribute 'lower'
I know that I somehow cannot use it on a list, so what should I do here? Should I try to turn each list back into a string?
Yes, first convert to string using:
data_clean2['text'] = data_clean2['text'].apply(', '.join)
Then use:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data_clean2['text'])
v = pd.DataFrame(vectors.toarray(), columns = vectorizer.get_feature_names())

sklearn RidgeCV with sample_weight

I'm trying to do a weighted Ridge Regression with sklearn. However, the code breaks when I call the fit method. The exception I get is :
Exception: Data must be 1-dimensional
But I'm sure (by checking through print-statements) that the data I'm passing has the right shapes.
print temp1.shape #(781, 21)
print temp2.shape #(781,)
print weights.shape #(781,)
result=RidgeCV(normalize=True).fit(temp1,temp2,sample_weight=weights)
What could be going wrong ??
Here's the whole output :
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-65-a5b1eba5d9cf> in <module>()
22
23
---> 24 result=RidgeCV(normalize=True).fit(temp2,temp1, sample_weight=weights)
25
26
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
868 gcv_mode=self.gcv_mode,
869 store_cv_values=self.store_cv_values)
--> 870 estimator.fit(X, y, sample_weight=sample_weight)
871 self.alpha_ = estimator.alpha_
872 if self.store_cv_values:
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
793 else alpha)
794 if error:
--> 795 out, c = _errors(weighted_alpha, y, v, Q, QT_y)
796 else:
797 out, c = _values(weighted_alpha, y, v, Q, QT_y)
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in _errors(self, alpha, y, v, Q, QT_y)
685 w = 1.0 / (v + alpha)
686 c = np.dot(Q, self._diag_dot(w, QT_y))
--> 687 G_diag = self._decomp_diag(w, Q)
688 # handle case where y is 2-d
689 if len(y.shape) != 1:
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in _decomp_diag(self, v_prime, Q)
672 def _decomp_diag(self, v_prime, Q):
673 # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T))
--> 674 return (v_prime * Q ** 2).sum(axis=-1)
675
676 def _diag_dot(self, D, B):
/usr/local/lib/python2.7/dist-packages/pandas/core/ops.pyc in wrapper(left, right, name)
531 return left._constructor(wrap_results(na_op(lvalues, rvalues)),
532 index=left.index, name=left.name,
--> 533 dtype=dtype)
534 return wrapper
535
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __init__(self, data, index, dtype, name, copy, fastpath)
209 else:
210 data = _sanitize_array(data, index, dtype, copy,
--> 211 raise_cast_failure=True)
212
213 data = SingleBlockManager(data, index, fastpath=True)
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
2683 elif subarr.ndim > 1:
2684 if isinstance(data, np.ndarray):
-> 2685 raise Exception('Data must be 1-dimensional')
2686 else:
2687 subarr = _asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
The error seems to be due to sample_weight being a Pandas Series rather than a numpy array:
from sklearn.linear_model import RidgeCV
temp1 = pd.DataFrame(np.random.rand(781, 21))
temp2 = pd.Series(temp1.sum(1))
weights = pd.Series(1 + 0.1 * np.random.rand(781))
result = RidgeCV(normalize=True).fit(temp1, temp2,
sample_weight=weights)
# Exception: Data must be 1-dimensional
If you use a numpy array instead, the error goes away:
result = RidgeCV(normalize=True).fit(temp1, temp2,
sample_weight=weights.values)
This seems to be a bug; I've opened a scikit-learn issue to report this.

Resources