How to apply MinMaxScaler to an array that contains both string and numeric columns? - python-3.x

Please, I really need your help: I'm struggling with MinMaxScaler. I would like to apply this technique to the array below, which contains both string and numeric columns. I only want to apply it to the columns that contain numbers.
clean_tweets_no_urls = pd.DataFrame(counts_no_urls.most_common(15),
                                    columns=['words', 'count'])
clean_tweets_no_urls.head()
That's my DataFrame. Then I call:
minmax_scaling(clean_tweets_no_urls, columns=['words', 'count'])
For that, I'm getting this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-108-eeb7b44d7121> in <module>
----> 1 minmax_scaling(clean_tweets_no_urls, columns=['words', 'count'])
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\preprocessing\scaling.py in minmax_scaling(array, columns, min_val, max_val)
36
37 """
---> 38 ary_new = array.astype(float)
39 if len(ary_new.shape) == 1:
40 ary_new = ary_new[:, np.newaxis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5696 else:
5697 # else, only a single dtype is given
-> 5698 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors)
5699 return self._constructor(new_data).__finalize__(self)
5700
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
580
581 def astype(self, dtype, copy: bool = False, errors: str = "raise"):
--> 582 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
583
584 def convert(self, **kwargs):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, filter, **kwargs)
440 applied = b.apply(f, **kwargs)
441 else:
--> 442 applied = getattr(b, f)(**kwargs)
443 result_blocks = _extend_blocks(applied, result_blocks)
444
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
623 vals1d = values.ravel()
624 try:
--> 625 values = astype_nansafe(vals1d, dtype, copy=True)
626 except (ValueError, TypeError):
627 # e.g. astype_nansafe can fail on object-dtype of strings
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
895 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
896 # Explicit copy, or required since NumPy can't view from / to object.
--> 897 return arr.astype(dtype, copy=True)
898
899 return arr.view(dtype)
ValueError: could not convert string to float: 'joebiden'

Use scikit-learn's minmax_scale on just the numeric column:
from sklearn.preprocessing import minmax_scale
clean_tweets_no_urls[['count']] = minmax_scale(clean_tweets_no_urls[['count']])
pandas' select_dtypes can be used to automate finding the numeric columns.
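Here is a minimal sketch of that automation, assuming a frame shaped like the one above (the example values are hypothetical); select_dtypes picks out the numeric columns so the scaler never touches the strings:

import pandas as pd
from sklearn.preprocessing import minmax_scale

# Hypothetical frame mirroring the 'words'/'count' layout above
df = pd.DataFrame({'words': ['joebiden', 'vote', 'tweet'],
                   'count': [120, 75, 30]})

# Scale only the numeric columns; string columns are left untouched
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = minmax_scale(df[num_cols])
print(df)  # 'count' is now in [0, 1]; 'words' is unchanged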

Related

FeatureTools TypeError: unhashable type: 'set'

I'm trying this code for featuretools:
features, feature_names = ft.dfs(entityset=es, target_entity='demo',
                                 agg_primitives=['count', 'max', 'time_since_first', 'median',
                                                 'time_since_last', 'avg_time_between', 'sum', 'mean'],
                                 trans_primitives=['is_weekend', 'year', 'week',
                                                   'divide_by_feature', 'percentile'])
But I got this error:
TypeError Traceback (most recent call last)
<ipython-input-17-89e925ff895d> in <module>
3 agg_primitives = ['count', 'max', 'time_since_first', 'median', 'time_since_last', 'avg_time_between',
4 'sum', 'mean'],
----> 5 trans_primitives = ['is_weekend', 'year', 'week', 'divide_by_feature', 'percentile'])
~/.local/lib/python3.6/site-packages/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
44 ep.on_error(error=e,
45 runtime=runtime)
---> 46 raise e
47
48 # send return value
~/.local/lib/python3.6/site-packages/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
36 # call function
37 start = time.time()
---> 38 return_value = func(*args, **kwargs)
39 runtime = time.time() - start
40 except Exception as e:
~/.local/lib/python3.6/site-packages/featuretools/synthesis/dfs.py in dfs(entities, relationships, entityset, target_entity, cutoff_time, instance_ids, agg_primitives, trans_primitives, groupby_trans_primitives, allowed_paths, max_depth, ignore_entities, ignore_variables, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, chunk_size, n_jobs, dask_kwargs, verbose, return_variable_types)
226 n_jobs=n_jobs,
227 dask_kwargs=dask_kwargs,
--> 228 verbose=verbose)
229 return feature_matrix, features
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in calculate_feature_matrix(features, entityset, cutoff_time, instance_ids, entities, relationships, cutoff_time_in_index, training_window, approximate, save_progress, verbose, chunk_size, n_jobs, dask_kwargs)
265 cutoff_df_time_var=cutoff_df_time_var,
266 target_time=target_time,
--> 267 pass_columns=pass_columns)
268
269 feature_matrix = pd.concat(feature_matrix)
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in linear_calculate_chunks(chunks, feature_set, approximate, training_window, verbose, save_progress, entityset, no_unapproximated_aggs, cutoff_df_time_var, target_time, pass_columns)
496 no_unapproximated_aggs,
497 cutoff_df_time_var,
--> 498 target_time, pass_columns)
499 feature_matrix.append(_feature_matrix)
500 # Do a manual garbage collection in case objects from calculate_chunk
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in calculate_chunk(chunk, feature_set, entityset, approximate, training_window, verbose, save_progress, no_unapproximated_aggs, cutoff_df_time_var, target_time, pass_columns)
341 ids,
342 precalculated_features=precalculated_features_trie,
--> 343 training_window=window)
344
345 id_name = _feature_matrix.index.name
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/utils.py in wrapped(*args, **kwargs)
35 def wrapped(*args, **kwargs):
36 if save_progress is None:
---> 37 r = method(*args, **kwargs)
38 else:
39 time = args[0].to_pydatetime()
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/calculate_feature_matrix.py in calc_results(time_last, ids, precalculated_features, training_window)
316 ignored=all_approx_feature_set)
317
--> 318 matrix = calculator.run(ids)
319 return matrix
320
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/feature_set_calculator.py in run(self, instance_ids)
100 precalculated_trie=self.precalculated_features,
101 filter_variable=target_entity.index,
--> 102 filter_values=instance_ids)
103
104 # The dataframe for the target entity should be stored at the root of
~/.local/lib/python3.6/site-packages/featuretools/computational_backends/feature_set_calculator.py in _calculate_features_for_entity(self, entity_id, feature_trie, df_trie, full_entity_df_trie, precalculated_trie, filter_variable, filter_values, parent_data)
187 columns=columns,
188 time_last=self.time_last,
--> 189 training_window=self.training_window)
190
191 # Step 2: Add variables to the dataframe linking it to all ancestors.
~/.local/lib/python3.6/site-packages/featuretools/entityset/entity.py in query_by_values(self, instance_vals, variable_id, columns, time_last, training_window)
271
272 if columns is not None:
--> 273 df = df[columns]
274
275 return df
~/.local/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
~/.local/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
~/.local/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
2485 """Return the cached item, item represents a label indexer."""
2486 cache = self._item_cache
-> 2487 res = cache.get(item)
2488 if res is None:
2489 values = self._data.get(item)
TypeError: unhashable type: 'set'
I also tried the simplest code for deep feature synthesis (dfs), as shown below, but it still raised the same error:
features, feature_names = ft.dfs(entityset = es, target_entity = 'demo')
I'm not really sure why I'm encountering this error; any help or recommendations on how to proceed from here would be deeply appreciated.
Thanks in advance for your help!
I found a solution: my version had a bug that has since been fixed by the Featuretools team. Just pip install directly from master:
pip install --upgrade https://github.com/featuretools/featuretools/zipball/master
This fix has been released in Featuretools 0.9.1. If you upgrade to the latest version of Featuretools, the error will go away.
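To confirm which version you ended up with (a quick sanity check; __version__ is the standard attribute):

import featuretools as ft
print(ft.__version__)  # should be 0.9.1 or later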

Problems with seaborn (ndim)

I am using seaborn to plot a very simple data set. Here is what I do:
import seaborn as sns
import pandas as pd
df = pd.read_excel('myfile.xlsx')
sns.set(style="white")
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot)
g.map_upper(sns.scatterplot)
g.map_diag(sns.kdeplot, lw=3)
I get the following error: AttributeError: 'NoneType' object has no attribute 'ndim'. Weirdly, the plot is still partially drawn.
Any idea why that is the case and what I can do to solve the issue?
EDIT:
The dataframe has the following attributes:
plan_change int64
user_login float64
new_act_ratio float64
on_time int64
Unfortunately, I cannot upload the data set. However, I can say that plotting other seaborn graphs works just fine.
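For reference, here is a self-contained reproduction of the setup, with synthetic random data standing in for myfile.xlsx (the values are hypothetical; only the column names and dtypes match the frame described above):

import numpy as np
import pandas as pd
import seaborn as sns

# Synthetic stand-in for the real data, matching the listed dtypes
rng = np.random.RandomState(0)
df = pd.DataFrame({
    'plan_change': rng.randint(0, 3, 100),   # int64
    'user_login': rng.rand(100),             # float64
    'new_act_ratio': rng.rand(100),          # float64
    'on_time': rng.randint(0, 2, 100),       # int64
})

sns.set(style="white")
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot)
g.map_upper(sns.scatterplot)
g.map_diag(sns.kdeplot, lw=3)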
The full error message is the following:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-16-2dbc61abd2bd> in <module>()
3 g = sns.PairGrid(df, diag_sharey=False)
4 g.map_lower(sns.kdeplot)
----> 5 g.map_upper(sns.scatterplot)
6 g.map_diag(sns.kdeplot, lw=3)
7
/anaconda/lib/python3.5/site-packages/seaborn/axisgrid.py in map_upper(self, func, **kwargs)
1488 color = self.palette[k] if kw_color is None else kw_color
1489 func(data_k[x_var], data_k[y_var], label=label_k,
-> 1490 color=color, **kwargs)
1491
1492 self._clean_axis(ax)
/anaconda/lib/python3.5/site-packages/seaborn/relational.py in scatterplot(x, y, hue, style, size, data, palette, hue_order, hue_norm, sizes, size_order, size_norm, markers, style_order, x_bins, y_bins, units, estimator, ci, n_boot, alpha, x_jitter, y_jitter, legend, ax, **kwargs)
1333 x_bins=x_bins, y_bins=y_bins,
1334 estimator=estimator, ci=ci, n_boot=n_boot,
-> 1335 alpha=alpha, x_jitter=x_jitter, y_jitter=y_jitter, legend=legend,
1336 )
1337
/anaconda/lib/python3.5/site-packages/seaborn/relational.py in __init__(self, x, y, hue, size, style, data, palette, hue_order, hue_norm, sizes, size_order, size_norm, dashes, markers, style_order, x_bins, y_bins, units, estimator, ci, n_boot, alpha, x_jitter, y_jitter, legend)
850
851 plot_data = self.establish_variables(
--> 852 x, y, hue, size, style, units, data
853 )
854
/anaconda/lib/python3.5/site-packages/seaborn/relational.py in establish_variables(self, x, y, hue, size, style, units, data)
155 units=units
156 )
--> 157 plot_data = pd.DataFrame(plot_data)
158
159 # Option 3:
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
264 dtype=dtype, copy=copy)
265 elif isinstance(data, dict):
--> 266 mgr = self._init_dict(data, index, columns, dtype=dtype)
267 elif isinstance(data, ma.MaskedArray):
268 import numpy.ma.mrecords as mrecords
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
400 arrays = [data[k] for k in keys]
401
--> 402 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
403
404 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5382
5383 # don't force copy because getting jammed in an ndarray anyway
-> 5384 arrays = _homogenize(arrays, index, dtype)
5385
5386 # from BlockManager perspective
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _homogenize(data, index, dtype)
5693 v = lib.fast_multiget(v, oindex.values, default=NA)
5694 v = _sanitize_array(v, index, dtype=dtype, copy=False,
-> 5695 raise_cast_failure=False)
5696
5697 homogenized.append(v)
/anaconda/lib/python3.5/site-packages/pandas/core/series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
2917
2918 # scalar like
-> 2919 if subarr.ndim == 0:
2920 if isinstance(data, list): # pragma: no cover
2921 subarr = np.array(data, dtype=object)
AttributeError: 'NoneType' object has no attribute 'ndim'

fit_transform error using CountVectorizer

So I have a dataframe X which looks something like this:
X.head()
0 My wife took me here on my birthday for breakf...
1 I have no idea why some people give bad review...
3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4 General Manager Scott Petello is a good egg!!!...
6 Drop what you're doing and drive here. After I...
Name: text, dtype: object
And then,
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X)
But I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-61-8ff79b91e317> in <module>()
----> 1 X = cv.fit_transform(X)
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
790 for doc in raw_documents:
791 feature_counter = {}
--> 792 for feature in analyze(doc):
793 try:
794 feature_idx = vocabulary[feature]
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc)
264
265 return lambda doc: self._word_ngrams(
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
267
268 else:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(x)
230
231 if self.lowercase:
--> 232 return lambda x: strip_accents(x.lower())
233 else:
234 return strip_accents
~/anaconda3/lib/python3.6/site-packages/scipy/sparse/base.py in __getattr__(self, attr)
574 return self.getnnz()
575 else:
--> 576 raise AttributeError(attr + " not found")
577
578 def transpose(self, axes=None, copy=False):
AttributeError: lower not found
No idea why.
You need to specify the column name of the text data even if the dataframe has a single column.
X_countMatrix = cv.fit_transform(X['text'])
This is because CountVectorizer expects an iterable of documents as input, and when you supply a DataFrame as an argument, the only thing that gets iterated is the column names. So even if you did not get an error, the result would be incorrect. It's lucky that you got an error and had a chance to correct it.
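A quick way to see this, sketched with a hypothetical two-row frame (iterating a DataFrame yields its column labels, not its rows):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical two-row frame standing in for the reviews above
X = pd.DataFrame({'text': ["My wife took me here on my birthday",
                           "I have no idea why some people give bad reviews"]})

print(list(X))  # ['text'] -- iteration gives column names, not documents

cv = CountVectorizer()
counts = cv.fit_transform(X['text'])  # pass the Series of strings instead
print(counts.shape)                   # (2, n_features)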

Tensorflow: function.defun with a while loop in the body is throwing a shape error

I am using a while loop to calculate a cost function for memory reasons. When calculating the gradient, TensorFlow will store Nm tensors, where Nm is the number of iterations in my while loop (this causes the same memory issues I had with the original energy functions). I do not want that, as I don't have enough memory. So I want to register a new op, along with a gradient function, that both use a while loop. However, I am having issues using function.defun together with a while loop. To simplify things, I have a small test example below:
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.framework import function

def _run(tensor):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        res = sess.run(tensor)
        return res

@function.Defun(tf.float32, tf.float32, func_name='tf_test_log')  # ,grad_func=tf_test_logGrad)
def tf_test_log(t_x, t_y):
    # N = t_x.shape[0].value
    condition = lambda i, m1: i < N

    def body(index, x):
        # return [(index + 1), tf.concat([x, tf.expand_dims(tf.exp(tf.add(t_x[:, index], t_y[:, index])), 1)], 1)]
        return [(index + 1), tf.add(x, tf.exp(tf.add(t_x[:, 0], t_y[:, 0])))]

    i0 = tf.constant(0, dtype=tf.int32)
    m0 = tf.zeros([N, 1], dType)
    ijk_0 = [i0, m0]
    L, t_log_x = tf.while_loop(condition, body, ijk_0,
                               shape_invariants=[i0.get_shape(),
                                                 tf.TensorShape([N, None])])
    return t_log_x

dType = tf.float32
N = np.int32(100)
t_N = tf.constant(N, dtype=tf.int32)
t_x = tf.constant(np.random.randn(N, N), dtype=dType)
t_y = tf.constant(np.random.randn(N, N), dtype=dType)

ys = _run(tf_test_log(t_x, t_y))
When I then try to test the new op, I get a ValueError:
The shape for while/Merge_1:0 is not an invariant for the loop. It enters the loop with shape (100, ?), but has shape <unknown> after one iteration. Provide shape invariants using either the shape_invariants argument of tf.while_loop or set_shape() on the loop variables.
Note that if I use a concatenate operation (instead of the add operation that gets returned by my while loop), I do not get any issues.
However, if I do not set N as a global variable (i.e. if I instead do N = t_x.shape[0] inside the body of the tf_test_log function), I get a ValueError:
ValueError: Cannot convert a partially known TensorShape to a Tensor: (?, 1)
What is wrong with my code? Any help is greatly appreciated!
I am using Python 3.5 on Ubuntu 16.04 and TensorFlow 1.4.
Full output:
ValueError Traceback (most recent call last)
~/Documents/TheEffingPhDHatersGonnaHate/PAM/defun_while.py in <module>()
51 t_x = tf.constant(np.random.randn(N,N),dtype = dType)
52 t_y = tf.constant(np.random.randn(N,N),dtype = dType)
---> 53 ys = _run(tf_test_log(t_x,t_y))
54
55
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in __call__(self, *args, **kwargs)
503
504 def __call__(self, *args, **kwargs):
--> 505 self.add_to_graph(ops.get_default_graph())
506 args = [ops.convert_to_tensor(_) for _ in args] + self._extra_inputs
507 ret, op = _call(self._signature, *args, **kwargs)
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in add_to_graph(self, g)
484 def add_to_graph(self, g):
485 """Adds this function into the graph g."""
--> 486 self._create_definition_if_needed()
487
488 # Adds this function into 'g'.
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed(self)
319 """Creates the function definition if it's not created yet."""
320 with context.graph_mode():
--> 321 self._create_definition_if_needed_impl()
322
323 def _create_definition_if_needed_impl(self):
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed_impl(self)
336 # Call func and gather the output tensors.
337 with vs.variable_scope("", custom_getter=temp_graph.getvar):
--> 338 outputs = self._func(*inputs)
339
340 # There is no way of distinguishing between a function not returning
~/Documents/TheEffingPhDHatersGonnaHate/PAM/defun_while.py in tf_test_log(t_x, t_y)
39 L,t_log_x = tf.while_loop(condition,body,ijk_0,
40 shape_invariants=[i0.get_shape(),
---> 41 tf.TensorShape([N,None])]
42 )
43 return t_log_x
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py in while_loop(cond, body, loop_vars, shape_invariants, parallel_iterations, back_prop, swap_memory, name)
2814 loop_context = WhileContext(parallel_iterations, back_prop, swap_memory) # pylint: disable=redefined-outer-name
2815 ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
-> 2816 result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
2817 return result
2818
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py in BuildLoop(self, pred, body, loop_vars, shape_invariants)
2638 self.Enter()
2639 original_body_result, exit_vars = self._BuildLoop(
-> 2640 pred, body, original_loop_vars, loop_vars, shape_invariants)
2641 finally:
2642 self.Exit()
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py in _BuildLoop(self, pred, body, original_loop_vars, loop_vars, shape_invariants)
2619 for m_var, n_var in zip(merge_vars, next_vars):
2620 if isinstance(m_var, ops.Tensor):
-> 2621 _EnforceShapeInvariant(m_var, n_var)
2622
2623 # Exit the loop.
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py in _EnforceShapeInvariant(merge_var, next_var)
576 "Provide shape invariants using either the `shape_invariants` "
577 "argument of tf.while_loop or set_shape() on the loop variables."
--> 578 % (merge_var.name, m_shape, n_shape))
579 else:
580 if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
ValueError: The shape for while/Merge_1:0 is not an invariant for the loop. It enters the loop with shape (100, ?), but has shape <unknown> after one iteration. Provide shape invariants using either the `shape_invariants` argument of tf.while_loop or set_shape() on the loop variables.
Thanks @Alexandre Passos for the suggestion in the comment above!
The following piece of code is a modification of the original, with a set_shape call added inside the body.
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.framework import function

def _run(tensor):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        res = sess.run(tensor)
        return res

@function.Defun(tf.float32, tf.float32, tf.float32, func_name='tf_test_logGrad')
def tf_test_logGrad(t_x, t_y, grad):
    return grad

@function.Defun(tf.float32, tf.float32, func_name='tf_test_log')  # ,grad_func=tf_test_logGrad)
def tf_test_log(t_x, t_y):
    # N = t_x.shape[0].value
    condition = lambda i, m1: i < N

    def body(index, x):
        # return [(index + 1), tf.concat([x, tf.expand_dims(tf.exp(tf.add(t_x[:, index], t_y[:, index])), 1)], 1)]
        x = tf.add(x, tf.exp(tf.add(t_x[:, 0], t_y[:, 0])))
        x.set_shape([N])  # pin the loop variable's static shape so the invariant holds
        return [(index + 1), x]

    i0 = tf.constant(0, dtype=tf.int32)
    m0 = tf.zeros([N], dType)
    ijk_0 = [i0, m0]
    L, t_log_x = tf.while_loop(condition, body, ijk_0,
                               shape_invariants=[i0.get_shape(),
                                                 tf.TensorShape([N])])
    return t_log_x

dType = tf.float32
N = np.int32(100)
t_N = tf.constant(N, dtype=tf.int32)
t_x = tf.constant(np.random.randn(N, N), dtype=dType)
t_y = tf.constant(np.random.randn(N, N), dtype=dType)

ys = _run(tf_test_log(t_x, t_y))
The issue of the global N still persists: you still need to set the shape of the loop tensors using a global variable defined outside of the defun decorator. If you try to get it from the shape of the inputs to the defun decorator, you get:
TypeError Traceback (most recent call last)
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py in zeros(shape, dtype, name)
1438 shape = tensor_shape.as_shape(shape)
-> 1439 output = constant(zero, shape=shape, dtype=dtype, name=name)
1440 except (TypeError, ValueError):
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py in constant(value, dtype, shape, name, verify_shape)
207 tensor_util.make_tensor_proto(
--> 208 value, dtype=dtype, shape=shape, verify_shape=verify_shape))
209 dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/tensor_util.py in make_tensor_proto(values, dtype, shape, verify_shape)
379 # exception when dtype is set to np.int64
--> 380 if shape is not None and np.prod(shape, dtype=np.int64) == 0:
381 nparray = np.empty(shape, dtype=np_dt)
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/numpy/core/fromnumeric.py in prod(a, axis, dtype, out, keepdims)
2517 return _methods._prod(a, axis=axis, dtype=dtype,
-> 2518 out=out, **kwargs)
2519
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/numpy/core/_methods.py in _prod(a, axis, dtype, out, keepdims)
34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):
---> 35 return umr_prod(a, axis, dtype, out, keepdims)
36
TypeError: __int__ returned non-int (type NoneType)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~/Documents/TheEffingPhDHatersGonnaHate/PAM/defun_while.py in <module>()
52 t_x = tf.constant(np.random.randn(N,N),dtype = dType)
53 t_y = tf.constant(np.random.randn(N,N),dtype = dType)
---> 54 ys = _run(tf_test_log(t_x,t_y))
55
56
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in __call__(self, *args, **kwargs)
503
504 def __call__(self, *args, **kwargs):
--> 505 self.add_to_graph(ops.get_default_graph())
506 args = [ops.convert_to_tensor(_) for _ in args] + self._extra_inputs
507 ret, op = _call(self._signature, *args, **kwargs)
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in add_to_graph(self, g)
484 def add_to_graph(self, g):
485 """Adds this function into the graph g."""
--> 486 self._create_definition_if_needed()
487
488 # Adds this function into 'g'.
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed(self)
319 """Creates the function definition if it's not created yet."""
320 with context.graph_mode():
--> 321 self._create_definition_if_needed_impl()
322
323 def _create_definition_if_needed_impl(self):
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed_impl(self)
336 # Call func and gather the output tensors.
337 with vs.variable_scope("", custom_getter=temp_graph.getvar):
--> 338 outputs = self._func(*inputs)
339
340 # There is no way of distinguishing between a function not returning
~/Documents/TheEffingPhDHatersGonnaHate/PAM/defun_while.py in tf_test_log(t_x, t_y)
33
34 i0 = tf.constant(0,dtype=tf.int32)
---> 35 m0 = tf.zeros([N],dType)
36
37
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py in zeros(shape, dtype, name)
1439 output = constant(zero, shape=shape, dtype=dtype, name=name)
1440 except (TypeError, ValueError):
-> 1441 shape = ops.convert_to_tensor(shape, dtype=dtypes.int32, name="shape")
1442 output = fill(shape, constant(zero, dtype=dtype), name=name)
1443 assert output.dtype.base_dtype == dtype
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in convert_to_tensor(value, dtype, name, preferred_dtype)
834 name=name,
835 preferred_dtype=preferred_dtype,
--> 836 as_ref=False)
837
838
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in internal_convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, ctx)
924
925 if ret is None:
--> 926 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
927
928 if ret is NotImplemented:
~/environments/tf_1_4_gpu/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py in _tensor_shape_tensor_conversion_function(s, dtype, name, as_ref)
248 if not s.is_fully_defined():
249 raise ValueError(
--> 250 "Cannot convert a partially known TensorShape to a Tensor: %s" % s)
251 s_list = s.as_list()
252 int64_value = 0
ValueError: Cannot convert a partially known TensorShape to a Tensor: (?,)

sklearn RidgeCV with sample_weight

I'm trying to do a weighted Ridge Regression with sklearn. However, the code breaks when I call the fit method. The exception I get is:
Exception: Data must be 1-dimensional
But I'm sure (by checking through print-statements) that the data I'm passing has the right shapes.
print temp1.shape #(781, 21)
print temp2.shape #(781,)
print weights.shape #(781,)
result=RidgeCV(normalize=True).fit(temp1,temp2,sample_weight=weights)
What could be going wrong?
Here's the whole output:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-65-a5b1eba5d9cf> in <module>()
22
23
---> 24 result=RidgeCV(normalize=True).fit(temp2,temp1, sample_weight=weights)
25
26
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
868 gcv_mode=self.gcv_mode,
869 store_cv_values=self.store_cv_values)
--> 870 estimator.fit(X, y, sample_weight=sample_weight)
871 self.alpha_ = estimator.alpha_
872 if self.store_cv_values:
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
793 else alpha)
794 if error:
--> 795 out, c = _errors(weighted_alpha, y, v, Q, QT_y)
796 else:
797 out, c = _values(weighted_alpha, y, v, Q, QT_y)
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in _errors(self, alpha, y, v, Q, QT_y)
685 w = 1.0 / (v + alpha)
686 c = np.dot(Q, self._diag_dot(w, QT_y))
--> 687 G_diag = self._decomp_diag(w, Q)
688 # handle case where y is 2-d
689 if len(y.shape) != 1:
/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/ridge.pyc in _decomp_diag(self, v_prime, Q)
672 def _decomp_diag(self, v_prime, Q):
673 # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T))
--> 674 return (v_prime * Q ** 2).sum(axis=-1)
675
676 def _diag_dot(self, D, B):
/usr/local/lib/python2.7/dist-packages/pandas/core/ops.pyc in wrapper(left, right, name)
531 return left._constructor(wrap_results(na_op(lvalues, rvalues)),
532 index=left.index, name=left.name,
--> 533 dtype=dtype)
534 return wrapper
535
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __init__(self, data, index, dtype, name, copy, fastpath)
209 else:
210 data = _sanitize_array(data, index, dtype, copy,
--> 211 raise_cast_failure=True)
212
213 data = SingleBlockManager(data, index, fastpath=True)
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
2683 elif subarr.ndim > 1:
2684 if isinstance(data, np.ndarray):
-> 2685 raise Exception('Data must be 1-dimensional')
2686 else:
2687 subarr = _asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
The error seems to be due to sample_weight being a pandas Series rather than a NumPy array:
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV

temp1 = pd.DataFrame(np.random.rand(781, 21))
temp2 = pd.Series(temp1.sum(1))
weights = pd.Series(1 + 0.1 * np.random.rand(781))
result = RidgeCV(normalize=True).fit(temp1, temp2, sample_weight=weights)
# Exception: Data must be 1-dimensional
If you use a numpy array instead, the error goes away:
result = RidgeCV(normalize=True).fit(temp1, temp2, sample_weight=weights.values)
This seems to be a bug; I've opened a scikit-learn issue to report this.
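More generally (a hedged pattern, not part of the reported fix), you can coerce any pandas object to a NumPy array up front before passing it as sample_weight:

import numpy as np

weights_arr = np.asarray(weights)  # works for a Series, list, or array alike
result = RidgeCV(normalize=True).fit(temp1, temp2, sample_weight=weights_arr)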
