How do I pass the values to CatBoost?

I'm trying to work with CatBoost and I've hit a problem that I'm really stuck on. I have a dataframe with 28 columns, 2 of which are categorical. The numerical columns contain both whole and fractional numbers, including some 0.00 values that represent not missing values but actual zeros (like 1 - 1 = 0).
I'm trying to run this:
train_cl = cb.Pool(data=ret_df.iloc[:580000, :-1], label=ret_df.iloc[:580000, -1], cat_features=cats)
evl_cl = cb.Pool(data=ret_df.iloc[580000:, :-1], label=ret_df.iloc[580000:, -1], cat_features=cats)
But I get this error:
---------------------------------------------------------------------------
CatBoostError Traceback (most recent call last)
<ipython-input-112-a515b0ab357b> in <module>
1 train_cl = cb.Pool(data=ret_df.iloc[:580000, :-1], label=ret_df.iloc[:580000, -1], cat_features=cats)
----> 2 evl_cl = cb.Pool(data=ret_df.iloc[580000:, :-1], label=ret_df.iloc[580000:, -1], cat_features=cats)
~\AppData\Local\Programs\Python\Python36\lib\site-packages\catboost\core.py in __init__(self, data, label, cat_features, text_features, embedding_features, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count, log_cout, log_cerr)
615 )
616
--> 617 self._init(data, label, cat_features, text_features, embedding_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count)
618 super(Pool, self).__init__()
619
~\AppData\Local\Programs\Python\Python36\lib\site-packages\catboost\core.py in _init(self, data, label, cat_features, text_features, embedding_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count)
1081 if label is not None:
1082 self._check_label_type(label)
-> 1083 self._check_label_empty(label)
1084 label = self._label_if_pandas_to_numpy(label)
1085 if len(np.shape(label)) == 1:
~\AppData\Local\Programs\Python\Python36\lib\site-packages\catboost\core.py in _check_label_empty(self, label)
723 """
724 if len(label) == 0:
--> 725 raise CatBoostError("Labels variable is empty.")
726
727 def _check_label_shape(self, label, samples_count):
CatBoostError: Labels variable is empty.
I've googled this error but found nothing. My hypothesis is that the 0.00 values are the problem, but I do not know how to solve this, because I literally can't replace these values with anything else.
Please, help me!
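For what it's worth, the check that raises this exception is visible in the traceback: _check_label_empty fires when len(label) == 0, i.e. the label slice itself is empty, regardless of any 0.00 values inside it. A minimal sanity check (a sketch, assuming ret_df and cats are as above) is to confirm the split point actually lies inside the dataframe:
import catboost as cb

split = 580000
print(len(ret_df))   # must be strictly greater than split, or the
                     # evaluation slice ret_df.iloc[split:] is empty
assert 0 < split < len(ret_df), "split point lies outside the dataframe"

train_cl = cb.Pool(data=ret_df.iloc[:split, :-1],
                   label=ret_df.iloc[:split, -1],
                   cat_features=cats)
evl_cl = cb.Pool(data=ret_df.iloc[split:, :-1],
                 label=ret_df.iloc[split:, -1],
                 cat_features=cats)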

Related

ImageDataBunch.from_df positional indexers are out-of-bounds

Scratching my head on this issue. I don't know how to identify the positional indexers. Am I even passing them?
I'm attempting this for my first Kaggle competition. I can read the CSV into a dataframe and make the needed edits, and I'm trying to create the ImageDataBunch so training a CNN can begin. This error pops up no matter which method I try. Any advice would be appreciated.
data = ImageDataBunch.from_df(path, df, ds_tfms=tfms, size=24)
data.classes
Backtrace
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-25-5588812820e8> in <module>
----> 1 data = ImageDataBunch.from_df(path, df, ds_tfms=tfms, size=24)
2 data.classes
/opt/conda/lib/python3.7/site-packages/fastai/vision/data.py in from_df(cls, path, df, folder, label_delim, valid_pct, seed, fn_col, label_col, suffix, **kwargs)
117 src = (ImageList.from_df(df, path=path, folder=folder, suffix=suffix, cols=fn_col)
118 .split_by_rand_pct(valid_pct, seed)
--> 119 .label_from_df(label_delim=label_delim, cols=label_col))
120 return cls.create_from_ll(src, **kwargs)
121
/opt/conda/lib/python3.7/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
477 assert isinstance(fv, Callable)
478 def _inner(*args, **kwargs):
--> 479 self.train = ft(*args, from_item_lists=True, **kwargs)
480 assert isinstance(self.train, LabelList)
481 kwargs['label_cls'] = self.train.y.__class__
/opt/conda/lib/python3.7/site-packages/fastai/data_block.py in label_from_df(self, cols, label_cls, **kwargs)
283 def label_from_df(self, cols:IntsOrStrs=1, label_cls:Callable=None, **kwargs):
284 "Label `self.items` from the values in `cols` in `self.inner_df`."
--> 285 labels = self.inner_df.iloc[:,df_names_to_idx(cols, self.inner_df)]
286 assert labels.isna().sum().sum() == 0, f"You have NaN values in column(s) {cols} of your dataframe, please fix it."
287 if is_listy(cols) and len(cols) > 1 and (label_cls is None or label_cls == MultiCategoryList):
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
701 raise IndexingError("Too many indexers")
702 try:
--> 703 self._validate_key(k, i)
704 except ValueError:
705 raise ValueError(
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in _validate_key(self, key, axis)
2007 # check that the key does not exceed the maximum size of the index
2008 if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):
-> 2009 raise IndexError("positional indexers are out-of-bounds")
2010 else:
2011 raise ValueError(f"Can only index by location with a [{self._valid_types}]")
IndexError: positional indexers are out-of-bounds
I faced this error while creating a DataBunch when my dataframe/CSV did not have a class label explicitly defined.
I created a dummy column which stored 1's for all the rows in the dataframe, and it seemed to work. Also, be sure to store your independent variable in the second column and the label (the dummy variable in this case) in the first column.
I believe this error happens if there's just one column in the Pandas DataFrame.
Thanks.
Code:
df = pd.DataFrame(lines, columns=["dummy_value", "text"])
df.to_csv("./train.csv")
data_lm = TextLMDataBunch.from_csv(path, "train.csv", min_freq=1)
Note: This is my first attempt at answering a StackOverflow question. Hope it helped!
This error also appears when your dataset is not correctly split between training and validation.
In the case of dataframes, fastai assumes there is a column is_valid that indicates which rows are in the validation set.
If all rows have True, then the training set is empty, so fastai cannot index into it to prepare the first example, and it raises this error.
Example:
data = pd.DataFrame({
    'fname': [f'{x}.png' for x in range(10)],
    'label': np.arange(10) % 2,
    'is_valid': True
})
blk = DataBlock((ImageBlock, CategoryBlock),
                splitter=ColSplitter(),
                get_x=ColReader('fname'),
                get_y=ColReader('label'),
                item_tfms=Resize(224, method=ResizeMethod.Squish))
blk.summary(data)
Results in the error.
Solution
The solution is to check that your data can be split correctly into train and valid sets. In the above example, it suffices to have one row that is not in the validation set:
data.loc[0, 'is_valid'] = False
How to figure it out?
Work in a Jupyter notebook. After the error, type %debug in a cell to enter post-mortem debugging. Go to the frame of the setup function (fastai/data/core.py(273) setup()) by going up 5 frames.
That takes you to the line that throws the error.
You can then print(self.splits) and observe that the first one is empty.
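A rough sketch of that pdb session (the prompts and output are illustrative, and the number of up-steps depends on your fastai version):
%debug
# ipdb> u
# ... repeat `u` until the prompt shows fastai/data/core.py(273)setup()
# ipdb> print(self.splits)
# ([], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])   <- the first (training) split is empty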

Problems with seaborn (ndim)

I am using seaborn to plot a very simple data set. Here is what I do:
import seaborn as sns
import pandas as pd
df = pd.read_excel('myfile.xlsx')
sns.set(style="white")
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot)
g.map_upper(sns.scatterplot)
g.map_diag(sns.kdeplot, lw=3)
I get the following error: AttributeError: 'NoneType' object has no attribute 'ndim'. Weirdly, the plot is partially drawn before the error occurs.
Any idea why that is the case and what I can do to solve the issue?
EDIT:
The dataframe has the following attributes:
plan_change int64
user_login float64
new_act_ratio float64
on_time int64
Unfortunately, I cannot upload the data set. However, I can say that plotting other seaborn graphs works just fine.
The full error message is the following:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-16-2dbc61abd2bd> in <module>()
3 g = sns.PairGrid(df, diag_sharey=False)
4 g.map_lower(sns.kdeplot)
----> 5 g.map_upper(sns.scatterplot)
6 g.map_diag(sns.kdeplot, lw=3)
7
/anaconda/lib/python3.5/site-packages/seaborn/axisgrid.py in map_upper(self, func, **kwargs)
1488 color = self.palette[k] if kw_color is None else kw_color
1489 func(data_k[x_var], data_k[y_var], label=label_k,
-> 1490 color=color, **kwargs)
1491
1492 self._clean_axis(ax)
/anaconda/lib/python3.5/site-packages/seaborn/relational.py in scatterplot(x, y, hue, style, size, data, palette, hue_order, hue_norm, sizes, size_order, size_norm, markers, style_order, x_bins, y_bins, units, estimator, ci, n_boot, alpha, x_jitter, y_jitter, legend, ax, **kwargs)
1333 x_bins=x_bins, y_bins=y_bins,
1334 estimator=estimator, ci=ci, n_boot=n_boot,
-> 1335 alpha=alpha, x_jitter=x_jitter, y_jitter=y_jitter, legend=legend,
1336 )
1337
/anaconda/lib/python3.5/site-packages/seaborn/relational.py in __init__(self, x, y, hue, size, style, data, palette, hue_order, hue_norm, sizes, size_order, size_norm, dashes, markers, style_order, x_bins, y_bins, units, estimator, ci, n_boot, alpha, x_jitter, y_jitter, legend)
850
851 plot_data = self.establish_variables(
--> 852 x, y, hue, size, style, units, data
853 )
854
/anaconda/lib/python3.5/site-packages/seaborn/relational.py in establish_variables(self, x, y, hue, size, style, units, data)
155 units=units
156 )
--> 157 plot_data = pd.DataFrame(plot_data)
158
159 # Option 3:
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
264 dtype=dtype, copy=copy)
265 elif isinstance(data, dict):
--> 266 mgr = self._init_dict(data, index, columns, dtype=dtype)
267 elif isinstance(data, ma.MaskedArray):
268 import numpy.ma.mrecords as mrecords
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
400 arrays = [data[k] for k in keys]
401
--> 402 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
403
404 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5382
5383 # don't force copy because getting jammed in an ndarray anyway
-> 5384 arrays = _homogenize(arrays, index, dtype)
5385
5386 # from BlockManager perspective
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _homogenize(data, index, dtype)
5693 v = lib.fast_multiget(v, oindex.values, default=NA)
5694 v = _sanitize_array(v, index, dtype=dtype, copy=False,
-> 5695 raise_cast_failure=False)
5696
5697 homogenized.append(v)
/anaconda/lib/python3.5/site-packages/pandas/core/series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
2917
2918 # scalar like
-> 2919 if subarr.ndim == 0:
2920 if isinstance(data, list): # pragma: no cover
2921 subarr = np.array(data, dtype=object)
AttributeError: 'NoneType' object has no attribute 'ndim'
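One way to narrow this down (a sketch, not a confirmed fix): PairGrid.map_upper accepts any plotting function with an (x, y, **kwargs) signature, so swapping in plain matplotlib's plt.scatter tells you whether the failure lies in sns.scatterplot (e.g. a seaborn/pandas version mismatch) or in the data itself:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.read_excel('myfile.xlsx')
sns.set(style="white")

g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot)
g.map_upper(plt.scatter)   # plain matplotlib instead of sns.scatterplot
g.map_diag(sns.kdeplot, lw=3)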

fit_transform error using CountVectorizer

So I have a dataframe X which looks something like this:
X.head()
0 My wife took me here on my birthday for breakf...
1 I have no idea why some people give bad review...
3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4 General Manager Scott Petello is a good egg!!!...
6 Drop what you're doing and drive here. After I...
Name: text, dtype: object
And then,
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X)
But I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-61-8ff79b91e317> in <module>()
----> 1 X = cv.fit_transform(X)
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
790 for doc in raw_documents:
791 feature_counter = {}
--> 792 for feature in analyze(doc):
793 try:
794 feature_idx = vocabulary[feature]
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc)
264
265 return lambda doc: self._word_ngrams(
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
267
268 else:
~/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(x)
230
231 if self.lowercase:
--> 232 return lambda x: strip_accents(x.lower())
233 else:
234 return strip_accents
~/anaconda3/lib/python3.6/site-packages/scipy/sparse/base.py in __getattr__(self, attr)
574 return self.getnnz()
575 else:
--> 576 raise AttributeError(attr + " not found")
577
578 def transpose(self, axes=None, copy=False):
AttributeError: lower not found
No idea why.
You need to specify the column name of the text data, even if the dataframe has only a single column.
X_countMatrix = cv.fit_transform(X['text'])
This is because CountVectorizer expects an iterable of documents as input, and when you supply a dataframe, the only thing that gets iterated over is the column names. So even if you did not get an error, the result would be incorrect. It is lucky that you got an error and had a chance to correct it.
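A quick way to see this with a toy dataframe (a minimal sketch):
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.DataFrame({"text": ["first document", "second document"]})

# Iterating over a DataFrame yields its column names, not its rows:
print(list(df))          # ['text']
print(list(df["text"]))  # ['first document', 'second document']

cv = CountVectorizer()
X_counts = cv.fit_transform(df["text"])  # pass the column, not the frame
print(X_counts.shape)                    # (2, 3): 2 documents, 3 unique words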

Reconstruct Image from overlapping patches of image

I have used tf.extract_image_patches() to get a tensor of overlapping patches
from the image, as described in this link. The answer in that link suggests using tf.space_to_depth() to reconstruct the image from the overlapping patches. But this does not give the desired results in my case, and upon researching I learned that tf.space_to_depth() does not deal with overlapping blocks. My code looks like:
import tensorflow as tf
import numpy as np
c = 3
height = 3900
width = 6000
ksizes = [1, 150, 150, 1]
strides = [1, 75, 75, 1]
image = #image of shape [1, height, width, 3]
patches = tf.extract_image_patches(image, ksizes=ksizes, strides=strides, rates=[1, 1, 1, 1], padding='VALID')
patches = tf.reshape(patches, [-1, 150, 150, 3])
reconstructed = tf.reshape(patches, [1, height, width, 3])
rec_new = tf.space_to_depth(reconstructed,75)
rec_new = tf.reshape(rec_new,[height,width,3])
This gives me the error:
InvalidArgumentError                      Traceback (most recent call last)
D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\common_shapes.py in _call_cpp_shape_fn_impl(op, input_tensors_needed, input_tensors_as_shapes_needed, require_shape_fn)
    653         graph_def_version, node_def_str, input_shapes, input_tensors,
--> 654         input_tensors_as_shapes, status)
    655       except errors.InvalidArgumentError as err:

D:\AnacondaIDE\lib\contextlib.py in __exit__(self, type, value, traceback)
     87             try:
---> 88                 next(self.gen)
     89             except StopIteration:

D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\errors_impl.py in raise_exception_on_not_ok_status()
    465           compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466           pywrap_tensorflow.TF_GetCode(status))
    467     finally:

InvalidArgumentError: Dimension size must be evenly divisible by 70200000 but is 271957500 for 'Reshape_22' (op: 'Reshape') with input shapes: [4029,150,150,3], [4] and with input tensors computed as partial shapes: input1 = [?,3900,6000,3].

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input> in <module>()
----> 1 reconstructed = tf.reshape(features, [-1, height, width, channel])
      2 rec_new = tf.space_to_depth(reconstructed, 75)
      3 rec_new = tf.reshape(rec_new, [h, h, c])

D:\AnacondaIDE\lib\site-packages\tensorflow\python\ops\gen_array_ops.py in reshape(tensor, shape, name)
   2617   """
   2618   result = _op_def_lib.apply_op("Reshape", tensor=tensor, shape=shape,
-> 2619                                 name=name)
   2620   return result
   2621

D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\op_def_library.py in apply_op(self, op_type_name, name, **keywords)
    765         op = g.create_op(op_type_name, inputs, output_types, name=scope,
    766                          input_types=input_types, attrs=attr_protos,
--> 767                          op_def=op_def)
    768       if output_structure:
    769         outputs = op.outputs

D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\ops.py in create_op(self, op_type, inputs, dtypes, input_types, name, attrs, op_def, compute_shapes, compute_device)
   2630                     original_op=self._default_original_op, op_def=op_def)
   2631     if compute_shapes:
-> 2632       set_shapes_for_outputs(ret)
   2633     self._add_op(ret)
   2634     self._record_op_seen_by_control_dependencies(ret)

D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\ops.py in set_shapes_for_outputs(op)
   1909     shape_func = _call_cpp_shape_fn_and_require_op
   1910
-> 1911   shapes = shape_func(op)
   1912   if shapes is None:
   1913     raise RuntimeError(

D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\ops.py in call_with_requiring(op)
   1859
   1860 def call_with_requiring(op):
-> 1861   return call_cpp_shape_fn(op, require_shape_fn=True)
   1862
   1863 _call_cpp_shape_fn_and_require_op = call_with_requiring

D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\common_shapes.py in call_cpp_shape_fn(op, require_shape_fn)
    593     res = _call_cpp_shape_fn_impl(op, input_tensors_needed,
    594                                   input_tensors_as_shapes_needed,
--> 595                                   require_shape_fn)
    596   if not isinstance(res, dict):
    597     # Handles the case where _call_cpp_shape_fn_impl calls unknown_shape(op).

D:\AnacondaIDE\lib\site-packages\tensorflow\python\framework\common_shapes.py in _call_cpp_shape_fn_impl(op, input_tensors_needed, input_tensors_as_shapes_needed, require_shape_fn)
    657       missing_shape_fn = True
    658     else:
--> 659       raise ValueError(err.message)
    660
    661   if missing_shape_fn:

ValueError: Dimension size must be evenly divisible by 70200000 but is 271957500 for 'Reshape_22' (op: 'Reshape') with input shapes: [4029,150,150,3], [4] and with input tensors computed as partial shapes: input1 = [?,3900,6000,3].
I know this error is due to incompatible dimensions, but shouldn't it work this way? Please help me solve this.
I guess the problem is that in the link you posted, the author uses the same value for strides and ksizes, while you are using strides equal to one half of ksizes. This is why the dimensions do not match: you need to write the logic that shrinks the patches before gluing them back together (for instance, by selecting the central square of each patch).
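A NumPy sketch of that idea (illustrative only: it assumes stride = ksize/2 as in the question, takes patches_np to be the patches tensor evaluated and reshaped to [-1, 150, 150, 3], crops the central stride x stride square of each patch, and leaves the border strips unfilled for brevity):
import numpy as np

height, width, c = 3900, 6000, 3
ksize, stride = 150, 75
off = (ksize - stride) // 2               # 37: start of the central crop

n_rows = (height - ksize) // stride + 1   # 51
n_cols = (width - ksize) // stride + 1    # 79 -> 51 * 79 = 4029 patches

reconstructed = np.zeros((height, width, c), dtype=patches_np.dtype)
for i in range(n_rows):
    for j in range(n_cols):
        patch = patches_np[i * n_cols + j]
        center = patch[off:off + stride, off:off + stride]
        # central crops of neighbouring patches tile the image exactly
        reconstructed[i * stride + off : i * stride + off + stride,
                      j * stride + off : j * stride + off + stride] = center
# The outermost `off` rows/columns are only covered by patch edges and
# would need separate handling.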

Tree classifier to graphviz ERROR

I made a tree classifier named model and tried to use the export_graphviz function like this:
export_graphviz(decision_tree=model,
                out_file='NT_model.dot',
                feature_names=X_train.columns,
                class_names=model.classes_,
                leaves_parallel=True,
                filled=True,
                rotate=False,
                rounded=True)
For some reason this raised the following exception:
TypeError                                 Traceback (most recent call last)
<ipython-input-298-40fe56bb0c85> in <module>()
      6                 filled=True,
      7                 rotate=False,
----> 8                 rounded=True)

C:\Users\yonatanv\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\tree\export.py in export_graphviz(decision_tree, out_file, max_depth, feature_names, class_names, label, filled, leaves_parallel, impurity, node_ids, proportion, rotate, rounded, special_characters)
    431         recurse(decision_tree, 0, criterion="impurity")
    432     else:
--> 433         recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)
    434
    435     # If required, draw leaf nodes at same depth as each other

C:\Users\yonatanv\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\tree\export.py in recurse(tree, node_id, criterion, parent, depth)
    319         out_file.write('%d [label=%s'
    320                        % (node_id,
--> 321                           node_to_str(tree, node_id, criterion)))
    322
    323         if filled:

C:\Users\yonatanv\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\tree\export.py in node_to_str(tree, node_id, criterion)
    289                           np.argmax(value),
    290                           characters[2])
--> 291         node_string += class_name
    292
    293     # Clean up any trailing newlines

TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('<U90') dtype('<U90') dtype('<U90')
These are the model's hyperparameters:
print(model)
DecisionTreeClassifier(class_weight={1.0: 10, 0.0: 1}, criterion='gini',
max_depth=7, max_features=None, max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=50,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=0, splitter='best')
print(model.classes_)
[ 0. , 1. ]
Help would be most appreciated!
As specified here in the documentation of export_graphviz, the class_names parameter works with strings, not floats or ints:
class_names : list of strings, bool or None, optional (default=None)
Try converting model.classes_ to a list of strings before passing it to export_graphviz.
Try class_names=['0', '1'] or class_names=['0.0', '1.0'] in the call to export_graphviz().
For a more general solution, use:
class_names=[str(x) for x in model.classes_]
But is there a specific reason you are passing float values as y to model.fit()? That is usually not required in a classification task. Are your actual y labels floats, or are you converting string labels to numeric before fitting the model?
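Putting that together (a minimal sketch; X_train and y_train are assumed to be the training data from the question, with y_train holding the float labels 0.0/1.0):
from sklearn.tree import DecisionTreeClassifier, export_graphviz

model = DecisionTreeClassifier(class_weight={1.0: 10, 0.0: 1}, random_state=0)
model.fit(X_train, y_train)

export_graphviz(decision_tree=model,
                out_file='NT_model.dot',
                feature_names=X_train.columns,
                class_names=[str(x) for x in model.classes_],  # strings, as required
                filled=True,
                rounded=True)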
