MatPlotLib Key Error: 0 When Annotating - python-3.x

In this bit of code, I am trying to create a bar chart with one bar (as the "total" bar which will be placed next to other bars) with a slice of a data frame. It works for the other slice of the data frame, but for the "total" slice (which is just one row), I keep getting this "Key Error: 0":
x_4t = dfCOPN['Percent'][-1:]
y_4t = dfCOPN['index'][-1:]
ind4t = np.arange(len(y_4t))
...
for i, text in enumerate(ind4t):
if x_4t<72:
ax4t.annotate(str(x_4t)[:-2]+"%", xy=(x_4t+2,ind4t+0.4),fontsize=9, color='black', va='center', ha='left')
elif x_4t>=72:
ax4t.annotate(str(x_4t[i])[:-2]+"%", xy=(x_4t[i]-2,ind4t[i]+0.4),fontsize=9, color='white', va='center', ha='right')
Here's the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-e192a1699cc5> in <module>()
174 # for i, text in enumerate(ind4t):
--> 175 if x_4t<72:
176 ax4t.annotate(str(x_4t)[:-2]+"%", xy=(x_4t+2,ind4t+0.4),fontsize=9, color='black', va='center', ha='left')
C:\Users\m\Anaconda3\lib\site-packages\pandas\core\generic.py in __nonzero__(self)
729 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
--> 730 .format(self.__class__.__name__))
731
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-42-e192a1699cc5> in <module>()
185 except:
186 for i, text in enumerate(ind4t):
--> 187 ax4t.annotate(str(x_4t[i])[:-2]+"%", xy=(x_4t[i]+2,ind4t[i]+0.4),fontsize=9, color='black', va='center', ha='left')
188 for i, text in enumerate(ind5t):
189 ax5t.annotate(str(x_5t[i])[:-2]+"%", xy=(x_5t[i]+2,ind5t[i]+0.4),fontsize=9, color='black', va='center', ha='left')
C:\Users\m\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
549 def __getitem__(self, key):
550 try:
--> 551 result = self.index.get_value(self, key)
552
553 if not np.isscalar(result):
C:\Users\m\Anaconda3\lib\site-packages\pandas\core\index.py in get_value(self, series, key)
1721
1722 try:
-> 1723 return self._engine.get_value(s, k)
1724 except KeyError as e1:
1725 if len(self) > 0 and self.inferred_type in ['integer','boolean']:
pandas\index.pyx in pandas.index.IndexEngine.get_value (pandas\index.c:3204)()
pandas\index.pyx in pandas.index.IndexEngine.get_value (pandas\index.c:2903)()
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3843)()
pandas\hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:6525)()
pandas\hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:6463)()
KeyError: 0
I'll send a complete example if needed, but I just wanted to see if the answer is obvious enough to not do so.
Thanks in advance!

TL;DR: use iloc
x_4t.iloc[0]
The problem is that pd.Series uses label-based indexing, not positional indexing, via []. This is a powerful idea/bit of syntax, as frequently when you are working with a Series the value of the index is the interesting thing, not the position in the Series (e.g. a date index); however, if you expect that Series / DataFrames will behave exactly like a numpy array you will have issues (I speak from experience here).

Related

No attribute 'set_values' or 'At_indexer' error due to set_values and at() in pandas due to Python3

I am new to pandas and still learning.
I am trying to add two numbers in series label-wise. One method is this:
numbers = pd.Series(np.random.randint(0,1000,10000))
for label, value in numbers.iteritems():
numbers.set_values(label, value+2)
numbers.head()
Output:
AttributeError: 'Series' object has no attribute 'set_values'
Now upon research I found out that it was deprecated and at() is used instead.
so when I used it like this:
for label, value in numbers.iteritems():
numbers.at(label, value+2)
numbers.head()
Output:
TypeError: '_AtIndexer' object is not callable
So when I use it like this with at[]:
for label, value in numbers.iteritems():
numbers.at[label, value+2]
numbers.head()
I get this output:
KeyError Traceback (most recent call last)
<ipython-input-43-b1f985a669d7> in <module>
1 for label, value in numbers.iteritems():
----> 2 numbers.at[label, value+2]
3
4 numbers.head()
C:\Users\Public\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
2078 return self.obj.loc[key]
2079
-> 2080 return super().__getitem__(key)
2081
2082 def __setitem__(self, key, value):
C:\Users\Public\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
2025
2026 key = self._convert_key(key)
-> 2027 return self.obj._get_value(*key, takeable=self._takeable)
2028
2029 def __setitem__(self, key, value):
C:\Users\Public\anaconda3\lib\site-packages\pandas\core\series.py in _get_value(self, label, takeable)
987
988 # Similar to Index.get_value, but we do not fall back to positional
--> 989 loc = self.index.get_loc(label)
990 return self.index._get_values_for_loc(self, loc, label)
991
C:\Users\Public\anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
356 except ValueError as err:
357 raise KeyError(key) from err
--> 358 raise KeyError(key)
359 return super().get_loc(key, method=method, tolerance=tolerance)
360
KeyError: (0, 10002)
What am I doing wrong and what can be fixed?
at is an accessor for the index given by its argument. When you pass label, value+2 to it, it sees this argument as a 2-tuple and looks for an index literally named (0, 10002), which fails because your series has integer indices 0, 1, ..., not tuples.
So you should pass only label to at and assign value + 2 to the entry it selects:
numbers = pd.Series(np.random.randint(0,1000,10000))
for label, value in numbers.iteritems():
# lookup the value and set it
numbers.at[label] = value + 2
(noting that this is equivalent to numbers += 2).

How do I pass the values to Catboost?

I'm trying to work with catboost and I've got a problem that I'm really stuck with right now. I have a dataframe with 28 columns, 2 of them are categorical. When the data is numerical there are some even and some fractional numbers, also some 0.00 values that should represent not an empty values but the actual nulls (like 1-1=0).
I'm trying to run this:
train_cl = cb.Pool(data=ret_df.iloc[:580000, :-1], label=ret_df.iloc[:580000, -1], cat_features=cats)
evl_cl = cb.Pool(data=ret_df.iloc[580000:, :-1], label=ret_df.iloc[580000:, -1], cat_features=cats)
But I have this error
---------------------------------------------------------------------------
CatBoostError Traceback (most recent call last)
<ipython-input-112-a515b0ab357b> in <module>
1 train_cl = cb.Pool(data=ret_df.iloc[:580000, :-1], label=ret_df.iloc[:580000, -1], cat_features=cats)
----> 2 evl_cl = cb.Pool(data=ret_df.iloc[580000:, :-1], label=ret_df.iloc[580000:, -1], cat_features=cats)
~\AppData\Local\Programs\Python\Python36\lib\site-packages\catboost\core.py in __init__(self, data, label, cat_features, text_features, embedding_features, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count, log_cout, log_cerr)
615 )
616
--> 617 self._init(data, label, cat_features, text_features, embedding_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count)
618 super(Pool, self).__init__()
619
~\AppData\Local\Programs\Python\Python36\lib\site-packages\catboost\core.py in _init(self, data, label, cat_features, text_features, embedding_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count)
1081 if label is not None:
1082 self._check_label_type(label)
-> 1083 self._check_label_empty(label)
1084 label = self._label_if_pandas_to_numpy(label)
1085 if len(np.shape(label)) == 1:
~\AppData\Local\Programs\Python\Python36\lib\site-packages\catboost\core.py in _check_label_empty(self, label)
723 """
724 if len(label) == 0:
--> 725 raise CatBoostError("Labels variable is empty.")
726
727 def _check_label_shape(self, label, samples_count):
CatBoostError: Labels variable is empty.
I've googled this trouble, but found nothing. My hypothesis is that there is a problem with 0.00 values, but I do not know how to solve this because I literally can't replace these values with anything.
Please, help me!

Apply a function with lambda to Pandas

I have 2 time-series and I would like to find the nearest date from each date in time-series1 to time-series2. I found how to do it separately per date, but I would like to apply it to the entire time-series1. They are in two different dataframes called o and p
This is how my data looks like:
Time-series1:
o['date']
>>>0 2020-01-26
1 2020-01-28
2 2020-01-31
3 2020-02-15
4 2020-02-17
...
86 2021-01-10
87 2021-01-20
88 2021-01-27
89 2021-01-30
90 2021-02-14
Name: date, Length: 91, dtype: datetime64[ns]
Time-series2:
p['date']
>>>1 2020-02-17
3 2020-03-02
4 2020-03-03
5 2020-03-04
6 2020-03-05
...
172 2021-01-30
173 2021-02-06
174 2021-02-07
177 2021-02-12
179 2021-02-14
Name: date, Length: 144, dtype: datetime64[ns]
The function that I use:
def nearest(pivot,items):
return min(items, key=lambda x: abs(x - pivot))
Which works on a separate singular date, for example:
nearest(o['date'][6], p['date'])
>>>Timestamp('2020-03-02 00:00:00')
When I try to apply it to the whole pandas Series I get an error:
o['date'].apply(nearest, args=(p['date']))
>>>---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-17-82c86ffd48ff> in <module>()
----> 1 o['date'].apply(nearest, args=(p['date']))
C:\Users\ran\Anaconda3\envs\main\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4180
4181 # handle ufuncs and lambdas
-> 4182 if kwds or args and not isinstance(func, np.ufunc):
4183
4184 def f(x):
C:\Users\ran\Anaconda3\envs\main\lib\site-packages\pandas\core\generic.py in __nonzero__(self)
1325 def __nonzero__(self):
1326 raise ValueError(
-> 1327 f"The truth value of a {type(self).__name__} is ambiguous. "
1328 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
1329 )
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
I feel that there is something basic I'm missing.
I guess I can do:
[nearest(x, p['date']) for x in o['date']]
But I would like to know how to apply it to a Pandas Series
Use Series.apply with lambda function:
s = o['date'].apply(lambda x: nearest(x, p['date']))
Or with the args parameter — note the trailing comma: args must be a tuple, and (p['date']) without a comma is not a tuple but just p['date'] itself, which is what caused the original error:
s = o['date'].apply(nearest, args=(p['date'], ))
Numpy alternative with numpy.argmin should be faster:
a = o['date'].to_numpy()
b = p['date'].to_numpy()
pos = np.argmin(np.abs(a - b[:, None]), axis=0)
s = pd.Series(b[pos], index=o.index)

ImageDataBunch.from_df positional indexers are out-of-bounds

Scratching my head on this issue. I don't know how to identify the positional indexers. Am I even passing them?
attempting this for my first kaggle comp, can pass in the csv to a dataframe and make the needed edits. trying to create the ImageDataBunch so training a cnn can begin. This error pops up no matter which method is tried. Any advice would be appreciated.
data = ImageDataBunch.from_df(path, df, ds_tfms=tfms, size=24)
data.classes
Backtrace
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-25-5588812820e8> in <module>
----> 1 data = ImageDataBunch.from_df(path, df, ds_tfms=tfms, size=24)
2 data.classes
/opt/conda/lib/python3.7/site-packages/fastai/vision/data.py in from_df(cls, path, df, folder, label_delim, valid_pct, seed, fn_col, label_col, suffix, **kwargs)
117 src = (ImageList.from_df(df, path=path, folder=folder, suffix=suffix, cols=fn_col)
118 .split_by_rand_pct(valid_pct, seed)
--> 119 .label_from_df(label_delim=label_delim, cols=label_col))
120 return cls.create_from_ll(src, **kwargs)
121
/opt/conda/lib/python3.7/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
477 assert isinstance(fv, Callable)
478 def _inner(*args, **kwargs):
--> 479 self.train = ft(*args, from_item_lists=True, **kwargs)
480 assert isinstance(self.train, LabelList)
481 kwargs['label_cls'] = self.train.y.__class__
/opt/conda/lib/python3.7/site-packages/fastai/data_block.py in label_from_df(self, cols, label_cls, **kwargs)
283 def label_from_df(self, cols:IntsOrStrs=1, label_cls:Callable=None, **kwargs):
284 "Label `self.items` from the values in `cols` in `self.inner_df`."
--> 285 labels = self.inner_df.iloc[:,df_names_to_idx(cols, self.inner_df)]
286 assert labels.isna().sum().sum() == 0, f"You have NaN values in column(s) {cols} of your dataframe, please fix it."
287 if is_listy(cols) and len(cols) > 1 and (label_cls is None or label_cls == MultiCategoryList):
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
701 raise IndexingError("Too many indexers")
702 try:
--> 703 self._validate_key(k, i)
704 except ValueError:
705 raise ValueError(
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py in _validate_key(self, key, axis)
2007 # check that the key does not exceed the maximum size of the index
2008 if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):
-> 2009 raise IndexError("positional indexers are out-of-bounds")
2010 else:
2011 raise ValueError(f"Can only index by location with a [{self._valid_types}]")
IndexError: positional indexers are out-of-bounds
I faced this error while creating a DataBunch when my dataframe/CSV did not have a class label explicitly defined.
I created a dummy column which stored 1's for all my rows in the dataframe and it seemed to work. Also please be sure to store your independent variable in the second column and the label(dummy variable in this case) in the first column.
I believe this error happens if there's just one column in the Pandas DataFrame.
Thanks.
Code:
df = pd.DataFrame(lines, columns=["dummy_value", "text"])
df.to_csv("./train.csv")
data_lm = TextLMDataBunch.from_csv(path, "train.csv", min_freq=1)
Note: This is my first attempt at answering a StackOverflow question. Hope it helped!
This error also appears when your dataset is not correctly split between test and validation.
In the case of dataframes, it assumes there is a column is_valid that indicates which rows are in validation set.
If all rows have True, then the training set is empty, so fastai cannot index into it to prepare the first example, thus raising this error.
Example:
data = pd.DataFrame({
'fname': [f'{x}.png' for x in range(10)],
'label': np.arange(10)%2,
'is_valid': True
})
blk = DataBlock((ImageBlock, CategoryBlock),
splitter=ColSplitter(),
get_x=ColReader('fname'),
get_y=ColReader('label'),
item_tfms=Resize(224, method=ResizeMethod.Squish),
)
blk.summary(data)
Results in the error.
Solution
The solution is to check that your data can be split correctly into train and valid sets. In the above example, it suffices to have one row that is not in validation set:
data.loc[0, 'is_valid'] = False
How to figure it out?
Work in a jupyter notebook. After the error, type %debug in a cell, and enter the post mortem debugging. Go to the frame of the setup function ( fastai/data/core.py(273) setup() ) by going up 5 frames.
This takes you to this line that is throwing the error.
You can then print(self.splits) and observe that the first one is empty.

How to plot activities by hour based on dataset in ggplot?

I'm trying to create code that will plot the number of entries into the MTA by hour. I have a csv dataset of entries and the hours they occur. I've reduced the dataset to a Pandas dataframe with just the entries, and I've added a column for "Hour" to show the time that the entries occur.
However, when plotting, I keep getting a "ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()." and I'm unclear what this means, or why it's happening.
I've tried adjusting the "Hour" column to a datetime; I've tried using the index instead of the "Hour" column in the dataframe.
from ggplot import *
import pandas as pd
turnstile_weather = pd.read_csv('/home/pi/Documents/Data analysis/turnstile_data_master_with_weather.csv')
def plot_weather_data(turnstile_weather):
entries_by_hour = pd.DataFrame(turnstile_weather['ENTRIESn_hourly'] \
.groupby(turnstile_weather['Hour']).sum())
entries_by_hour['Hour'] = set(turnstile_weather['Hour'])
plot = ggplot(entries_by_hour, \
aes(entries_by_hour['Hour'], entries_by_hour['ENTRIESn_hourly'])) \
+ geom_line()
print(plot)
plot_weather_data(turnstile_weather)
I expect to get a line chart, with Hours as the X-axis and Entries by Hour as the Y-axis, but I get an error instead:
ValueError Traceback (most recent call last)
<ipython-input-9-3cf39740bb64> in <module>
10 print(plot)
11
---> 12 plot_weather_data(turnstile_weather)
<ipython-input-9-3cf39740bb64> in plot_weather_data(turnstile_weather)
7 entries_by_hour = pd.DataFrame(turnstile_weather['ENTRIESn_hourly'].groupby(turnstile_weather['Hour']).sum())
8 entries_by_hour['Hour'] = set(turnstile_weather['Hour'])
----> 9 plot = ggplot(entries_by_hour, aes(entries_by_hour['Hour'], entries_by_hour['ENTRIESn_hourly'])) + geom_line()
10 print(plot)
11
/usr/local/lib/python3.5/dist-packages/ggplot/ggplot.py in __init__(self, aesthetics, data)
53 self._aes = aesthetics
54 self.data = data.copy()
---> 55 self._handle_index()
56 self.data = self._aes._evaluate_expressions(self.data)
57 self.data = self._aes.handle_identity_values(self.data)
/usr/local/lib/python3.5/dist-packages/ggplot/ggplot.py in _handle_index(self)
132
133 def _handle_index(self):
--> 134 if '__index__' in self._aes.values():
135 self.data['__index__'] = self.data.index
136
/usr/lib/python3.5/_collections_abc.py in __contains__(self, value)
688 def __contains__(self, value):
689 for key in self._mapping:
--> 690 if value == self._mapping[key]:
691 return True
692 return False
~/.local/lib/python3.5/site-packages/pandas/core/generic.py in __nonzero__(self)
1476 raise ValueError("The truth value of a {0} is ambiguous. "
1477 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
-> 1478 .format(self.__class__.__name__))
1479
1480 __bool__ = __nonzero__
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

Resources