Use idxmax for indexing in pandas - python-3.x

Here is what I am trying to do:
In [7]: from pandas import DataFrame, Series
In [8]: import pandas as pd
In [9]: import numpy as np
In [10]: df = DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
Out[10]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
In [11]: df.idxmax()
Out[11]:
one b
two d
dtype: object
In [12]: df[df.idxmax()] = -9.99
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-12-018b077daf48> in <module>()
----> 1 df[df.idxmax()] = -9.99
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2103
2104 if isinstance(key, (Series, np.ndarray, list, Index)):
-> 2105 self._setitem_array(key, value)
2106 elif isinstance(key, DataFrame):
2107 self._setitem_frame(key, value)
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py in _setitem_array(self, key, value)
2131 self[k1] = value[k2]
2132 else:
-> 2133 indexer = self.ix._convert_to_indexer(key, axis=1)
2134 self._check_setitem_copy()
2135 self.ix._setitem_with_indexer((slice(None), indexer), value)
/usr/local/lib/python3.4/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
1141 if isinstance(obj, tuple) and is_setter:
1142 return {'key': obj}
-> 1143 raise KeyError('%s not in index' % objarr[mask])
1144
1145 return _values_from_object(indexer)
KeyError: "['b' 'd'] not in index"
Intuitively this should work, but it doesn't. Any workarounds?

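Indexing a DataFrame with a Series key (df[key]) selects columns, so the row labels returned by idxmax() ('b' and 'd') are looked up as column names and raise the KeyError. Instead, iterate over the series and use each column name together with its row label to set the values: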
In [30]:
# Each item of df.idxmax() is a (column name, row label) pair locating the
# maximum of that column. Series.items() is used because Series.iteritems()
# was removed in pandas 2.0.
for col_name, row_label in df.idxmax().items():
    print((col_name, row_label))  # show the pair being used
    # .loc takes (row label, column label), so the pair is swapped here.
    df.loc[row_label, col_name] = -9.9
df
('one', 'b')
('two', 'd')
Out[30]:
one two
a 1.40 NaN
b -9.90 -4.5
c NaN NaN
d 0.75 -9.9
I've printed the items to show what the contents are

Related

Applying function to a dataframe with a vector return axis related error?

I have the following function, dataframe and vector. Why am I getting an error?
import pandas as pd
import numpy as np
def vanilla_vec_similarity(x, y):
x.drop('request_id', axis=1, inplace=True).values.flatten().tolist()
y.drop('request_id', axis=1, inplace=True).values.flatten().tolist()
res = (np.array(x) == np.array(y)).astype(int)
return res.mean()
test_df = pd.DataFrame({'request_id': [55, 42, 13], 'a': ['x','y','z'], 'b':[1,2,3], 'c': [1.0, -1.8, 19.113]})
test_vec = pd.DataFrame([[123,'x',1.1, -1.8]], columns=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: vanilla_vec_similarity(x, test_vec), axis=1)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_axis_number(cls, axis)
367 try:
--> 368 return cls._AXIS_TO_AXIS_NUMBER[axis]
369 except KeyError:
KeyError: 1
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
10 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_axis_number(cls, axis)
368 return cls._AXIS_TO_AXIS_NUMBER[axis]
369 except KeyError:
--> 370 raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
371
372 @classmethod
ValueError: No axis named 1 for object type Series
You can make this code work with the following changes:
def vanilla_vec_similarity(x, y):
    """Return the fraction of feature values that match between two frames.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Frames of the same shape that contain a ``request_id`` column plus
        the feature columns in the same order (the caller passes one-row
        frames).

    Returns
    -------
    float
        Mean of the element-wise equality of the feature values, i.e. the
        share of features that are equal. ``request_id`` is excluded from the
        comparison and neither input frame is modified.
    """
    # drop() returns a new frame; the result has to be kept, otherwise
    # request_id still takes part in the comparison below.
    x_features = x.drop('request_id', axis=1).values.flatten()
    y_features = y.drop('request_id', axis=1).values.flatten()
    matches = (x_features == y_features).astype(int)
    return matches.mean()
test_df = pd.DataFrame({'request_id': [55, 42, 13], 'a': ['x','y','z'], 'b':[1,2,3], 'c': [1.0, -1.8, 19.113]})
test_vec = pd.DataFrame([[123,'x',1.1, -1.8]], columns=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: vanilla_vec_similarity(x.to_frame().T, test_vec), axis=1)
Explanation:
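First, when you call test_df.apply(lambda x: vanilla_vec_similarity(x, test_vec), axis=1), each
row is passed to the function as a Series (with the column names as the index of the
Series).
The code breaks because drop('request_id', axis=1) asks for the column axis, which a Series does not have; in a row Series, request_id is an index label, not a column. Converting the row back to a one-row DataFrame with x.to_frame().T restores the column axis.
Also, you don't need inplace=True: with it, drop returns None, so the chained .values call would fail.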
Or you can just use:
test_df['similarity'] = test_df.apply(lambda x: x[1:].eq(pd.Series(test_vec.loc[0])[1:]).mean(), axis=1)
Or, if you define test_vec as a Series instead of a DataFrame:
test_vec = pd.Series([123,'x',1.1, -1.8], index=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: x[1:].eq(test_vec[1:]).mean(), axis=1)

How to vectorize a pandas DataFrame operation with a list in the condition?

I would like to vectorize an operation on my dataframe using NumPy arrays, but I get an error.
Here is the code :
Here I initialize my dataframe
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': [True,False,True,False],
'E': pd.Categorical(["test", "Draft", "test", "Draft"]),
'F': 'foo'})
df2
output:
A B C D E F
0 1.0 2013-01-02 1.0 True test foo
1 1.0 2013-01-02 1.0 False Draft foo
2 1.0 2013-01-02 1.0 True test foo
3 1.0 2013-01-02 1.0 False Draft foo
Here I define the function to apply to dataframe's columns
def IsBillingValid2(xE,yBilling):
if(xE not in ['Draft','Cancelled'] and yBilling==True): #Order Edited
return True
else:
return False
Here I launch my function
df2['BillingPostalCode_Det_StageName_Det']=IsBillingValid(df2['E'].values,df2['D'].values)
Here is the Error:
output:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<command-2946139111570059> in <module>
16 return False
17
---> 18 df2['BillingPostalCode_Det_StageName_Det']=IsBillingValid(df2['E'].values,df2['D'].values)
19
<command-2041881674588848> in IsBillingValid(xStageName, yBilling)
207 def IsBillingValid(xStageName,yBilling):
208
--> 209 if(xStageName not in ['Draft','Cancelled'] and yBilling==True): #Order Edited
210 return True
211
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Thanks for your help
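You don't need apply, especially when you want a vectorized operation. The if statement in your function fails because it is evaluated on whole arrays rather than on single values, and the truth value of an array is ambiguous.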
Use pandas.Series.isin:
df2['BillingPostalCode_Det_StageName_Det'] = ~df2["E"].isin({"Draft", "Cancelled"}) & df2["D"]
print(df2)
Output:
A B C D E F BillingPostalCode_Det_StageName_Det
0 1.0 2013-01-02 1.0 True test foo True
1 1.0 2013-01-02 1.0 False Draft foo False
2 1.0 2013-01-02 1.0 True test foo True
3 1.0 2013-01-02 1.0 False Draft foo False

hvplot histogram: DataError: None of the available storage backends were able to support the supplied data format

import pandas as pd
import numpy as np
import random
import copy
import feather
import plotly.graph_objects as go
import plotly.express as px
import panel as pn
import holoviews as hv
import geoviews as gv
import geoviews.feature as gf
import cartopy
import cartopy.feature as cf
from geoviews import opts
from cartopy import crs as ccrs
import hvplot.pandas # noqa
import colorcet as cc
from colorcet.plotting import swatch
hv.extension("bokeh","plotly")
I have a dataframe called test:
Out[5]:
age age_band car_ins_renew_month people_type
0 NaN NaN NaN sign_up_only
1 61.0 55-64 7.0 active_interest
2 NaN NaN NaN sign_up_only
3 55.0 55-64 8.0 previous_customer
4 NaN NaN NaN sign_up_only
... ... ... ... ...
107627 42.0 35-44 6.0 previous_customer
107628 73.0 65+ 7.0 previous_customer
107629 NaN NaN NaN sign_up_only
107630 NaN NaN NaN sign_up_only
107631 NaN NaN NaN sign_up_only
[107632 rows x 4 columns]
In [6]: test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107632 entries, 0 to 107631
Data columns (total 4 columns):
age 73289 non-null float32
age_band 73289 non-null category
car_ins_renew_month 64290 non-null float32
people_type 107632 non-null category
dtypes: category(2), float32(2)
memory usage: 1.0 MB
For the entire test dataframe, I can successfully produce histograms using hvplot:
age (with hover data for age_band):
In [7]: test.hvplot.hist(
...: y="age",by=["age_band"],
...: bins=[18,25,35,45,55,65,74],
...: xticks=[(21.5,"18-24"),(30,"25-34"),(40,"35-44"),(50,"45-54"),(60,"55-64"),(69.5,"65-74")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
car_ins_renew_month:
test.hvplot.hist(
...: y="car_ins_renew_month",
...: bins=[1,2,3,4,5,6,7,8,9,10,11,12,13],
...: xticks=[(1.5,"JAN"),(2.5,"FEB"),(3.5,"MAR"),(4.5,"APR"),(5.5,"MAY"),(6.5,"JUN"),(7.5,"JUL"),(8.5,"AUG"),(9.5,"SEP"),(10.5,"OCT"),(11.5,"NOV"),(12.5,"DEC")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
However, for the subset of test where people_type is equal to previous_customer:
In [11]: test_prev_cust = test.loc[test["people_type"]=="previous_customer"]
while I can successfully produce a histogram for the car_ins_renew_month attribute:
In [13]: test_prev_cust.hvplot.hist(
...: y="car_ins_renew_month",
...: bins=[1,2,3,4,5,6,7,8,9,10,11,12,13],
...: xticks=[(1.5,"JAN"),(2.5,"FEB"),(3.5,"MAR"),(4.5,"APR"),(5.5,"MAY"),(6.5,"JUN"),(7.5,"JUL"),(8.5,"AUG"),(9.5,"SEP"),(10.5,"OCT"),(11.5,"NOV"),(12.5,"DEC")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
when I try to produce a histogram for the age attribute then I get the following error:
In [14]: test_prev_cust = hvplot.hist(
...: y="age",by=["age_band"],
...: bins=[18,25,35,45,55,65,74],
...: xticks=[(21.5,"18-24"),(30,"25-34"),(40,"35-44"),(50,"45-54"),(60,"55-64"),(69.5,"65-74")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-100-b2108cee586d> in <module>
7 color="teal",legend=False,
8 line_width=4,line_color="w",
----> 9 width=650,height=280
10 )
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/plotting/core.py in hist(self, y, by, **kwds)
399 The HoloViews representation of the plot.
400 """
--> 401 return self(kind='hist', x=None, y=y, by=by, **kwds)
402
403 def kde(self, y=None, by=None, **kwds):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/plotting/core.py in __call__(self, x, y, kind, **kwds)
70 return pn.panel(plot, **panel_dict)
71
---> 72 return self._get_converter(x, y, kind, **kwds)(kind, x, y)
73
74 def _get_converter(self, x=None, y=None, kind=None, **kwds):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/converter.py in __call__(self, kind, x, y)
942 obj = DynamicMap(cbcallable, streams=[self.stream])
943 else:
--> 944 obj = method(x, y)
945
946 if self.crs and self.project:
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/converter.py in hist(self, x, y, data)
1383 if self.by:
1384 hist = hists = histogram(
-> 1385 ds.groupby(self.by), dimension=y, **hist_opts
1386 )
1387 hist = hists.last
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/param/parameterized.py in __new__(class_, *args, **params)
2810 inst = class_.instance()
2811 inst.param._set_name(class_.__name__)
-> 2812 return inst.__call__(*args,**params)
2813
2814 def __call__(self,*args,**kw):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in __call__(self, element, **kwargs)
162 elif 'streams' not in kwargs:
163 kwargs['streams'] = self.p.streams
--> 164 return element.apply(self, **kwargs)
165
166
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/accessors.py in __call__(self, function, streams, link_inputs, dynamic, **kwargs)
113 for k, v in self._obj.data.items():
114 new_val = v.apply(function, dynamic=dynamic, streams=streams,
--> 115 link_inputs=link_inputs, **kwargs)
116 if new_val is not None:
117 mapped.append((k, new_val))
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/accessors.py in __call__(self, function, streams, link_inputs, dynamic, **kwargs)
108 if hasattr(function, 'dynamic'):
109 inner_kwargs['dynamic'] = False
--> 110 return function(self._obj, **inner_kwargs)
111 elif self._obj._deep_indexable:
112 mapped = []
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in __call__(self, element, **kwargs)
159 for k, el in element.items()])
160 elif isinstance(element, ViewableElement):
--> 161 return self._apply(element)
162 elif 'streams' not in kwargs:
163 kwargs['streams'] = self.p.streams
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in _apply(self, element, key)
119 for hook in self._preprocess_hooks:
120 kwargs.update(hook(self, element))
--> 121 ret = self._process(element, key)
122 for hook in self._postprocess_hooks:
123 ret = hook(self, ret, **kwargs)
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/operation/element.py in _process(self, element, key)
657 hist *= edges[1]-edges[0]
658 return Histogram((edges, hist), kdims=[element.get_dimension(selected_dim)],
--> 659 label=element.label, **params)
660
661
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/element/chart.py in __init__(self, data, edges, **params)
196 elif isinstance(data, tuple) and len(data) == 2 and len(data[0])+1 == len(data[1]):
197 data = data[::-1]
--> 198 super(Histogram, self).__init__(data, **params)
199
200
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/data/__init__.py in __init__(self, data, kdims, vdims, **kwargs)
209 validate_vdims = kwargs.pop('_validate_vdims', True)
210 initialized = Interface.initialize(type(self), data, kdims, vdims,
--> 211 datatype=kwargs.get('datatype'))
212 (data, self.interface, dims, extra_kws) = initialized
213 super(Dataset, self).__init__(data, **dict(kwargs, **dict(dims, **extra_kws)))
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/data/interface.py in initialize(cls, eltype, data, kdims, vdims, datatype)
252 % (intfc.__name__, e))
253 error = ' '.join([error, priority_error])
--> 254 raise DataError(error)
255
256 return data, interface, dims, extra_kws
DataError: None of the available storage backends were able to support the supplied data format.
I know that hvplot can produce histograms for subsets of my test dataframe for both the car_ins_renew_month and age attributes, because I was able to do this for the subset where people_type is equal to active_interest. It only fails for the subset where people_type is equal to previous_customer.
One thing that I did notice for my test_prev_cust dataframe was that there are no people in 2 of the categories for age_band:
In [18]: test_prev_cust["age_band"].value_counts()
Out[18]:
45-54 13457
55-64 10369
35-44 8760
65+ 7801
25-34 0
18-24 0
Name: age_band, dtype: int64
Could this be the cause of my issue? If so then is there a way to work around this and still include age_band as a hover data on my plot?
Thanks
Software versions:
bokeh 1.4.0 py37_0
cartopy 0.17.0 py37haea56ea_1
colorcet 2.0.2 py_0 pyviz
feather-format 0.4.0 py_1003 conda-forge
geoviews 1.6.5 py_0 pyviz
holoviews 1.12.6 py_0 pyviz
hvplot 0.5.2 py_0 pyviz
jupyter 1.0.0 py37_7
matplotlib 3.1.1 py37h54f8f79_0
notebook 6.0.2 py37_0
numpy 1.17.3 py37h4174a10_0
pandas 0.25.3 py37h0a44026_0
panel 0.7.0 py_0 pyviz
plotly 4.3.0 py_0 plotly
plotly_express 0.4.1 py_0 plotly
python 3.7.5 h359304d_0
seaborn 0.9.0 pyh91ea838_1
I'm on os x Catalina, using latest version of Firefox and I am working in a Jupyter notebook.
The problem is caused by your variable age_band being categorical, having 0 counts for some of its categories, and being used with the keyword by=['age_band'].
You could try converting age_band to a string, but in this case creating a barplot is nicer I think:
age_band_group = df.groupby(['age_band']
).agg(count=('age', np.size)
).fillna(0)
age_band_group.hvplot.bar(color='teal')

Unable to plot scatter plot because of TypeError

I have a dataset in which I will be using only a single column to apply k-means clustering. However, while plotting the graph, I am getting "TypeError: unhashable type: 'numpy.ndarray'". I tried converting to float, but I am still facing the same issue.
Dataframe:
Brim
1234.5
345
675.7
120
110
Code:
from sklearn.cluster import KMeans
import numpy as np
km = KMeans(n_clusters=4, init='k-means++',n_init=10)
km.fit(df1)
x = km.fit_predict(df1)
x
array([0, 0, 0, ..., 3, 3, 3])
np.shape(x)
(1097,)
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
plt.scatter(df1[x ==3,0], df1[x == 3,1], s=100, c='cyan')
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-62-5f0966ccc828> in <module>()
1 import matplotlib.pyplot as plt
2 get_ipython().run_line_magic('matplotlib', 'inline')
----> 3 plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
4 plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
5 plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
->2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-
packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site- packages\pandas\core\generic.py in _get_item_cache(self, item)
1838 """Return the cached item, item represents a label indexer."""
1839 cache = self._item_cache
-> 1840 res = cache.get(item)
1841 if res is None:
1842 values = self._data.get(item)
TypeError: unhashable type: 'numpy.ndarray'
If I understood your code correctly, you're trying to slice your DataFrame for plotting, based on the values of x.
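For that, you should be using df1.loc[x==1,0] instead of df1[x==1,0] (and so on for all other slices). Note that .loc selects columns by label; if 0 is meant as a column position rather than a column label, use df1.iloc[x==1,0] instead.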
In my case, I was trying to pick 2 random features and run a KMeans classifier on them.
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1)
kmeans_classifier = KMeans(n_clusters=3) # select random features
y_kmeans = kmeans_classifier.fit_predict(sample)
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 75, c ='red', label = 'Zero')
The last line was throwing the TypeError. I resolved this by converting the sample DataFrame to its NumPy representation with .values.
Modified code:
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1).values

Applying function to pandas dataframe

I have a pandas dataframe called 'tourdata' consisting of 676k rows of data. Two of the columns are latitude and longitude.
Using the reverse_geocode package, I want to convert these coordinates to country data.
When I call :
import reverse_geocode as rg
tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
I get the error :
ValueError Traceback (most recent call last)
in ()
1 coordinates = (tourdata['latitude'],tourdata['longitude']),
----> 2 tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in search(coordinates)
114 """
115 gd = GeocodeData()
--> 116 return gd.query(coordinates)
117
118
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in query(self, coordinates)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
---> 48 raise e
49 else:
50 results = [self.locations[index] for index in indices]
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in query(self, coordinates)
43 """
44 try:
---> 45 distances, indices = self.tree.query(coordinates, k=1)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
ckdtree.pyx in scipy.spatial.ckdtree.cKDTree.query()
ValueError: x must consist of vectors of length 2 but has shape (2, 676701)
To test that the package is working :
coordinates = (tourdata['latitude'][0],tourdata['longitude'][0]),
results = (rg.search(coordinates))
print(results)
Outputs :
[{'country_code': 'AT', 'city': 'Wartmannstetten', 'country': 'Austria'}]
Any help with this appreciated. Ideally I'd like to access the resulting dictionary and apply only the country code to the Country column.
The search method expects a list of coordinates. To look up a single data point, you can use the "get" method.
Try :
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
It works fine for me :
import pandas as pd
tourdata = pd.DataFrame({'latitude':[0.3, 2, 0.6], 'longitude':[12, 5, 0.8]})
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
tourdata['country']
Output :
0 {'country': 'Gabon', 'city': 'Booué', 'country...
1 {'country': 'Sao Tome and Principe', 'city': '...
2 {'country': 'Ghana', 'city': 'Mumford', 'count...
Name: country, dtype: object

Resources