I have the following function, dataframe and vector. Why am I getting an error?
import pandas as pd
import numpy as np
def vanilla_vec_similarity(x, y):
x.drop('request_id', axis=1, inplace=True).values.flatten().tolist()
y.drop('request_id', axis=1, inplace=True).values.flatten().tolist()
res = (np.array(x) == np.array(y)).astype(int)
return res.mean()
test_df = pd.DataFrame({'request_id': [55, 42, 13], 'a': ['x','y','z'], 'b':[1,2,3], 'c': [1.0, -1.8, 19.113]})
test_vec = pd.DataFrame([[123,'x',1.1, -1.8]], columns=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: vanilla_vec_similarity(x, test_vec), axis=1)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_axis_number(cls, axis)
367 try:
--> 368 return cls._AXIS_TO_AXIS_NUMBER[axis]
369 except KeyError:
KeyError: 1
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
10 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_axis_number(cls, axis)
368 return cls._AXIS_TO_AXIS_NUMBER[axis]
369 except KeyError:
--> 370 raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
371
372 @classmethod
ValueError: No axis named 1 for object type Series
You can make this code work with the following changes:
def _feature_values(obj):
    """Return the values of *obj* without ``request_id`` as a flat array."""
    if isinstance(obj, pd.DataFrame):
        # DataFrame: request_id is a column label (axis=1).
        trimmed = obj.drop('request_id', axis=1)
    else:
        # Series (e.g. a row handed over by DataFrame.apply(..., axis=1)):
        # request_id is an index label and a Series has no axis 1.
        trimmed = obj.drop('request_id')
    return trimmed.to_numpy().flatten()


def vanilla_vec_similarity(x, y):
    """Return the fraction of feature values on which *x* and *y* agree.

    The ``request_id`` entry is an identifier, not a feature, so it is
    removed from both inputs before the element-wise comparison.

    Parameters
    ----------
    x, y : pandas.DataFrame (single row) or pandas.Series
        Records to compare. Both must hold the same features in the same
        order and contain a ``request_id`` label. Neither is modified.

    Returns
    -------
    float
        Mean of the element-wise equality indicators (0.0 to 1.0).
    """
    # drop() returns a new object; its result must be kept, otherwise the
    # comparison below would still include request_id.
    x_vals = _feature_values(x)
    y_vals = _feature_values(y)
    matches = (x_vals == y_vals).astype(int)
    return matches.mean()
test_df = pd.DataFrame({'request_id': [55, 42, 13], 'a': ['x','y','z'], 'b':[1,2,3], 'c': [1.0, -1.8, 19.113]})
test_vec = pd.DataFrame([[123,'x',1.1, -1.8]], columns=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: vanilla_vec_similarity(x.to_frame().T, test_vec), axis=1)
Explanation:
Firstly, when you call test_df.apply(lambda x: vanilla_vec_similarity(x, test_vec), axis=1), each row is passed to the
function as a Series (with the column names as the index of the Series).
The code breaks because a Series has only one axis (axis 0), so x.drop('request_id', axis=1) raises "No axis named 1 for object type Series": in the row Series, request_id is an index label, not a column.
Also, you don't need to use inplace=True: with inplace=True, drop returns None, so the chained .values call would fail.
Or you can just use:
test_df['similarity'] = test_df.apply(lambda x: x[1:].eq(pd.Series(test_vec.loc[0])[1:]).mean(), axis=1)
Or, if you define test_vec as a Series instead of a DataFrame:
test_vec = pd.Series([123,'x',1.1, -1.8], index=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: x[1:].eq(test_vec[1:]).mean(), axis=1)
import pandas as pd
import numpy as np
import random
import copy
import feather
import plotly.graph_objects as go
import plotly.express as px
import panel as pn
import holoviews as hv
import geoviews as gv
import geoviews.feature as gf
import cartopy
import cartopy.feature as cf
from geoviews import opts
from cartopy import crs as ccrs
import hvplot.pandas # noqa
import colorcet as cc
from colorcet.plotting import swatch
hv.extension("bokeh","plotly")
I have a dataframe called test:
Out[5]:
age age_band car_ins_renew_month people_type
0 NaN NaN NaN sign_up_only
1 61.0 55-64 7.0 active_interest
2 NaN NaN NaN sign_up_only
3 55.0 55-64 8.0 previous_customer
4 NaN NaN NaN sign_up_only
... ... ... ... ...
107627 42.0 35-44 6.0 previous_customer
107628 73.0 65+ 7.0 previous_customer
107629 NaN NaN NaN sign_up_only
107630 NaN NaN NaN sign_up_only
107631 NaN NaN NaN sign_up_only
[107632 rows x 4 columns]
In [6]: test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107632 entries, 0 to 107631
Data columns (total 4 columns):
age 73289 non-null float32
age_band 73289 non-null category
car_ins_renew_month 64290 non-null float32
people_type 107632 non-null category
dtypes: category(2), float32(2)
memory usage: 1.0 MB
For the entire test dataframe, I can successfully produce histograms using hvplot:
age (with hover data for age_band):
In [7]: test.hvplot.hist(
...: y="age",by=["age_band"],
...: bins=[18,25,35,45,55,65,74],
...: xticks=[(21.5,"18-24"),(30,"25-34"),(40,"35-44"),(50,"45-54"),(60,"55-64"),(69.5,"65-74")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
car_ins_renew_month:
test.hvplot.hist(
...: y="car_ins_renew_month",
...: bins=[1,2,3,4,5,6,7,8,9,10,11,12,13],
...: xticks=[(1.5,"JAN"),(2.5,"FEB"),(3.5,"MAR"),(4.5,"APR"),(5.5,"MAY"),(6.5,"JUN"),(7.5,"JUL"),(8.5,"AUG"),(9.5,"SEP"),(10.5,"OCT"),(11.5,"NOV"),(12.5,"DEC")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
However, for the subset of test where people_type is equal to previous_customer:
In [11]: test_prev_cust = test.loc[test["people_type"]=="previous_customer"]
while I can successfully produce a histogram for the car_ins_renew_month attribute:
In [13]: test_prev_cust.hvplot.hist(
...: y="car_ins_renew_month",
...: bins=[1,2,3,4,5,6,7,8,9,10,11,12,13],
...: xticks=[(1.5,"JAN"),(2.5,"FEB"),(3.5,"MAR"),(4.5,"APR"),(5.5,"MAY"),(6.5,"JUN"),(7.5,"JUL"),(8.5,"AUG"),(9.5,"SEP"),(10.5,"OCT"),(11.5,"NOV"),(12.5,"DEC")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
when I try to produce a histogram for the age attribute then I get the following error:
In [14]: test_prev_cust = hvplot.hist(
...: y="age",by=["age_band"],
...: bins=[18,25,35,45,55,65,74],
...: xticks=[(21.5,"18-24"),(30,"25-34"),(40,"35-44"),(50,"45-54"),(60,"55-64"),(69.5,"65-74")],
...: color="teal",legend=False,
...: line_width=4,line_color="w",
...: width=650,height=280
...: )
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-100-b2108cee586d> in <module>
7 color="teal",legend=False,
8 line_width=4,line_color="w",
----> 9 width=650,height=280
10 )
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/plotting/core.py in hist(self, y, by, **kwds)
399 The HoloViews representation of the plot.
400 """
--> 401 return self(kind='hist', x=None, y=y, by=by, **kwds)
402
403 def kde(self, y=None, by=None, **kwds):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/plotting/core.py in __call__(self, x, y, kind, **kwds)
70 return pn.panel(plot, **panel_dict)
71
---> 72 return self._get_converter(x, y, kind, **kwds)(kind, x, y)
73
74 def _get_converter(self, x=None, y=None, kind=None, **kwds):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/converter.py in __call__(self, kind, x, y)
942 obj = DynamicMap(cbcallable, streams=[self.stream])
943 else:
--> 944 obj = method(x, y)
945
946 if self.crs and self.project:
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/hvplot/converter.py in hist(self, x, y, data)
1383 if self.by:
1384 hist = hists = histogram(
-> 1385 ds.groupby(self.by), dimension=y, **hist_opts
1386 )
1387 hist = hists.last
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/param/parameterized.py in __new__(class_, *args, **params)
2810 inst = class_.instance()
2811 inst.param._set_name(class_.__name__)
-> 2812 return inst.__call__(*args,**params)
2813
2814 def __call__(self,*args,**kw):
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in __call__(self, element, **kwargs)
162 elif 'streams' not in kwargs:
163 kwargs['streams'] = self.p.streams
--> 164 return element.apply(self, **kwargs)
165
166
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/accessors.py in __call__(self, function, streams, link_inputs, dynamic, **kwargs)
113 for k, v in self._obj.data.items():
114 new_val = v.apply(function, dynamic=dynamic, streams=streams,
--> 115 link_inputs=link_inputs, **kwargs)
116 if new_val is not None:
117 mapped.append((k, new_val))
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/accessors.py in __call__(self, function, streams, link_inputs, dynamic, **kwargs)
108 if hasattr(function, 'dynamic'):
109 inner_kwargs['dynamic'] = False
--> 110 return function(self._obj, **inner_kwargs)
111 elif self._obj._deep_indexable:
112 mapped = []
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in __call__(self, element, **kwargs)
159 for k, el in element.items()])
160 elif isinstance(element, ViewableElement):
--> 161 return self._apply(element)
162 elif 'streams' not in kwargs:
163 kwargs['streams'] = self.p.streams
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/operation.py in _apply(self, element, key)
119 for hook in self._preprocess_hooks:
120 kwargs.update(hook(self, element))
--> 121 ret = self._process(element, key)
122 for hook in self._postprocess_hooks:
123 ret = hook(self, ret, **kwargs)
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/operation/element.py in _process(self, element, key)
657 hist *= edges[1]-edges[0]
658 return Histogram((edges, hist), kdims=[element.get_dimension(selected_dim)],
--> 659 label=element.label, **params)
660
661
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/element/chart.py in __init__(self, data, edges, **params)
196 elif isinstance(data, tuple) and len(data) == 2 and len(data[0])+1 == len(data[1]):
197 data = data[::-1]
--> 198 super(Histogram, self).__init__(data, **params)
199
200
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/data/__init__.py in __init__(self, data, kdims, vdims, **kwargs)
209 validate_vdims = kwargs.pop('_validate_vdims', True)
210 initialized = Interface.initialize(type(self), data, kdims, vdims,
--> 211 datatype=kwargs.get('datatype'))
212 (data, self.interface, dims, extra_kws) = initialized
213 super(Dataset, self).__init__(data, **dict(kwargs, **dict(dims, **extra_kws)))
~/opt/anaconda3/envs/test_env/lib/python3.7/site-packages/holoviews/core/data/interface.py in initialize(cls, eltype, data, kdims, vdims, datatype)
252 % (intfc.__name__, e))
253 error = ' '.join([error, priority_error])
--> 254 raise DataError(error)
255
256 return data, interface, dims, extra_kws
DataError: None of the available storage backends were able to support the supplied data format.
I know that I can successfully produce histograms using hvplot for subsets of my test dataframe for both the car_ins_renew_month and age attributes as I was able to do this for people_type is equal to active_interest. I just can't for people_type is equal to previous_customer.
One thing that I did notice for my test_prev_cust dataframe was that there are no people in 2 of the categories for age_band:
In [18]: test_prev_cust["age_band"].value_counts()
Out[18]:
45-54 13457
55-64 10369
35-44 8760
65+ 7801
25-34 0
18-24 0
Name: age_band, dtype: int64
Could this be the cause of my issue? If so then is there a way to work around this and still include age_band as a hover data on my plot?
Thanks
Software versions:
bokeh 1.4.0 py37_0
cartopy 0.17.0 py37haea56ea_1
colorcet 2.0.2 py_0 pyviz
feather-format 0.4.0 py_1003 conda-forge
geoviews 1.6.5 py_0 pyviz
holoviews 1.12.6 py_0 pyviz
hvplot 0.5.2 py_0 pyviz
jupyter 1.0.0 py37_7
matplotlib 3.1.1 py37h54f8f79_0
notebook 6.0.2 py37_0
numpy 1.17.3 py37h4174a10_0
pandas 0.25.3 py37h0a44026_0
panel 0.7.0 py_0 pyviz
plotly 4.3.0 py_0 plotly
plotly_express 0.4.1 py_0 plotly
python 3.7.5 h359304d_0
seaborn 0.9.0 pyh91ea838_1
I'm on os x Catalina, using latest version of Firefox and I am working in a Jupyter notebook.
The problem is caused by your variable age_band being categorical with 0 counts for some of its categories, combined with using it in the keyword by=['age_band'].
You could try converting age_band to a string, but in this case creating a barplot is nicer I think:
age_band_group = df.groupby(['age_band']
).agg(count=('age', np.size)
).fillna(0)
age_band_group.hvplot.bar(color='teal')
I have a dataset in which I will be using only a single column to apply k-means clustering. However, while plotting the graph, I am getting the error "unhashable type: 'numpy.ndarray'". I tried converting to float, but I am still facing the same issue.
Dataframe:
Brim
1234.5
345
675.7
120
110
Code:
from sklearn.cluster import KMeans
import numpy as np
km = KMeans(n_clusters=4, init='k-means++',n_init=10)
km.fit(df1)
x = km.fit_predict(df1)
x
array([0, 0, 0, ..., 3, 3, 3])
np.shape(x)
(1097,)
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
plt.scatter(df1[x ==3,0], df1[x == 3,1], s=100, c='cyan')
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-62-5f0966ccc828> in <module>()
1 import matplotlib.pyplot as plt
2 get_ipython().run_line_magic('matplotlib', 'inline')
----> 3 plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
4 plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
5 plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
->2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1838 """Return the cached item, item represents a label indexer."""
1839 cache = self._item_cache
-> 1840 res = cache.get(item)
1841 if res is None:
1842 values = self._data.get(item)
TypeError: unhashable type: 'numpy.ndarray'
If I understood your code correctly, you're trying to slice your DataFrame for plotting, based on the values of x.
For that, you should be using df1.loc[x==1,0] instead of df1[x==1,0] (and so on for all other slices).
In my case, I was trying to pick random 2 features and run KMeans classifier on it.
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1)
kmeans_classifier = KMeans(n_clusters=3) # select random features
y_kmeans = kmeans_classifier.fit_predict(sample)
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 75, c ='red', label = 'Zero')
Last line was throwing the TypeError. I resolved this by converting the sample DataFrame to Numpy representation with values.
Modified code:
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1).values
I have a pandas dataframe called 'tourdata' consisting of 676k rows of data. Two of the columns are latitude and longitude.
Using the reverse_geocode package I want to convert these coordinates to a country data.
When I call :
import reverse_geocode as rg
tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
I get the error :
ValueError Traceback (most recent call last)
in ()
1 coordinates = (tourdata['latitude'],tourdata['longitude']),
----> 2 tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/init.py
in search(coordinates)
114 """
115 gd = GeocodeData()
--> 116 return gd.query(coordinates)
117
118
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/init.py
in query(self, coordinates)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
---> 48 raise e
49 else:
50 results = [self.locations[index] for index in indices]
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/init.py
in query(self, coordinates)
43 """
44 try:
---> 45 distances, indices = self.tree.query(coordinates, k=1)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
ckdtree.pyx in scipy.spatial.ckdtree.cKDTree.query()
ValueError: x must consist of vectors of length 2 but has shape (2, 676701)
To test that the package is working :
coordinates = (tourdata['latitude'][0],tourdata['longitude'][0]),
results = (rg.search(coordinates))
print(results)
Outputs :
[{'country_code': 'AT', 'city': 'Wartmannstetten', 'country': 'Austria'}]
Any help with this appreciated. Ideally I'd like to access the resulting dictionary and apply only the country code to the Country column.
The search method expects a list of coordinates. To look up a single data point, you can use the "get" method.
Try :
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
It works fine for me :
import pandas as pd
tourdata = pd.DataFrame({'latitude':[0.3, 2, 0.6], 'longitude':[12, 5, 0.8]})
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
tourdata['country']
Output :
0 {'country': 'Gabon', 'city': 'Booué', 'country...
1 {'country': 'Sao Tome and Principe', 'city': '...
2 {'country': 'Ghana', 'city': 'Mumford', 'count...
Name: country, dtype: object