Applying function to a dataframe with a vector return axis related error? - python-3.x

I have the following function, dataframe and vector. Why am I getting an error?
import pandas as pd
import numpy as np
def vanilla_vec_similarity(x, y):
x.drop('request_id', axis=1, inplace=True).values.flatten().tolist()
y.drop('request_id', axis=1, inplace=True).values.flatten().tolist()
res = (np.array(x) == np.array(y)).astype(int)
return res.mean()
test_df = pd.DataFrame({'request_id': [55, 42, 13], 'a': ['x','y','z'], 'b':[1,2,3], 'c': [1.0, -1.8, 19.113]})
test_vec = pd.DataFrame([[123,'x',1.1, -1.8]], columns=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: vanilla_vec_similarity(x, test_vec), axis=1)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_axis_number(cls, axis)
367 try:
--> 368 return cls._AXIS_TO_AXIS_NUMBER[axis]
369 except KeyError:
KeyError: 1
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
10 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_axis_number(cls, axis)
368 return cls._AXIS_TO_AXIS_NUMBER[axis]
369 except KeyError:
--> 370 raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
371
372 @classmethod
ValueError: No axis named 1 for object type Series

You can make this code work with the following changes:
def vanilla_vec_similarity(x, y):
    """Return the fraction of feature values that match between x and y.

    Both arguments must be DataFrames (``drop(..., axis=1)`` is not valid on
    a Series) that contain a 'request_id' column. That column is an
    identifier, not a feature, so it is excluded before comparing.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Frames with a 'request_id' column; the remaining values must have
        shapes that can be compared element-wise (e.g. one row each with the
        same feature columns).

    Returns
    -------
    float
        Mean of the element-wise equality indicators (1 = equal, 0 = not).
    """
    # drop() returns a NEW frame; the result must be kept, otherwise
    # request_id silently takes part in the comparison below.
    x_values = x.drop('request_id', axis=1).values.flatten()
    y_values = y.drop('request_id', axis=1).values.flatten()
    res = (x_values == y_values).astype(int)
    return res.mean()
test_df = pd.DataFrame({'request_id': [55, 42, 13], 'a': ['x','y','z'], 'b':[1,2,3], 'c': [1.0, -1.8, 19.113]})
test_vec = pd.DataFrame([[123,'x',1.1, -1.8]], columns=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: vanilla_vec_similarity(x.to_frame().T, test_vec), axis=1)
Explanation:
Firstly when you do this test_df.apply(lambda x: vanilla_vec_similarity(x, test_vec), axis=1) you are passing each
row as a series (with column names as index of series) to the
function.
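The code breaks because a Series has only one axis (axis 0), so calling drop('request_id', axis=1) on it raises "ValueError: No axis named 1 for object type Series". In the row Series, request_id is an index label, not a column.
Also, you don't need inplace=True: with it, drop returns None, so the chained .values call would fail.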
Or You can just use:
test_df['similarity'] = test_df.apply(lambda x: x[1:].eq(pd.Series(test_vec.loc[0])[1:]).mean(), axis=1)
Or If you define test_vec as Series instead of Dataframe:
test_vec = pd.Series([123,'x',1.1, -1.8], index=['request_id', 'a', 'b', 'c'])
test_df['similarity'] = test_df.apply(lambda x: x[1:].eq(test_vec[1:]).mean(), axis=1)

Related

AttributeError: 'KMeans' object has no attribute 'labels_' with pytorch

First of all, thank you. I tried to train a model with PyTorch, but I got the following error:
AttributeError: 'KMeans' object has no attribute 'labels_'. I am trying to build a model that extracts features from a point cloud using deep learning in PyTorch, and I get the error above. Could anyone help with this? Thanks!
def forward(self, feature_matrix_batch):
# feature_matrix_batch size = (N,I,D) where N=batch number, I=members, D=member dimensionality
N, I, D = feature_matrix_batch.size()
clusters = []
for i, feature_matrix in enumerate(feature_matrix_batch):
kmeans = KMeans(n_clusters=self.k, init=self.kmeansInit, n_init=self.n_init)
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
clusters.extend(labels)
clusters = np.asarray(clusters)
list1 = []
list2 = []
for i in range(self.k*N):
indices = np.argwhere(clusters == i).flatten().tolist()
if len(indices) != 1:
edges = [e for e in netx.complete_graph(indices).edges]
inverse_edges = list(map(lambda x: (x[1], x[0]), edges))
edges.extend(inverse_edges)
unzip = list(zip(*edges))
list1.extend(unzip[0])
list2.extend(unzip[1])
else:
list1.append(indices[0])
list2.append(indices[0])
edge_index = torch.tensor([list1, list2], dtype=torch.long, device=getDevice(feature_matrix_batch))
edge_index = sort_edge_index(add_self_loops(edge_index)[0])[0]
conv_feature_matrix_batch = self.conv(feature_matrix_batch.view(-1, D), edge_index).view(N, I, -1)
# conv_feature_matrix_batch size = (N,I,L) where N=batch number, I=members, L=C+P
return feature_matrix_batch, conv_feature_matrix_batch, torch.tensor(clusters, dtype=torch.long, device=getDevice(feature_matrix_batch))
labels = np.apply_along_axis(lambda x: x + (i*self.k), axis=0, arr=kmeans.labels_)
AttributeError: 'KMeans' object has no attribute 'labels_'
Thanks for your help
The attribute labels_ of a KMeans object is created once you actually compute the clusters by running .fit() (or .fit_predict(), or .fit_transform()).
Simple example:
>>> from sklearn.cluster import KMeans
>>> from numpy.random import random
>>> X = random((10,2))
>>> X
array([[0.2096706 , 0.69704806],
[0.31732618, 0.29607599],
[0.10372159, 0.56911046],
[0.30922255, 0.07952464],
[0.21190404, 0.46823665],
[0.67134948, 0.95702692],
[0.14781526, 0.24619197],
[0.89931979, 0.96301003],
[0.88256126, 0.07569739],
[0.70776912, 0.92997521]])
>>> clustering = KMeans(n_clusters=3)
>>> clustering.labels_
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'KMeans' object has no attribute 'labels_'
>>> clustering.fit(X)
KMeans(n_clusters=3)
>>> clustering.labels_
array([0, 0, 0, 0, 0, 1, 0, 1, 2, 1], dtype=int32)

Unable to plot scatter plot because of TypeError

I have a dataset in which I will be using only a single column to apply k-means clustering. However, while plotting the graph, I am getting a TypeError that mentions 'numpy.ndarray'. I tried converting the data to float, but I am still facing the same issue.
Dataframe:
Brim
1234.5
345
675.7
120
110
Code:
from sklearn.cluster import KMeans
import numpy as np
km = KMeans(n_clusters=4, init='k-means++',n_init=10)
km.fit(df1)
x = km.fit_predict(df1)
x
array([0, 0, 0, ..., 3, 3, 3])
np.shape(x)
(1097,)
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
plt.scatter(df1[x ==3,0], df1[x == 3,1], s=100, c='cyan')
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-62-5f0966ccc828> in <module>()
1 import matplotlib.pyplot as plt
2 get_ipython().run_line_magic('matplotlib', 'inline')
----> 3 plt.scatter(df1[x ==1,0], df1[x == 0,1], s=100, c='red')
4 plt.scatter(df1[x ==1,0], df1[x == 1,1], s=100, c='black')
5 plt.scatter(df1[x ==2,0], df1[x == 2,1], s=100, c='blue')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
->2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-
packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site- packages\pandas\core\generic.py in _get_item_cache(self, item)
1838 """Return the cached item, item represents a label indexer."""
1839 cache = self._item_cache
-> 1840 res = cache.get(item)
1841 if res is None:
1842 values = self._data.get(item)
TypeError: unhashable type: 'numpy.ndarray'
If I understood your code correctly, you're trying to slice your DataFrame for plotting, based on the values of x.
For that, you should be using df1.loc[x==1,0] instead of df1[x==1,0] (and so on for all other slices).
In my case, I was trying to pick random 2 features and run KMeans classifier on it.
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1)
kmeans_classifier = KMeans(n_clusters=3) # select random features
y_kmeans = kmeans_classifier.fit_predict(sample)
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 75, c ='red', label = 'Zero')
Last line was throwing the TypeError. I resolved this by converting the sample DataFrame to Numpy representation with values.
Modified code:
sample = df[['f1','f2','f3','f4','f5','f6','f7']].sample(2, axis=1).values

taking percentile on different ids in the dataframe with numpy.percentile

I need to compute the percentile over the whole dataset in one go, but the data covers several IDs and I want a separate result for each ID. Here is my code, which produces an error:
result_frame.groupby('ID').apply(percentile('rolling_mean', [25]))
I am getting the following error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-60-87a94290cfde> in <module>()
----> 1 result_frame.groupby('VoyageID').apply(percentile('rolling_mean', [25]))
~/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py in percentile(a, q, axis, out, overwrite_input, interpolation, keepdims)
4272 r, k = _ureduce(a, func=_percentile, q=q, axis=axis, out=out,
4273 overwrite_input=overwrite_input,
-> 4274 interpolation=interpolation)
4275 if keepdims:
4276 if q.ndim == 0:
~/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py in _ureduce(a, func, **kwargs)
4014 keepdim = [1] * a.ndim
4015
-> 4016 r = func(a, **kwargs)
4017 return r, keepdim
4018
~/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py in _percentile(a, q, axis, out, overwrite_input, interpolation, keepdims)
4389 n = np.isnan(ap[-1:, ...])
4390
-> 4391 x1 = take(ap, indices_below, axis=axis) * weights_below
4392 x2 = take(ap, indices_above, axis=axis) * weights_above
4393
TypeError: ufunc 'multiply' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')
How about this?
import numpy as np
import pandas as pd
data = pd.DataFrame({'group': ['A', 'B'] * 100, 'value': np.random.randn(200)})
data.groupby('group')['value'].quantile([.25, .75])
You are correct to group, you just need to identify the column you'd like to summarise, and then apply the percentile using quantile.

Applying function to pandas dataframe

I have a pandas dataframe called 'tourdata' consisting of 676k rows of data. Two of the columns are latitude and longitude.
Using the reverse_geocode package I want to convert these coordinates to a country data.
When I call :
import reverse_geocode as rg
tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
I get the error :
ValueErrorTraceback (most recent call last)
in ()
1 coordinates = (tourdata['latitude'],tourdata['longitude']),
----> 2 tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in search(coordinates)
114 """
115 gd = GeocodeData()
--> 116 return gd.query(coordinates)
117
118
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in query(self, coordinates)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
---> 48 raise e
49 else:
50 results = [self.locations[index] for index in indices]
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in query(self, coordinates)
43 """
44 try:
---> 45 distances, indices = self.tree.query(coordinates, k=1)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
ckdtree.pyx in scipy.spatial.ckdtree.cKDTree.query()
ValueError: x must consist of vectors of length 2 but has shape (2,
676701)
To test that the package is working :
coordinates = (tourdata['latitude'][0],tourdata['longitude'][0]),
results = (rg.search(coordinates))
print(results)
Outputs :
[{'country_code': 'AT', 'city': 'Wartmannstetten', 'country': 'Austria'}]
Any help with this appreciated. Ideally I'd like to access the resulting dictionary and apply only the country code to the Country column.
The search method expects a list of coordinates. To obtain a single data point you can use "get" method.
Try :
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
It works fine for me :
import pandas as pd
tourdata = pd.DataFrame({'latitude':[0.3, 2, 0.6], 'longitude':[12, 5, 0.8]})
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
tourdata['country']
Output :
0 {'country': 'Gabon', 'city': 'Booué', 'country...
1 {'country': 'Sao Tome and Principe', 'city': '...
2 {'country': 'Ghana', 'city': 'Mumford', 'count...
Name: country, dtype: object

Use idxmax for indexing in pandas

Here is what I am trying to do:
In [7]: from pandas import DataFrame, Series
In [8]: import pandas as pd
In [9]: import numpy as np
In [10]: df = DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
Out[10]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
In [11]: df.idxmax()
Out[11]:
one b
two d
dtype: object
In [12]: df[df.idxmax()] = -9.99
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-12-018b077daf48> in <module>()
----> 1 df[df.idxmax()] = -9.99
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2103
2104 if isinstance(key, (Series, np.ndarray, list, Index)):
-> 2105 self._setitem_array(key, value)
2106 elif isinstance(key, DataFrame):
2107 self._setitem_frame(key, value)
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py in _setitem_array(self, key, value)
2131 self[k1] = value[k2]
2132 else:
-> 2133 indexer = self.ix._convert_to_indexer(key, axis=1)
2134 self._check_setitem_copy()
2135 self.ix._setitem_with_indexer((slice(None), indexer), value)
/usr/local/lib/python3.4/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
1141 if isinstance(obj, tuple) and is_setter:
1142 return {'key': obj}
-> 1143 raise KeyError('%s not in index' % objarr[mask])
1144
1145 return _values_from_object(indexer)
KeyError: "['b' 'd'] not in index"
Intuitively this should work, but it doesn't. Any workarounds?
You should iterate over the series and access the index and col name to set the values:
In [30]:
for items in df.idxmax().iteritems():
print(items)
df.loc[items[1], items[0]] = -9.9
df
('one', 'b')
('two', 'd')
Out[30]:
one two
a 1.40 NaN
b -9.90 -4.5
c NaN NaN
d 0.75 -9.9
I've printed the items to show what the contents are

Resources