How can I replace nan with 1 in a specific condition - python-3.x

In my program, I have some arrays and I am using a simple formula to calculate a value. The code I am using is:
from itertools import combinations
import numpy as np
res = [
np.array([[12.99632095], [29.60571445], [-1.85595153], [68.78926787], [2.75185088], [2.75204384]]),
np.array([[15.66458062], [0], [-3.75927882], [0], [2.30128711], [197.45459974]]),
np.array([[10.66458062], [0], [0], [-2.65954113], [-2.30128711], [197.45459974]]),
]
def cal():
    pairs = combinations(res, 2)
    results = []
    for pair in pairs:
        r = np.concatenate(pair, axis=1)
        r1 = r[:, 0]
        r2 = r[:, 1]
        sign = np.sign(r1 * r2)
        result = np.multiply(sign, np.min(np.abs(r), axis=1) / np.max(np.abs(r), axis=1))
        results.append(result)
    return results
The output I am getting is:
RuntimeWarning: invalid value encountered in true_divide
  result = np.multiply(sign, np.min(np.abs(r), axis=1) / np.max(np.abs(r), axis=1))
[array([0.82966287, 0.        , 0.49369882, 0.        , 0.83626883, 0.0139376 ]),
 array([ 0.82058458,  0.        ,  0.        , -0.03866215, -0.83626883,  0.0139376 ]),
 array([ 0.68080856,         nan,  0.        ,  0.        , -1.        ,  1.        ])]
Here, I am getting nan in the 3rd output array, and I understand that it comes from 0/0.
The size of the arrays and the positions of the 0 values are not fixed, so I want to change the code so that whenever 0/0 occurs, 1 is stored instead of nan.
Could you tell me how I can handle this nan?

One possible solution:
import itertools as it

def cal():
    pairs = it.combinations(res, 2)
    rv = []
    for pair in pairs:
        r = np.concatenate(pair, axis=1)
        sign = np.sign(np.prod(r, axis=1))
        t1 = np.min(np.abs(r), axis=1)
        t2 = np.max(np.abs(r), axis=1)
        ratio = np.full(shape=r.shape[0], fill_value=1.)
        np.divide(t1, t2, out=ratio, where=np.not_equal(t2, 0.))
        wrk = np.full(shape=r.shape[0], fill_value=1.)
        np.multiply(sign, ratio, out=wrk, where=np.not_equal(t2, 0.))
        rv.append(wrk)
    return rv
Instead of np.sign(r1 * r2) I used np.sign(np.prod(r, axis=1)).
The trick to get a default value instead of NaN is to create an array pre-filled with that default
and call np.divide with the out and where arguments, so the division is performed only where the
divisor is not 0.
The last step is np.multiply, called with the same where condition.
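To see the out/where mechanics in isolation, here is a minimal sketch (toy values, not taken from the question): entries excluded by the where condition simply keep whatever was pre-filled in out.
import numpy as np

t1 = np.array([1.0, 0.0, 3.0])
t2 = np.array([2.0, 0.0, 6.0])            # the second divisor is 0 and would give nan
ratio = np.full(t1.shape, 1.0)            # default value kept wherever the division is skipped
np.divide(t1, t2, out=ratio, where=(t2 != 0.0))
print(ratio)                              # [0.5 1.  0.5]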
To test this code and pretty print the result, you can run:
with np.printoptions(formatter={'float': '{: 9.5f}'.format}):
    result = cal()
    for tbl in result:
        print(tbl)
The result is:
[ 0.82966 0.00000 0.49370 0.00000 0.83627 0.01394]
[ 0.82058 0.00000 0.00000 -0.03866 -0.83627 0.01394]
[ 0.68081 1.00000 0.00000 0.00000 -1.00000 1.00000]
As you can see, in the last case there are two 1.0 values, corresponding
to equal values in the second and third source arrays.
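An alternative, if you prefer to keep the original division, is to patch the NaNs afterwards. A minimal sketch, assuming NumPy >= 1.17 (where np.nan_to_num accepts a nan= keyword); note that it replaces every NaN in result, not only those coming from 0/0:
with np.errstate(invalid='ignore'):       # silence the 0/0 RuntimeWarning
    result = np.multiply(sign, np.min(np.abs(r), axis=1) / np.max(np.abs(r), axis=1))
result = np.nan_to_num(result, nan=1.0)   # each NaN produced by 0/0 becomes 1.0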


How do I eliminate elements in the same position in two lists but just filtering one list?

I have two lists i.e.:
a = [0.0 , 30.1, 0.0, 10.1]
b = [1000, 9830, 100, 1023]
I want to remove from list "a" the elements equal to 0.0, and remove from list "b" the elements at the same positions as those 0.0 elements in "a".
I know I can do this by saving the indices of the 0.0 elements in a list and then deleting those positions from list "b". Is there something more efficient? I want to apply the method to very large datasets.
Thanks
Using NumPy is one of the most efficient ways. NumPy boolean indexing makes this very easy if you can use the library: no loops are needed and it is much faster. We convert the lists to arrays (and can convert back with .tolist() if needed):
import numpy as np

a = np.array(a)
b = np.array(b)
a_ = a[a != 0]   # keep only the non-zero elements of a
# [30.1 10.1]
b_ = b[a != 0]   # keep the elements of b at the same positions
# [9830 1023]
This should do the trick:
a = [0.0 , 30.1, 0.0, 10.1]
b = [1000, 9830, 100, 1023]
#####################################################
assert(len(a) == len(b))
b = [b[index] if value != 0 else "" for index, value in enumerate(a)]
b = list(filter(lambda x: x != "", b))
#####################################################
print(b) # prints [9830, 1023]
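Another option, a sketch with illustrative names, is a single pass over zip so no placeholder strings are needed:
pairs = [(x, y) for x, y in zip(a, b) if x != 0.0]   # keep only positions where a is non-zero
a_filtered = [x for x, _ in pairs]
b_filtered = [y for _, y in pairs]
print(b_filtered)   # [9830, 1023]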

Replace specific column values with pd.NA

I am working on a data set that contains longitude and latitude values.
I converted those values to clusters using DBSCAN.
Then I plotted the clusters just as a sanity check.
The resulting plot (not shown here) has a point at (0, 0), which is obviously an issue.
So I ran this code to capture which row(s) are a problem.
a = df3.loc[(df3['latitude'] < 0.01) & (df3['longitude'] < 0.01)].index
print(a) # 1812 rows with 0.0 longitude and -2e-08 latitude
I have 1812 rows with missing data all represented as 0.0 longitude and -2e-08 latitude in the source file.
I am debating some imputation strategies but first I want to replace the 0.0 and -2e-08 values
with pd.NA or np.nan so that I can then use fillna() with whatever I ultimately decide to do.
I have tried both:
df3.replace((df3['longitude'] == 0.0), pd.NA, inplace=True)
df3.replace((df3['latitude'] == -2e-08), pd.NA, inplace=True)
print(df3['longitude'].value_counts(dropna=False), '\n')
print(df3['latitude'].value_counts(dropna=False))
and
df3.replace((df3['longitude'] < 0.01), pd.NA, inplace=True)
df3.replace((df3['latitude'] < 0.01), pd.NA, inplace=True)
print(df3['longitude'].value_counts(dropna=False), '\n')
print(df3['latitude'].value_counts(dropna=False))
In both cases the existing values remain in place, i.e., the desired substitution with pd.NA
does not occur.
What would be the correct procedure to replace the unwanted 1812 values in both the latitude and longitude columns with pd.NA or np.nan? I simply plan to impute something to replace the null values afterwards.
Try this one out:
df3['longitude'] = df3['longitude'].apply(lambda x:np.nan if x == 0.0 else x)
df3['latitude'] = df3['latitude'].apply(lambda x:np.nan if x==-2e-08 else x)
print(df3['longitude'].value_counts(dropna=False), '\n')
print(df3['latitude'].value_counts(dropna=False))
With an example:
import numpy as np
import pandas as pd
a = [1, 2, 0.0, -2e-08]
b = [1, 2, 0.0, -2e-08]
df = pd.DataFrame(zip(a, b))
df.columns = ['lat', 'long']
df.long = df.long.apply(lambda x:np.nan if x == 0.0 else x)
df.lat = df.lat.apply(lambda x:np.nan if x==-2e-08 else x)
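A vectorized alternative to apply, sketched under the assumption that df3 has plain float latitude/longitude columns, is to build one boolean mask for the bad rows and assign through .loc (np.nan is used here because pd.NA needs nullable dtypes):
bad = (df3['longitude'] == 0.0) & (df3['latitude'] == -2e-08)   # the 1812 placeholder rows
df3.loc[bad, ['longitude', 'latitude']] = np.nan                # fillna()/imputation can follow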

Starting with sklearn's NearestNeighbors output, how do I remove results where the record is its own nearest neighbor?

I want to use sklearn's NearestNeighbors() model to do some data analysis.
In my use case, I want to grab the N nearest neighbors and put it back into a pandas dataframe to evaluate the similarity of different records.
However, the results include the original record. In my case, that isn't useful. I want the nearest different records.
Example:
xtest = np.array([[1,1,1], [1,1,1], [1,.8,1], [.8,1,1]])
nn = NearestNeighbors(n_neighbors=2)
nn.fit(xtest)
distances, indices = nn.kneighbors(xtest)
returns:
(array([[0. , 0. ],
[0. , 0. ],
[0. , 0.2],
[0. , 0.2]]),
array([[0, 1],
[0, 1],
[2, 1],
[3, 1]], dtype=int64))
In the above arrays the cells at indices (0,0), (1,1), (2, 0) and (3,0) are unimportant.
My goal is to manipulate this output so that I can create the following columns in pandas:
"NearestNeighbor1" - the index of the nearest record other than itself
"NearestNeighbor1_dist" -the distance of the nearest record other than itself even if the distance is zero.
"NearestNeighbor2" - the index of the next nearest record other than itself
"NearestNeighbor2_dist" -the distance of the nearest record other than itself even if the distance is zero.
In the event of a tie, I don't care which record comes first (as long as it isn't itself).
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
xtest = np.array([[1,1,1], [1,1,1], [1,.8,1], [.8,1,1]])
nn = NearestNeighbors(n_neighbors=3)
nn.fit(xtest)
distances, indices = nn.kneighbors(xtest)
df_ind = pd.DataFrame(data=indices)
# x.name is the row's own index, so this drops the record itself from its neighbor list
df_ind = df_ind.apply(func=lambda x: [y for y in x if y != x.name], axis=1, result_type='expand')
df = pd.DataFrame({'NearestNeighbor1': df_ind.iloc[:, 0],
                   'NearestNeighbor1_dist': distances[:, 1],
                   'NearestNeighbor2': df_ind.iloc[:, 1],
                   'NearestNeighbor2_dist': distances[:, 2]})
print(df)
Output:
NearestNeighbor1 NearestNeighbor1_dist NearestNeighbor2 NearestNeighbor2_dist
0 1 0.0 2 0.2
1 0 0.0 2 0.2
2 1 0.2 0 0.2
3 1 0.2 0 0.2
This solution works for an arbitrary number of neighbors, although I wonder if there is a more elegant solution using numpy.
N_NBRS = 4
nbrs = NearestNeighbors(n_neighbors=N_NBRS + 1, algorithm='brute')
nbrs.fit(X)
dist_n, ix_n = nbrs.kneighbors(X)
replacement = []
for row_idx, row in enumerate(ix_n):
    new_row = [val for val in row if val != row_idx]
    new_row = new_row[:N_NBRS]  # truncate in the event of many exact matches
    replacement.append(new_row)
ix_n2 = np.array(replacement)
dist_n2 = dist_n[:, 1:]
results = X.copy()
for col_idx in range(N_NBRS):
    results[f'Neighbor{col_idx + 1}'] = ix_n2[:, col_idx]
    results[f'Neighbor{col_idx + 1}_dist'] = dist_n2[:, col_idx]
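One possible NumPy-only sketch for the "drop self" step, reusing dist_n, ix_n and N_NBRS from above: a stable argsort of the self-mask moves the non-self entries to the front of each row while preserving their order.
self_col = np.arange(ix_n.shape[0])[:, None]
not_self = ix_n != self_col                            # True where the neighbor is not the record itself
order = np.argsort(~not_self, axis=1, kind='stable')   # non-self columns first, original order preserved
keep = order[:, :N_NBRS]
ix_n2 = np.take_along_axis(ix_n, keep, axis=1)
dist_n2 = np.take_along_axis(dist_n, keep, axis=1)     # distances stay aligned with the kept indices
Unlike slicing dist_n[:, 1:], this keeps each distance paired with the index that was actually kept.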

Randomly select elements from string in a dataframe

I have a dataframe with 7 string columns:
bul; age; gender; hh; pn; freq_pn; rcrds_to_select
1; 2; 5; 1; ['35784905', '40666303', '47603805', '68229102'];4;3
2; 3; 3; 3; ['06299501', '07694901', '35070201'];3;2
In the last column I have the number of ids from the "pn" column that I need to select randomly. Example: in the first row I have 4 ids ['35784905', '40666303', '47603805', '68229102'] and I need to select 3 random ids and remove the ones that were not selected. There can be rows with only one id. I came to the conclusion that I need to turn the values into tuples and store them in another column ('pnTuple'). I don't know if this is the right way.
mass_grouped3['pnTuple'] = [tuple(x) for x in mass_grouped3['pn'].values]
I think random.shuffle will do the job, but I have no idea how to implement it in my script. I was thinking of something like this, but it is not working:
for row in mass_grouped3['pnTuple']:
    list = list(mass_grouped3['pnTuple'])
    whitelist = random.shuffle(list)
Any ideas how to do this selection are appreciated.
You want to randomly select 1 from every row and make the rest 0. Here's one approach. Sample the indices and based on indices assign 1. i.e
idx = pd.DataFrame(np.stack(np.where(df==1))).T.groupby(0).apply(lambda x: x.sample(1)).values
# array([[0, 2],
# [1, 1],
# [2, 0],
# [3, 3]])
ndf = pd.DataFrame(np.zeros(df.shape),columns=df.columns)
ndf.values[idx[:,0],idx[:,1]] = 1
W1 W2 W3 W4
0 0 0 1 0
1 1 0 0 0
2 1 0 0 0
3 0 1 0 0
Welcome to StackOverflow! Hope this helps
Let's go step by step.
First, let's construct the random selection that picks 3 items:
>>> import random
>>> random.choices(['35784905', '40666303', '47603805', '68229102'], k=3)
['68229102', '40666303', '35784905']
I have a sample data frame, df with columns with same data as yours
>>> df
a b
0 12 [35784905, 40666303, 47603805, 68229102]
1 12 [06299501, 07694901, 35070201]
>>> df['b']
0 [35784905, 40666303, 47603805, 68229102]
1 [06299501, 07694901, 35070201]
Name: b, dtype: object
>>> df['b'].map(lambda alist: random.choices(alist, k=3) if len(alist) > 3 else alist)
0 [35784905, 68229102, 35784905]
1 [06299501, 07694901, 35070201]
Name: b, dtype: object
>>> df['b'] = df['b'].map(lambda alist: random.choices(alist, k=3) if len(alist) > 3 else alist)
We use pandas' map operation to apply this transformation to the whole column.
Note: the lambda function lambda alist: random.choices(alist, k=3) if len(alist) > 3 else alist only applies the selection when a list has more than 3 items; shorter lists are left unchanged.
This is a standard way of writing such transformations in Python; it is worth reading up on lambda functions and pandas Series.map.
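Note that random.choices samples with replacement, which is why a duplicate id appears in the first row above. If the goal is to keep 3 distinct ids, a minimal sketch using random.sample instead:
df['b'] = df['b'].map(lambda alist: random.sample(alist, k=3) if len(alist) > 3 else alist)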

New column based on a row with conditions in Pandas

I'm trying to do an operation with DataFrames but I'm not sure how I can solve the problem using the built-in Pandas operations (actually my code is based on a for loop, so I'm trying to build a more elegant solution).
Given the following Dataframes, defined with the columns described below
original_df = [o1, o2, o3, o4]
weights_df = [w1, w2, w3, w4]
conditions_df = [c1, c2, c3, c4]
I need to build a new column on original_df based on the division o1/w1, but depending on the value of c1, which takes the values "+" or "-", I need to compute -o1/w1 instead.
All I have done so far is:
original_df['newcolumn'] = original_df / weights_df
which divides the two terms but does not apply the condition. I'm trying to do it with the map and apply functions, but I'm not sure how to bring the third column into the function.
original_df = [100, 200, 300, 400]
weights_df = [10, 20, 30, 40]
conditions_df = [1, 2, 3, 4]
df = pd.DataFrame({'x': original_df, 'y': weights_df, 'z': conditions_df})

def div(x, y, z):
    if z > 2:
        return float(x / y)
    else:
        return float(-1 * x / y)

df['new_feature'] = df.apply(lambda p: div(p['x'], p['y'], p['z']), axis=1)
This is one way of solving it. If your conditions_df contains '+'/'-' values, you can change the condition in div(x, y, z) accordingly, as sketched below.
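For example, if the condition column really holds '+'/'-' strings, a minimal sketch (reusing the illustrative column names from above) maps them to signed factors first:
sign = df['z'].map({'+': 1.0, '-': -1.0})   # assumes 'z' holds '+' / '-' strings
df['new_feature'] = df['x'] / df['y'] * sign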
You can use numpy.where to pick the sign from the condition:
#data from lisa answer
#df = pd.DataFrame({'x':original_df, 'y':weights_df, 'z':conditions_df})
df['new_feature'] = df['x'] / df['y'] * np.where(df['z'] > 2, 1, -1)
print (df)
x y z new_feature
0 100 10 1 -10.0
1 200 20 2 -10.0
2 300 30 3 10.0
3 400 40 4 10.0
Timings:
#4k rows
df = pd.concat([df]*1000).reset_index(drop=True)
#lisa answer
In [95]: %timeit df['new_feature1'] = df.apply(lambda p: div(p['x'], p['y'], p['z']), axis=1)
10 loops, best of 3: 123 ms per loop
In [96]: %timeit df['new_feature2'] = df['x'] / df['y'] * np.where(df['z'] > 2, 1, -1)
1000 loops, best of 3: 595 µs per loop
