How to replace a cell in a pandas DataFrame - python-3.x

After building the pandas DataFrame below (for example)
import pandas
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pandas.DataFrame(data, columns=['Name', 'Age'])
If I iterate through it, I get
In [62]: for i in df.itertuples():
...: print( i.Index, i.Name, i.Age )
...:
0 Alex 10
1 Bob 12
2 Clarke 13
What I would like to achieve is to replace the value of a particular cell
In [67]: for i in df.itertuples():
...: if i.Name == "Alex":
...: df.at[i.Index, 'Age'] = 100
...:
Which seems to work
In [64]: df
Out[64]:
Name Age
0 Alex 100
1 Bob 12
2 Clarke 13
The problem appears when I use a larger, different dataset. First, I create a new column named NETELEMENT with a default value of "". I would then like to replace that default "" with the string that the function lookup_netelement returns:
df['NETELEMENT'] = ""
for i in df.itertuples():
    df.at[i.Index, 'NETELEMENT'] = lookup_netelement(i.PEER_SRC_IP)
    print(i, lookup_netelement(i.PEER_SRC_IP))
But what I get as a result is:
Pandas(Index=769, SRC_AS='', DST_AS='', COMMS='', SRC_COMMS=nan, AS_PATH='', SRC_AS_PATH=nan, PREF='', SRC_PREF='0', MED='0', SRC_MED='0', PEER_SRC_AS='0', PEER_DST_AS='', PEER_SRC_IP='x.x.x.x', PEER_DST_IP='', IN_IFACE='', OUT_IFACE='', PROTOCOL='udp', TOS='0', BPS=35200.0, SRC_PREFIX='', DST_PREFIX='', NETELEMENT='', IN_IFNAME='', OUT_IFNAME='') routerX
meaning that it should be:
NETELEMENT='routerX' instead of NETELEMENT=''
Could you please advise what I am doing wrong?
EDIT: for completeness, lookup_netelement is defined as
def lookup_netelement(ipaddr):
    try:
        x = LOOKUP['conn'].hget('ipaddr;{}'.format(ipaddr), 'dev') or b""
    except Exception as e:  # 'e' must be bound here for the log call below
        logger.error('looking up `ipaddr` for netelement caused `{}`'.format(repr(e)), exc_info=True)
        x = b""
    x = x.decode("utf-8")
    return x

It sounds like you are looking for where for conditional replacement, i.e.
def wow(x):
    return x ** 10

df['new'] = df['Age'].where(~(df['Name'] == 'Alex'), wow(df['Age']))
Output :
Name Age new
0 Alex 10 10000000000
1 Bob 12 12
2 Clarke 13 13
3 Alex 15 576650390625
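Note that where keeps the original values wherever the condition is True and substitutes the second argument elsewhere, hence the ~ to invert the mask. Equivalently, Series.mask replaces values where the condition is True, which avoids the negation:
df['new'] = df['Age'].mask(df['Name'] == 'Alex', wow(df['Age']))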
Based on your edit, you're trying to apply the function, i.e.
df['new'] = df['PEER_SRC_IP'].apply(lookup_netelement)
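Since the real lookup_netelement needs a live LOOKUP connection, here is a self-contained sketch of the apply pattern with a hypothetical stand-in function (names and addresses invented for illustration):
import pandas as pd

# hypothetical stand-in for lookup_netelement (the real one queries Redis)
def lookup_netelement_stub(ipaddr):
    table = {'10.0.0.1': 'routerX', '10.0.0.2': 'routerY'}
    return table.get(ipaddr, '')

df = pd.DataFrame({'PEER_SRC_IP': ['10.0.0.1', '10.0.0.2', '10.0.0.3']})
df['NETELEMENT'] = df['PEER_SRC_IP'].apply(lookup_netelement_stub)
print(df)  # NETELEMENT becomes routerX, routerY, '' respectively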
Edit: for your comment on sending two columns, use a lambda with axis=1, i.e.
def wow(x, y):
    return '{} {}'.format(x, y)

df.apply(lambda x: wow(x['Name'], x['Age']), axis=1)
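For example, with the toy DataFrame from the top of the question and the wow defined just above, this yields a Series of 'Name Age' strings:
import pandas as pd

df = pd.DataFrame([['Alex', 10], ['Bob', 12], ['Clarke', 13]], columns=['Name', 'Age'])
print(df.apply(lambda x: wow(x['Name'], x['Age']), axis=1))
# 0      Alex 10
# 1       Bob 12
# 2    Clarke 13
# dtype: object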

Related

Count element in list if it is present in each row of a column. Add to a new column (pandas)

I have a pandas df like this:
MEMBERSHIP
[2022_K_, EWREW_NK]
[333_NFK_,2022_K_, EWREW_NK, 000]
And I have a list of keys:
list_k = ["_K_","_NK_","_NKF_","_KF_"]
I want to add a column that counts how many of those keys appear in each row. The desired output is:
MEMBERSHIP | COUNT
[2022_K_, EWREW_NK] | 2
[333_NFK_,2022_K_, EWREW_NK, 000] | 3
Can you help me?
IIUC, you can use the pandas .str accessor methods with a regex:
import pandas as pd
df = pd.DataFrame({'MEMBERSHIP': [['2022_K_', 'EWREW_NK'],
                                  ['333_NFK_', '2022_K_', 'EWREW_NK', '000']]})
list_k = ["_K_","_NK","_NFK_","_KF_"] #I changed this list a little
reg = '|'.join(list_k)
df['count'] = df['MEMBERSHIP'].explode().str.contains(reg).groupby(level=0).sum()
print(df)
Output:
MEMBERSHIP count
0 [2022_K_, EWREW_NK] 2
1 [333_NFK_, 2022_K_, EWREW_NK, 000] 3
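To see why this works, the chain can be broken into steps (same toy data):
exploded = df['MEMBERSHIP'].explode()       # one list element per row, original index repeated
hits = exploded.str.contains(reg)           # True where the element matches any key in the regex
df['count'] = hits.groupby(level=0).sum()   # sum the matches back per original row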
You can use a lambda function:
def check(x):
    total = 0
    for i in x:
        if type(i) != str:  # if the value is not a string, skip it
            pass
        else:
            for j in list_k:
                if j in i:
                    total += 1
    return total

df['count'] = df['MEMBERSHIP'].apply(lambda x: check(x))
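Note that df['MEMBERSHIP'].apply(check) is equivalent here, since check already takes a single argument; the lambda wrapper adds nothing.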
I came up with this dumb code:
count_row = 0
df['Count'] = None
for i in df['MEMBERSHIP_SPLIT']:
    count_element = 0
    for sub in i:
        for e in list_k:
            if e in sub:
                count_element += 1
    df.loc[count_row, 'Count'] = count_element  # .loc avoids the chained-assignment pitfall of df['Count'][count_row]
    count_row += 1

Format data using pandas groupby such that it groups by one column

I have the data in the format below in a CSV:
However, the format I require is below:
I have written the code below, but somehow the groupby is not working for me.
def grouping():
    df = pd.read_csv("final_data_6.csv")
    df['n'] = df.apply(lambda x: (x['data'], x['Period']), axis=1)
    df.groupby(['data', 'Period'])['n'].apply(list).reset_index()
    df.to_csv("final_data_9.csv", encoding="utf-8", index=False)
Use GroupBy.agg to create dictionaries filled with lists:
def grouping():
    df = pd.read_csv("final_data_6.csv")
    df['n'] = [x for x in zip(df['positions'], df['Period'])]
    df = df.groupby('data')['n'].agg(lambda x: {'entities': list(x)}).reset_index(name='entity')
    df.to_csv("final_data_9.csv", encoding="utf-8", index=False)
Sample data test:
print (df)
data positions Period
0 abc 37,41 disease
1 abc 10,16 drugs
2 def 4,14 indication
3 def 78,86 intervention
df['n'] = [x for x in zip(df['positions'], df['Period'])]
print (df)
data positions Period n
0 abc 37,41 disease ('37,41', 'disease')
1 abc 10,16 drugs ('10,16', 'drugs')
2 def 4,14 indication ('4,14', 'indication')
3 def 78,86 intervention ('78,86', 'intervention')
df = df.groupby('data')['n'].agg(lambda x: {'entities': list(x)}).reset_index(name='entity')
print (df)
data entity
0 abc {'entities': [('37,41', 'disease'), ('10,16', ...
1 def {'entities': [('4,14', 'indication'), ('78,86'...
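Note that to_csv will serialize each dict as its Python repr string, so if you need final_data_9.csv back as dicts later you will have to parse that column (for example with ast.literal_eval).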

Use lambda, apply, and join function on a pandas dataframe

Goal
Apply deid_notes function to df
Background
I have a df that resembles this sample df
import pandas as pd
df = pd.DataFrame({'Text': ['there are many different types of crayons',
                            'i like a lot of sports cares',
                            'the middle east has many camels '],
                   'P_ID': [1, 2, 3],
                   'Word': ['crayons', 'cars', 'camels'],
                   'P_Name': ['John', 'Mary', 'Jacob'],
                   'N_ID': ['A1', 'A2', 'A3']
                   })
#rearrange columns
df = df[['Text','N_ID', 'P_ID', 'P_Name', 'Word']]
df
Text N_ID P_ID P_Name Word
0 many types of crayons A1 1 John crayons
1 i like sports cars A2 2 Mary cars
2 has many camels A3 3 Jacob camels
I use the following function to deidentify certain words within the Text column using NeuroNER http://neuroner.com/
def deid_notes(text):
    # use the predict function from neuroNER to tag words to be deidentified
    ner_list = n1.predict(text)
    # n1.predict won't work in this toy example because the neuroNER package needs to be installed (and installation is difficult)
    # but the output resembles this: [{'start': 1, 'end': 11, 'id': 1, 'tagged word': 'crayon'}]
    # use the start and end positions of tagged words to deidentify and replace with **BLOCK**
    if len(ner_list) > 0:
        parts_to_take = [(0, ner_list[0]['start'])] + [(first["end"] + 1, second["start"]) for first, second in zip(ner_list, ner_list[1:])] + [(ner_list[-1]['end'], len(text) - 1)]
        parts = [text[start:end] for start, end in parts_to_take]
        deid = '**BLOCK**'.join(parts)
    # if n1.predict does not identify any words to be deidentified, place NaN
    else:
        deid = 'NaN'
    return pd.Series(deid, index='Deid')
Problem
I apply the deid_notes function to my df using the following code
fx = lambda x: deid_notes(x.Text,axis=1)
df.join(df.apply(fx))
But I get the following error
AttributeError: ("'Series' object has no attribute 'Text'", 'occurred at index Text')
Question
How do I get the deid_notes function to work on my df?
Assuming you are returning a pandas Series as the output of the deid_notes function, with text as the only input argument: pass the axis=1 argument to apply instead of to deid_notes. For example:
# dummy function
def deid_notes(text):
    deid = 'prediction to: ' + text
    return pd.Series(deid, index=['Deid'])

fx = lambda x: deid_notes(x.Text)
df.join(df.apply(fx, axis=1))
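With the sample df, df.apply(fx, axis=1) returns a one-column DataFrame named Deid, and the join attaches it to the original columns, giving values like 'prediction to: there are many different types of crayons' for the first row.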

How to split a DataFrame in pandas in predefined percentages?

I have a pandas dataframe sorted by a number of columns. Now I'd like to split the dataframe in predefined percentages, so as to extract and name a few segments.
For example, I want to take the first 20% of rows to create the first segment, then the next 30% for the second segment and leave the remaining 50% to the third segment.
How would I achieve that?
Use numpy.split:
a, b, c = np.split(df, [int(.2*len(df)), int(.5*len(df))])
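Note that np.split takes cumulative boundary positions, so [int(.2*len(df)), int(.5*len(df))] cuts at 20% and at 20% + 30% = 50% of the length, producing the 20/30/50 segments.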
Sample:
np.random.seed(100)
df = pd.DataFrame(np.random.random((20,5)), columns=list('ABCDE'))
#print (df)
a, b, c = np.split(df, [int(.2*len(df)), int(.5*len(df))])
print (a)
A B C D E
0 0.543405 0.278369 0.424518 0.844776 0.004719
1 0.121569 0.670749 0.825853 0.136707 0.575093
2 0.891322 0.209202 0.185328 0.108377 0.219697
3 0.978624 0.811683 0.171941 0.816225 0.274074
print (b)
A B C D E
4 0.431704 0.940030 0.817649 0.336112 0.175410
5 0.372832 0.005689 0.252426 0.795663 0.015255
6 0.598843 0.603805 0.105148 0.381943 0.036476
7 0.890412 0.980921 0.059942 0.890546 0.576901
8 0.742480 0.630184 0.581842 0.020439 0.210027
9 0.544685 0.769115 0.250695 0.285896 0.852395
print (c)
A B C D E
10 0.975006 0.884853 0.359508 0.598859 0.354796
11 0.340190 0.178081 0.237694 0.044862 0.505431
12 0.376252 0.592805 0.629942 0.142600 0.933841
13 0.946380 0.602297 0.387766 0.363188 0.204345
14 0.276765 0.246536 0.173608 0.966610 0.957013
15 0.597974 0.731301 0.340385 0.092056 0.463498
16 0.508699 0.088460 0.528035 0.992158 0.395036
17 0.335596 0.805451 0.754349 0.313066 0.634037
18 0.540405 0.296794 0.110788 0.312640 0.456979
19 0.658940 0.254258 0.641101 0.200124 0.657625
Creating a dataframe with 70% of the values of the original dataframe:
part_1 = df.sample(frac=0.7)
Creating a dataframe with the remaining 30% of the values:
part_2 = df.drop(part_1.index)
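The same sample/drop idea extends to the 20/30/50 split from the question; a sketch (note that sample shuffles, so unlike np.split this does not preserve the original row order):
part_1 = df.sample(frac=0.2)        # 20% of the rows
rest = df.drop(part_1.index)
part_2 = rest.sample(frac=0.375)    # 30% of the original is 37.5% of the remaining 80%
part_3 = rest.drop(part_2.index)    # the remaining 50%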
I've written a simple function that does the job; maybe it will help you.
P.S.: the sum of the fractions must be 1.
It will return len(fracs) new dfs, so you can pass a fractions list as long as you want (e.g. fracs=[0.1, 0.1, 0.3, 0.2, 0.2]).
np.random.seed(100)
df = pd.DataFrame(np.random.random((99, 4)))

def split_by_fractions(df: pd.DataFrame, fracs: list, random_state: int = 42):
    assert sum(fracs) == 1.0, 'fractions sum is not 1.0 (fractions_sum={})'.format(sum(fracs))
    remain = df.index.copy().to_frame()
    res = []
    for i in range(len(fracs)):
        fractions_sum = sum(fracs[i:])
        frac = fracs[i] / fractions_sum
        idxs = remain.sample(frac=frac, random_state=random_state).index
        remain = remain.drop(idxs)
        res.append(idxs)
    return [df.loc[idxs] for idxs in res]

train, test, val = split_by_fractions(df, [0.8, 0.1, 0.1])  # e.g. [train, test, validation]
print(train.shape, test.shape, val.shape)
outputs:
(79, 4) (10, 4) (10, 4)
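Because random_state defaults to a fixed value, repeated calls with the same arguments return identical splits; pass a different seed to get a different shuffle.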

Python Pandas: bootstrap confidence limits by row rather than entire dataframe

What I am trying to do is get bootstrap confidence limits by row, regardless of the number of rows, and make a new dataframe from the output. I can currently do this for the entire dataframe, but not by row. The data in my actual program looks similar to what I have below:
0 1 2
0 1 2 3
1 4 1 4
2 1 2 3
3 4 1 4
I want the new dataframe to look something like this with the lower and upper confidence limits:
0 1
0 1 2
1 1 5.5
2 1 4.5
3 1 4.2
The current generated output looks like this:
0 1
0 2.0 2.75
The Python 3 code below generates a mock dataframe and the bootstrap confidence limits for the entire dataframe. The result is a new dataframe with just 2 values, an upper and a lower confidence limit, rather than 4 sets of 2 (one for each row).
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1, 2], [2, 3], [3, 6]], [[4, 2], [1, 4], [4, 6]],
                   [[1, 2], [2, 3], [3, 6]], [[4, 2], [1, 4], [4, 6]]])
print(zz)
x = zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a)
b = pd.DataFrame(b)
b = b.T
print(b)
Thank you for any help.
scikits.bootstrap operates by assuming that data samples are arranged by row, not by column. If you want the opposite behavior, just use the transpose, and a statfunction that doesn't combine columns.
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1, 2], [2, 3], [3, 6]], [[4, 2], [1, 4], [4, 6]],
                   [[1, 2], [2, 3], [3, 6]], [[4, 2], [1, 4], [4, 6]]])
print(zz)
x = zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a.T, statfunction=lambda x: np.average(x, axis=0))
print(b.T)
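Here b should come back with the lower limits in the first row and the upper limits in the second, so b.T prints one (lower, upper) pair per original row of a, matching the desired 4 x 2 layout.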
Below is what I ended up figuring out to create the bootstrap CIs by row.
import pandas as pd
import numpy as np
import numpy.random as npr
zz = pd.DataFrame([[[1, 2], [2, 3], [3, 6]], [[4, 2], [1, 4], [4, 6]],
                   [[1, 2], [2, 3], [3, 6]], [[4, 2], [1, 4], [4, 6]]])
x = zz.dtypes
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
def bootstrap(data, num_samples, statistic, alpha):
    n = len(data)
    idx = npr.randint(0, n, (num_samples, n))
    samples = data[idx]
    stat = np.sort(statistic(samples, 1))
    return (stat[int((alpha/2.0)*num_samples)],
            stat[int((1-alpha/2.0)*num_samples)])
cc = list(a.index.values)  # informs the generator of the number of rows

def bootbyrow(cc):
    for xx in range(1):
        xx = list(a.index.values)
        for xx in range(len(cc)):
            k = a.apply(lambda y: y[xx])
            k = k.values
            for xx in range(1):
                kk = list(bootstrap(k, 10000, np.mean, 0.05))
                yield list(kk)

abc = pd.DataFrame(list(bootbyrow(cc)))  # bootstrap CI by row
# the next 4 calls just show that it's working correctly
a0 = bootstrap((a.loc[0,].values),10000,np.mean,0.05)
a1 = bootstrap((a.loc[1,].values),10000,np.mean,0.05)
a2 = bootstrap((a.loc[2,].values),10000,np.mean,0.05)
a3 = bootstrap((a.loc[3,].values),10000,np.mean,0.05)
print(abc)
print(a0)
print(a1)
print(a2)
print(a3)
