How to alter a column of a dataframe with different values under various conditions in Python? - python-3.x

I have a dataframe where I want to alter the column "conf" with different values according to which conditions are satisfied.
df=pd.DataFrame({"conf":[100,100,100,100],
"i":[-2,3,-3,10],
"o":[12,13,14,16],
"n":[6,4,6,1],
"id":[1.4,2,1.3,1.7],
"od":[2,3,2.5,8],
"nd":[2,3,2.4,-0.9],
"iwr":[60,60,45,65],
"owr":[65,88,90,78],
"nwr":[67,63,60,60]})
df
I want to do something like the following, but I am doing it the wrong way because I am passing a Series to the if condition, which is not valid.
def column_alter(df):
    if (df["id"] > 1.5) & (df["iwr"] > 1.5):
        if (df["od"] > 1.5) & (df["nd"] > 1.5) & (df["owr"] > 60) & (df["nwr"] > 60):
            df["conf"] = df["conf"]
        else:
            df["conf"] = df["conf"] * 0.5
    else:
        if (df["od"] > 1.5) & (df["nd"] > 1.5) & (df["owr"] > 60) & (df["nwr"] > 60):
            df["conf"] = df["conf"]
        else:
            df["conf"] = df["conf"] * 0.25
    return df
Required output: I want to return the whole dataframe with the modified conf values, i.e. [100, 100, 25, 50].

For the else branches you can invert each mask with ~, chain the masks with & and multiply the selected rows via DataFrame.loc:
m0 = (df["id"]>1.5) & (df["iwr"]>1.5)
m1 = (df["od"]>1.5) & (df["nd"]>1.5) & (df["owr"]>60) & (df["nwr"]>60)
df.loc[m0 & ~m1, "conf"] *= 0.5
df.loc[~m0 & ~m1, "conf"] *= 0.25
print (df)
conf i o n id od nd iwr owr nwr
0 100.0 -2 12 6 1.4 2.0 2.0 60 65 67
1 100.0 3 13 4 2.0 3.0 3.0 60 88 63
2 25.0 -3 14 6 1.3 2.5 2.4 45 90 60
3 50.0 10 16 1 1.7 8.0 -0.9 65 78 60
Another solution with numpy.select:
m0 = (df["id"]>1.5) & (df["iwr"]>1.5)
m1 = (df["od"]>1.5) & (df["nd"]>1.5) & (df["owr"]>60) & (df["nwr"]>60)
df['conf'] *= np.select([m0 & ~m1, ~m0 & ~m1], [0.5, 0.25], default=1)
# long alternative
# df['conf'] = np.select([m0 & ~m1, ~m0 & ~m1],
#                        [df['conf'] * 0.5, df['conf'] * 0.25], default=df['conf'])
print (df)
conf i o n id od nd iwr owr nwr
0 100.0 -2 12 6 1.4 2.0 2.0 60 65 67
1 100.0 3 13 4 2.0 3.0 3.0 60 88 63
2 25.0 -3 14 6 1.3 2.5 2.4 45 90 60
3 50.0 10 16 1 1.7 8.0 -0.9 65 78 60
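If you need the whole dataframe back from a single call, as in the original column_alter, the two masks can be wrapped into a function. This is only a sketch combining the approaches above, not a separate method:
def column_alter(df):
    # keep conf where m1 holds, halve it where m0 holds but m1 fails,
    # quarter it where both masks fail
    m0 = (df["id"] > 1.5) & (df["iwr"] > 1.5)
    m1 = (df["od"] > 1.5) & (df["nd"] > 1.5) & (df["owr"] > 60) & (df["nwr"] > 60)
    out = df.copy()
    out["conf"] = out["conf"] * np.select([m0 & ~m1, ~m0 & ~m1], [0.5, 0.25], default=1)
    return out
Calling column_alter(df) then reproduces the required conf values [100, 100, 25, 50].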

Related

Moving aggregate within a specified date range

Using the sample credit card transaction data below:
import random
from datetime import datetime, timedelta

import pandas as pd

df = pd.DataFrame({
    'card_id': [1, 1, 1, 2, 2],
    'date': [datetime(2020, 6, random.randint(1, 14)) for i in range(5)],
    'amount': [random.randint(1, 100) for i in range(5)]})
df
card_id date amount
0 1 2020-06-07 11
1 1 2020-06-11 45
2 1 2020-06-14 87
3 2 2020-06-04 48
4 2 2020-06-12 76
For each transaction, I'm trying to compute the total amount spent on that card in the past 7 days, as of the point of the transaction. For example, if card_id 1 made a transaction on June 8, I want the total of its transactions from June 1 to June 7. This is what I was hoping to get:
card_id date amount sum_past_7d
0 1 2020-06-07 11 0
1 1 2020-06-11 45 11
2 1 2020-06-14 87 56
3 2 2020-06-04 48 0
4 2 2020-06-12 76 48
I'm currently using this function with pd.apply to generate my desired column, but it's taking too long on the actual data (> 1 million rows).
df['past_week'] = df['date'].apply(lambda x: x - timedelta(days=7))

def myfunction(x):
    return df.loc[(df['card_id'] == x.card_id) &
                  (df['date'] >= x.past_week) &
                  (df['date'] < x.date), 'amount'].sum()

df['sum_past_7d'] = df.apply(myfunction, axis=1)
Is there a faster and more efficient way to do this?
Let's try rolling on date with groupby:
# make sure the data is sorted properly
# your sample is already sorted, so you can skip this
df = df.sort_values(['card_id', 'date'])
df['sum_past_7D'] = (df.set_index('date').groupby('card_id')
                       ['amount'].rolling('7D').sum()            # 7-day sum per card, including the current row
                       .groupby('card_id').shift(fill_value=0)   # shift so the current transaction is excluded
                       .values
                     )
Output:
card_id date amount sum_past_7D
0 1 2020-06-07 11 0.0
1 1 2020-06-11 45 11.0
2 1 2020-06-14 87 56.0
3 2 2020-06-04 48 0.0
4 2 2020-06-12 76 48.0
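The shift within each card_id group is what removes the current transaction from its own 7-day window. On recent pandas versions the same idea can be written with the closed parameter of rolling, which makes the window [date - 7D, date) and excludes the current row directly; this is only a sketch, and it differs from the shift approach when a card has several transactions on the same day:
s = (df.set_index('date')
       .groupby('card_id')['amount']
       .rolling('7D', closed='left').sum())   # window already excludes the current transaction
df['sum_past_7D'] = s.fillna(0).values        # the first row of each card has an empty window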

Fill in missing values in DataFrame Column which is incrementing by 10

Say some values in the 'Counts' column are missing. The numbers are meant to increase by 10 with each row, so '35' and '55' need to be put in place. I want to fill in these missing values.
Counts
0 25
1 NaN
2 45
3 NaN
4 65
So my output should be :
Counts
0 25
1 35
2 45
3 55
4 65
Thanks,
We can use interpolate:
df = df.interpolate()
Counts
0 25.0
1 35.0
2 45.0
3 55.0
4 65.0
Since you know the pattern, you can simply recreate it:
import numpy as np

start = df.iloc[0]['Counts']   # first row
end = df.iloc[-1]['Counts']    # last row
df['Counts'] = np.where(df['Counts'].notnull(), df['Counts'],
                        np.arange(start, end + 1, 10))
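A small variant of the same idea, in case the last row is also missing: rebuild the full arithmetic sequence from the first value and the known step of 10, and use it only where Counts is NaN. This is a sketch and assumes the rows are consecutive (pandas/numpy imports as above):
start = df['Counts'].iloc[0]
expected = pd.Series(start + 10 * np.arange(len(df)), index=df.index)
df['Counts'] = df['Counts'].fillna(expected)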

Replace values in Columns

I want to replace values in columns using an if/else loop:
If the value in column D does not match any of the values in columns A, B or C, then replace the first NaN in that row with the value of D; if the row has no NaN, create a new column E and put the value of D there.
ID A B C D
0 22 32 NaN 22
1 25 13 NaN 15
2 27 NaN NaN 20
3 29 10 16 29
4 12 92 33 55
I want output to be:
ID A B C D E
0 22 32 NaN 22
1 25 13 15 15
2 27 20 NaN 20
3 29 10 16 29
4 12 92 33 55 55
List = [[22, 32, None, 22],
        [25, 13, None, 15],
        [27, None, None, 20],
        [29, 10, 16, 29],
        [12, 92, 33, 55]]

for Row in List:
    Target_C = Row[3]
    if Row.count(Target_C) < 2:                   # D does not match any other column
        None_Found = False                        # small bool to check later whether a None was found
        for enumerate_Column in enumerate(Row):   # get (index, value) pairs for the row
            if None in enumerate_Column:          # if this entry of the row is None
                Row[enumerate_Column[0]] = Target_C  # replace the None with the value of column D
                None_Found = True                 # record that a None was found
            if None_Found:                        # break the loop once a None was replaced
                break
        if None_Found == False:                   # if no None was found, append a new column
            Row.append(Target_C)
That is my code example using plain Python lists.
You can do it this way:
a = df.isnull()
b = a[a.any(axis=1)].idxmax(axis=1)      # first NaN column for every row that has one
nanindex = b.index
check = (df.A != df.D) & (df.B != df.D) & (df.C != df.D)
commonind = check[~check].index          # rows where D matches A, B or C
replace_ind_list = list(nanindex.difference(commonind))   # rows that get the first NaN filled with D
new_col_list = df.index.difference(list(set(commonind.tolist() + nanindex.tolist()))).tolist()  # rows that get E
df['E'] = ''
for index, row in df.iterrows():
    for val in new_col_list:
        if index == val:
            df.at[index, 'E'] = df['D'][index]
    for val in replace_ind_list:
        if index == val:
            df.at[index, b[val]] = df['D'][index]
df
Output
ID A B C D E
0 0 22 32.0 NaN 22
1 1 25 13.0 15.0 15
2 2 27 20.0 NaN 20
3 3 29 10.0 16.0 29
4 4 12 92.0 33.0 55 55
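A more direct pandas variant of the same logic (my own sketch, not part of the answer above): compare D against A, B and C in one step, find the first NaN column per row, and only loop over the rows that actually need the replacement.
import numpy as np
import pandas as pd

df = pd.DataFrame({'ID': [0, 1, 2, 3, 4],
                   'A': [22, 25, 27, 29, 12],
                   'B': [32, 13, np.nan, 10, 92],
                   'C': [np.nan, np.nan, np.nan, 16, 33],
                   'D': [22, 15, 20, 29, 55]})

no_match = ~df[['A', 'B', 'C']].eq(df['D'], axis=0).any(axis=1)   # D matches no other column
has_nan = df[['A', 'B', 'C']].isna().any(axis=1)
first_nan = df[['A', 'B', 'C']].isna().idxmax(axis=1)             # name of the first NaN column per row

df['E'] = np.where(no_match & ~has_nan, df['D'], np.nan)          # nothing to fill -> new column E
for i in df.index[no_match & has_nan]:                            # otherwise fill the first NaN with D
    df.at[i, first_nan[i]] = df.at[i, 'D']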

Python Pandas: How to insert a new column which is a sum of next 'n' (can be a fraction also) values of another column?

I've got a DataFrame, let's say its name is 'test', storing the data below:
Week Stock(In Number of Weeks) Demand (In Units)
0 W01 2.4 37
1 W02 3.6 33
2 W03 2.0 46
3 W04 5.8 45
4 W05 4.6 56
5 W06 3.0 38
6 W07 5.0 45
7 W08 7.5 54
8 W09 4.3 35
9 W10 2.2 38
10 W11 2.0 50
11 W12 6.0 37
I want to insert a new column in this dataframe which, for every row, is the sum of the next "Stock(In Number of Weeks)" rows of the column "Demand (In Units)".
That is, in the case of this dataframe,
for the 0th row the new column should be the sum of 2.4 rows of "Demand (In Units)", which is 37 + 33 + 0.4*46
for the 1st row, the value should be 33 + 46 + 45 + 0.6*56
for the 2nd row, it should be 46 + 45
.
.
.
for the 7th row, it should be 54 + 35 + 38 + 50 + 37 (since the number of rows left is smaller than 7.5, all the remaining rows are summed up)
.
.
.
and so on.
Effectively, I want my dataframe to have a new column as follows:
Week Stock(In Number of Weeks) Demand (In Units) Stock (In Units)
0 W01 2.4 37 88.4
1 W02 3.6 33 157.6
2 W03 2.0 46 91.0
3 W04 5.8 45 266.0
4 W05 4.6 56 214.0
5 W06 3.0 38 137.0
6 W07 5.0 45 222.0
7 W08 7.5 54 214.0
8 W09 4.3 35 160.0
9 W10 2.2 38 95.4
10 W11 2.0 50 87.0
11 W12 6.0 37 37.0
Can somebody suggest a way to achieve this?
I can achieve it by iterating over each row, but that would be very slow for the millions of rows I want to process at a time.
The code which I am using right now is:
for i in range(len(test)):
    if int(np.floor(test.loc[i, 'Stock(In Number of Weeks)'])) >= len(test[i:]):
        number_of_full_rows = len(test[i:])
        fraction_of_last_row = 0
        y = 0
    else:
        number_of_full_rows = int(np.floor(test.loc[i, 'Stock(In Number of Weeks)']))
        fraction_of_last_row = test.loc[i, 'Stock(In Number of Weeks)'] - number_of_full_rows
        y = test.loc[i + number_of_full_rows, 'Demand (In Units)'] * fraction_of_last_row
    x = np.sum(test[i:i + number_of_full_rows]['Demand (In Units)'])
    test.loc[i, 'Stock (In Units)'] = x + y
I tried with some test data:
def func(r, col):
    n = int(r['Stock(In Number of Weeks)'])           # number of full weeks
    f = float(r['Stock(In Number of Weeks)'] - n)     # fractional part of the last week
    i = r.name                                        # row index value
    z = np.zeros(len(df))                             # initialize all zeros
    v = np.hstack((np.ones(n), np.array(f)))          # vector of ones plus the fraction part
    e = min(len(v), len(z[i:]))
    z[i:i + e] = v[:e]                                # place the weights starting at this row
    r['Stock (In Units)'] = z @ col                   # scalar product with the demand column
    return r

df = df.apply(lambda r: func(r, df['Demand (In Units)'].values), axis=1)
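A fully vectorized sketch of my own (not the answer above), using cumulative sums so no Python-level loop is needed; it assumes the column names and the 'test' dataframe from the question:
import numpy as np

demand = test['Demand (In Units)'].to_numpy(dtype=float)
weeks = test['Stock(In Number of Weeks)'].to_numpy(dtype=float)
idx = np.arange(len(test))

csum = np.concatenate(([0.0], np.cumsum(demand)))                    # csum[k] = sum of the first k demands
n_full = np.minimum(np.floor(weeks).astype(int), len(test) - idx)    # full rows still available
frac = np.where(idx + np.floor(weeks).astype(int) < len(test),
                weeks - np.floor(weeks), 0.0)                        # fractional row only if it exists
full_sum = csum[idx + n_full] - csum[idx]                            # sum over the full rows
frac_sum = frac * np.append(demand, 0.0)[idx + n_full]               # plus the fraction of the next row
test['Stock (In Units)'] = full_sum + frac_sum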

Parsing table with streaks of binaries to select larger group element

I have a table like the following (only much longer):
# time binary frequency
0 2.1 0 0.65
1 3.2 1 0.72
2 5.8 0 0.64
3 7.1 0 0.63
4 9.5 1 0.72
5 14.1 1 0.74
6 21.5 0 0.62
7 27.3 0 0.61
8 29.5 1 1.00
9 32.1 1 1.12
10 35.5 1 0.99
I want to collect all the times corresponding to binary == 1 only and, within each of those small groups, keep the time whose frequency value is highest. In the table above, this would result in:
times = 3.2, 14.1, 32.1
I am not sure how to handle the sequential structure of the table in the first place, nor how to compare the values within each group and return only the corresponding time (and not, for example, the largest frequency). Time hides a periodicity, so I would rather avoid building another table with only the binary == 1 elements.
Having my time, binary, and frequency arrays, I can isolate relevant elements by:
condition = (binary == 1)
time1 = time[condition]
frequency1 = frequency[condition]
but I do not know how to proceed to isolate the various streaks. What are useful functions I can use?
I don't know that there are any clever functions to use for this. Here's some code that will do the job. Please note that I removed the headers from your file.
binary is either zero or one, depending on whether the row's other values are to be included in a group. Initially in_group is set to False to indicate that no group has started. As rows are read, when binary is zero and the code has been reading rows for a group (so in_group is True), in_group is set back to False because the zero means that group has come to an end; since processing of the group has ended, the result for it is printed. When binary is one and in_group is True, the code is already inside a group and checks whether the newest frequency is greater than what has been seen before; if so, it updates both rep_time and rep_frequency. If in_group is False, this is the first row of a new group, so in_group is set to True and initial values of rep_time and rep_frequency are recorded.
with open('pyser.txt') as pyser:
    in_group = False
    for line in pyser:
        _, time, binary, frequency = [float(_) for _ in line.rstrip().split()]
        if binary == 0:
            if in_group:
                in_group = False
                print(rep_time)
        else:
            if in_group:
                if frequency > rep_frequency:
                    rep_time, rep_frequency = time, frequency
            else:
                in_group = True
                rep_time, rep_frequency = time, frequency
    if in_group:
        print(rep_time)
Output:
3.2
14.1
32.1
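A compact NumPy alternative (my own sketch, assuming the time, binary and frequency arrays from the question): label each streak of ones by counting 0-to-1 edges, then take the time of the maximum frequency within each streak.
import numpy as np

streak_id = np.cumsum(np.diff(np.concatenate(([0], binary))) == 1)   # increases by 1 at every 0 -> 1 edge
times = []
for g in np.unique(streak_id[binary == 1]):
    mask = (streak_id == g) & (binary == 1)
    times.append(time[mask][np.argmax(frequency[mask])])
print(times)   # [3.2, 14.1, 32.1] for the sample table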
Edit: We seem to be using different definitions of the problem.
In the first group, we agree. But, in the second group, the maximum amplitude is about 4.07E-01, which corresponds to a time of about 5.4740E+04.
I've also written code in Pandas:
>>> import pandas as pd
>>> df = pd.read_csv('Gyd9P1rb.txt', sep='\s+', skiprows=2, header=None, names='Row TSTOP PSRTIME DETECTED FDOTMAX AMPLITUDE AMPLITUDE_ERR'.split())
>>> del df['Row']
>>> del df['TSTOP']
>>> del df['FDOTMAX']
>>> del df['AMPLITUDE_ERR']
>>> groups = []
>>> in_group = False
>>> group_number = 1
>>> for b in df['DETECTED']:
... if b:
... if not in_group:
... group_number +=1
... in_group = True
... groups.append(group_number)
... else:
... groups.append(0)
... in_group = False
...
>>> df['groups'] = pd.Series(groups, index=df.index)
>>> df.head()
PSRTIME DETECTED AMPLITUDE groups
0 54695.471283 1 0.466410 2
1 54698.532412 1 0.389607 2
2 54701.520814 1 0.252858 2
3 54704.557583 0 0.103460 0
4 54707.557563 0 0.088215 0
>>> gb = df.groupby(by=df['groups'])
>>> def f(x):
... the_max = x['AMPLITUDE'].idxmax()
... print ( x['groups'][the_max], x['PSRTIME'][the_max])
...
>>> gb.apply(f)
0 58064.3656376
0 58064.3656376
2 54695.4712834
3 54740.4917137
4 54788.477571
5 54836.472922
6 54881.4605511
7 54926.4664883
8 54971.4932866
9 55019.5021472
10 55064.5029133
11 55109.4948108
12 55154.414381
13 55202.488766
14 55247.4721132
15 55292.5301332
16 55340.4728542
17 55385.5229596
18 55430.5332147
19 55478.4812671
20 55523.4894451
21 55568.4626766
22 55616.4630348
23 55661.4969604
24 55709.4504634
25 55754.4711994
26 55799.4736923
27 55844.5050404
28 55892.4699313
29 55937.4721754
30 55985.4677572
31 56030.5119765
32 56075.5517149
33 56168.4447074
34 56213.507484
35 56306.5133063
36 56351.4943058
37 56396.579122
38 56441.5683651
39 56489.5321173
40 56534.4838082
41 56582.469025
42 56627.4135202
43 56672.4926625
44 56720.582296
45 56768.5232469
46 56813.4997925
47 56858.3890558
48 56903.5182596
49 56951.4892721
50 56996.5787435
51 57086.3948136
52 57179.5421833
53 57272.5059448
54 57362.452523
55 57635.5013047
56 57728.4925251
57 57773.5235416
58 57821.5390364
59 57866.5205882
60 57911.5590132
61 57956.5699637
62 58001.4331976
Empty DataFrame
Columns: []
Index: []
The results of the two methods are the same, up to differences in presentation precision.
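Side note: because f only prints, gb.apply ends up returning the empty DataFrame shown above. A sketch of a variant that returns the representative rows instead of printing them (same df and groups column as above) could be:
rep_idx = df[df['groups'] > 0].groupby('groups')['AMPLITUDE'].idxmax()
rep = df.loc[rep_idx, ['groups', 'PSRTIME']]
print(rep)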
I also created a small set of data that would give easily calculable results; it is shown below. The original program performed correctly on it.
0 -1 0 -1
1 0 1 2
2 -1 0 -1
3 -1 0 -1
4 0 1 0
5 1 1 1
6 -1 0 -1
7 -1 0 -1
8 -1 0 -1
9 0 1 4
10 1 1 3
11 2 1 2
12 -1 0 -1
13 -1 0 -1
14 -1 0 -1
15 -1 0 -1
16 0 1 0
17 1 1 1
18 2 1 2
19 3 1 3
20 -1 0 -1
21 -1 0 -1
22 -1 0 -1
23 -1 0 -1
24 -1 0 -1
25 0 1 6
26 1 1 5
27 2 1 4
28 3 1 3
29 4 1 2
30 -1 0 -1
31 -1 0 -1
32 -1 0 -1
33 -1 0 -1
34 -1 0 -1
35 -1 0 -1
36 0 1 0
37 1 1 1
38 2 1 2
39 3 1 3
40 4 1 4
41 5 1 5
41 -1 0 -1
41 -1 0 -1
