Remove consecutive duplicate entries from pandas in each cell - python-3.x

I have a data frame that looks like
d = {'col1': ['a,a,b', 'a,c,c,b'], 'col2': ['a,a,b', 'a,b,b,a']}
pd.DataFrame(data=d)
expected output
d={'col1':['a,b','a,c,b'],'col2':['a,b','a,b,a']}
I have tried like this :
arr = ['a', 'a', 'b', 'a', 'a', 'c','c']
print([x[0] for x in groupby(arr)])
How do I remove the duplicate entries in each row and column of dataframe?
a,a,b,c should be a,b,c

From what I understand, you don't want to include values which repeat in a sequence, you can try with this custom function:
def myfunc(x):
    """Collapse runs of identical neighbours in a comma-separated string.

    The string is split on ',' and an element is kept only when it differs
    from the element directly before it, so 'a,b,b,a' becomes 'a,b,a'.
    Returns the surviving elements joined with ','.
    """
    tokens = pd.Series(x.split(','))
    # shift() moves every element down one position, so this compares each
    # element with its predecessor; the first one is compared with a missing
    # value and therefore always counts as different.
    starts_new_run = tokens != tokens.shift()
    kept = tokens[starts_new_run]
    return ','.join(kept.values)
print(df.applymap(myfunc))
col1 col2
0 a,b a,b
1 a,c,b a,b,a
Another function can be created with itertools.groupby such as :
from itertools import groupby
def myfunc(x):
    """Collapse runs of identical neighbours in a comma-separated string.

    groupby() starts a new group every time the value changes, so taking one
    key per group keeps a single representative of each consecutive run.
    """
    runs = groupby(x.split(','))
    return ','.join(key for key, _members in runs)

You could define a function to help with this, then use .applymap to apply it to all columns (or .apply one column at a time):
d = {'col1': ['a,a,b', 'a,c,c,b'], 'col2': ['a,a,b', 'a,b,b,a']}
df = pd.DataFrame(data=d)
def remove_dups(string):
    """Remove repeated elements from a comma-separated string.

    Every distinct element is kept once, in the position where it first
    appears, so the result is the same on every run.

    Args:
        string: Elements separated by ','.

    Returns:
        The distinct elements joined with ','.
    """
    split = string.split(',')  # split string into a list
    # dict keys are unique and keep insertion order; a set would also drop
    # the duplicates but iterates in arbitrary order.
    uniques = dict.fromkeys(split)
    return ','.join(uniques)  # rejoin the list elements into a string
result = df.applymap(remove_dups)
This returns:
col1 col2
0 a,b a,b
1 a,c,b a,b
Edit: This looks slightly different to your expected output, why do you expect a,b,a for the second row in col2?
Edit2: to preserve the original order, you can replace the set() function with unique_everseen()
from more_itertools import unique_everseen
.
.
.
uniques = unique_everseen(split)

Related

column comprehension robust to missing values

I have only been able to create a two column data frame from a defaultdict (termed output):
df_mydata = pd.DataFrame([(k, v) for k, v in output.items()],
columns=['id', 'value'])
What I would like to be able to do is using this basic format also initiate the dataframe with three columns: 'id', 'id2' and 'value'. I have a separate defined dict that contains the necessary look up info, called id_lookup.
So I tried:
df_mydata = pd.DataFrame([(k, id_lookup[k], v) for k, v in output.items()],
columns=['id', 'id2','value'])
I think I'm doing it right, but I get KeyErrors. I will only know in hindsight whether id_lookup is exhaustive for every key I encounter. For my purposes, simply putting it all together and placing 'N/A' (or something similar) wherever a lookup fails would be acceptable.
Would the above be appropriate for calculating a new column of data using a defaultdict and a simple lookup dict, and how might I make it robust to key errors?
Here is an example of how you could do this:
import pandas as pd
from collections import defaultdict
df = pd.DataFrame({'id': [1, 2, 3, 4],
                   'value': [10, 20, 30, 40]})
id_lookup = {1: 'A', 2: 'B', 3: 'C'}
# Look every id up with dict.get so that ids missing from id_lookup fall
# back to 'N/A' instead of raising KeyError. The values are produced in row
# order, so no per-row loop or index-keyed dictionary is needed.
new_column = [id_lookup.get(key, 'N/A') for key in df['id']]
# Add the looked-up values as a new column in the df
df['id2'] = new_column
# Print the updated DataFrame
print(df)
which gives:
id value id2
0 1 10 A
1 2 20 B
2 3 30 C
3 4 40 N/A
​

Pandas exclude rows based on dynamic condition set from configuration file

As the title suggests, I have a rule engine in XML format which contains column names and the values to exclude.
<ExclusionSet>
<Exclude Excl="Col1:A" Count="1"/>
<Exclude Excl="Col2:BB,BBB" Count="1"/>
<Exclude Excl="Col3:A1B" Count="1"/>
<Exclude Excl="Col1:A2" Excl="Col2:BC" Count="2"/>
</ExclusionSet>
Based on the above conditions I need to exclude rows, i.e. rows where Col1 has value A, rows where Col2 has value BB or BBB, and rows where Col3 has value A1B.
I was able to get working code for the single-condition rules (the first 3), but I am unable to figure out how to implement the last rule (the one with more than one condition).
def exclusionEngine(config,df):
    """Drop the rows of ``df`` that match the ``Exclude`` rules in ``config``.

    ``config`` is handed to ``minidom.parse``. For every ``Exclude`` element
    the ``Excl`` attribute is read as ``column:value1,value2,...`` and rows
    whose ``column`` holds one of those values are removed. Only one ``Excl``
    attribute per element is read, so a rule cannot combine several columns.
    """
    document = minidom.parse(config)
    # Never read afterwards; kept from the original draft.
    exclusion_df = pd.DataFrame()
    for rule in document.getElementsByTagName('Exclude'):
        # Count only decides how many times the same filter is re-applied.
        repetitions = int(rule.attributes['Count'].value)
        for _ in range(repetitions):
            parts = rule.attributes['Excl'].value.split(':')
            column = parts[0]
            excluded_values = parts[1].split(',')
            keep_mask = ~df[column].isin(excluded_values)
            df = df.loc[keep_mask]
    return df
expecting something as follows but dynamic as there could be more conditions or less.
df = df.loc[~(filter1 & filter2)]
EDIT:
To simplify the ask here, is it possible to evaluate multiple conditions dynamically ?
<Exclude Excl="Col1:A2" Excl="Col2:BC" Count="2"/>
You could use the pandas query method. I am dropping all the unrelated XML parsing, as it is not going to work (the last rule repeats the Excl attribute, so the supplied text is not valid XML).
import pandas as pd
import re
def exclusionEngine(config: str, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the rows matched by the rules in ``config``.

    The file is scanned line by line instead of being parsed as XML, because a
    rule may repeat the ``Excl`` attribute and such text is not valid XML.
    Only lines between the opening and the closing ``ExclusionSet`` tag are
    read. Every ``Excl="column:value"`` attribute on a line is one equality
    test, and a row is dropped when it passes all tests of that line. The text
    after the first colon is compared as one literal value (commas are not
    split).

    Args:
        config: Path of the rule file.
        df: Frame to filter; it is copied, not modified.

    Returns:
        The filtered copy of ``df``.
    """
    excl_pattern = re.compile(r'Excl="(.*?)"')
    ret_df = df.copy()
    # Start outside the rule set so that lines preceding <ExclusionSet>
    # (XML declaration, comments, blank lines) are skipped rather than
    # reaching the check below with the flag still unassigned.
    read = False
    with open(config, 'r') as inp:
        for line in inp:
            if '</ExclusionSet>' in line:
                read = False
                continue
            if 'ExclusionSet' in line:
                read = True
                continue
            if not read:
                continue
            matches = excl_pattern.findall(line)
            if not matches:
                # No rule on this line; an empty 'not ()' query would be invalid.
                continue
            conditions = []
            for match in matches:
                # Split on the first colon only, so a value may contain ':'.
                col, x = match.rstrip().split(':', 1)
                conditions.append(f'({col}=="{x}")')
            query = 'not (' + ' & '.join(conditions) + ')'
            ret_df = ret_df.query(query)
    return ret_df
So for example if we have the following dataframe:
df = pd.DataFrame([['A', 'B1', 'C1'], ['O', 'C', 'D'], ['B', 'A', 'C'], ['M', 'BB,BBB', 'A'], ['A2', 'H', 'B'], ['A2', 'BC', 'y']], columns=['Col1', 'Col2', 'Col3'])
Col1 Col2 Col3
0 A B1 C1
1 O C D
2 B A C
3 M BB,BBB A
4 A2 H B
5 A2 BC y
and config is saved inside 'config.xml' then calling exclusionEngine('config.xml', df) returns:
Col1 Col2 Col3
1 O C D
2 B A C
4 A2 H B
I think what you are looking for is this:
df = df.loc[~(filter1 ^ filter2)]
This also will give you the same result:
df.loc[~filter1 & ~filter2]

How to divide multilevel columns in Python

I have a df like this:
arrays = [['bar', 'bar', 'baz', 'baz'],
['one', 'two', 'one', 'two']]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(3, 4), index=['A', 'B', 'C'], columns=index)
df.head()
returning:
I want to add some columns where all second level dimensions are divided by each other - bar one is divided by baz one, and bar two is divided by baz two, etc.
df[["bar"]]/df[["baz"]]
and
df[["bar"]].div(df[["baz"]])
returns NaN's
You can select both levels by only one []:
df1 = df["bar"]/df["baz"]
print (df1)
second one two
A 1.564478 -0.115979
B 14.604267 -19.749265
C -0.511788 -0.436637
If you want a MultiIndex in the columns, add one with MultiIndex.from_product:
df1.columns = pd.MultiIndex.from_product([['new'], df1.columns], names=df.columns.names)
print (df1)
first new
second one two
A 1.564478 -0.115979
B 14.604267 -19.749265
C -0.511788 -0.436637
Another idea for getting a MultiIndex in the output is to use your solution, but rename the top-level columns to the same name on both sides (here new):
df2 = df[["bar"]].rename(columns={'bar':'new'})/df[["baz"]].rename(columns={'baz':'new'})
print (df2)
first new
second one two
A 1.564478 -0.115979
B 14.604267 -19.749265
C -0.511788 -0.436637

compare index and column in data frame with dictionary

I have a dictionary:
d = {'A-A': 1, 'A-B':2, 'A-C':3, 'B-A':5, 'B-B':1, 'B-C':5, 'C-A':3,
'C-B':4, 'C-C': 9}
and a list:
L = [A,B,C]
I have a DataFrame:
df =pd.DataFrame(columns = L, index=L)
I would like to fill each row in df with values from the dictionary, based on the dictionary keys. For example:
A B C
A 1 2 3
B 5 1 5
C 3 4 9
I tried doing that by:
df.loc[L[0]]=[1,2,3]
df.loc[L[1]]=[5,1,5]
df.loc[L[2]] =[3,4,9]
Is there another way to do that, especially when there is a lot of data?
Thank you for help
Here is another way that I can think of:
import numpy as np
import pandas as pd
# given
d = {'A-A': 1, 'A-B':2, 'A-C':3, 'B-A':5, 'B-B':1, 'B-C':5, 'C-A':3,
     'C-B':4, 'C-C': 9}
L = ['A', 'B', 'C']
# look every value up by its 'row-column' key, row by row, so the result
# does not depend on the order in which the dictionary was filled
z = np.asarray([d[row + '-' + col] for row in L for col in L])
# reshape the flat array to one row and one column per label in L
z_new = np.reshape(z, (len(L), len(L)))
# copy it into your DataFrame
df = pd.DataFrame(z_new, columns = L, index=L)
This should do the trick, though it's probably not the best way:
for index in L:
prefix = index + "-"
df.loc[index] = [d.get(prefix + column, 0) for column in L]
Calculating the prefix separately beforehand is probably slower for a small list and probably faster for a large list.
Explanation
for index in L:
This iterates through all of the row names.
prefix = index + "-"
All of the keys for each row start with index + "-", e.g. "A-", "B-"… etc..
df.loc[index] =
Set the contents of the entire row.
[ for column in L]
The same as your comma thing ([1, 2, 3]) just for an arbitrary number of items. This is called a "list comprehension".
d.get( , 0)
This is the same as d[ ] but returns 0 if it can't find anything.
prefix + column
Sticks the column on the end, e.g. "A-" gives "A-A", "A-B"…

Merge and then sort columns of a dataframe based on the columns of the merging dataframe

I have two dataframes, both indexed with timestamps. I would like to preserve the order of the columns in the first dataframe that is merged.
For example:
#required packages
import pandas as pd
import numpy as np
# defining stuff
num_periods_1 = 11
num_periods_2 = 4
# create sample time series
dates1 = pd.date_range('1/1/2000 00:00:00', periods=num_periods_1, freq='10min')
dates2 = pd.date_range('1/1/2000 01:30:00', periods=num_periods_2, freq='10min')
column_names_1 = ['C', 'B', 'A']
column_names_2 = ['B', 'C', 'D']
df1 = pd.DataFrame(np.random.randn(num_periods_1, len(column_names_1)), index=dates1, columns=column_names_1)
df2 = pd.DataFrame(np.random.randn(num_periods_2, len(column_names_2)), index=dates2, columns=column_names_2)
df3 = df1.merge(df2, how='outer', left_index=True, right_index=True, suffixes=['_1', '_2'])
print("\nData Frame Three:\n", df3)
The above code generates two data frames the first with columns C, B, and A. The second dataframe has columns B, C, and D. The current output has the columns in the following order; C_1, B_1, A, B_2, C_2, D. What I want the columns from the output of the merge to be C_1, C_2, B_1, B_2, A_1, D_2. The order of the columns is preserved from the first data frame and any data similar to the second data frame is added next to the corresponding data.
Could there be a setting in merge or can I use sort_index to do this?
EDIT: Maybe a better way to phrase the sorting process would be to call it uncollated. Where each column is put together and so on.
Using an OrderedDict, as you suggested.
from collections import OrderedDict
from itertools import chain
c = df3.columns.tolist()
# Bucket the column names by the text before the first underscore; the
# buckets stay in the order in which each prefix is first seen.
o = OrderedDict()
for x in c:
    o.setdefault(x.partition('_')[0], []).append(x)
# Flatten the buckets back into one column order and apply it.
c = [name for bucket in o.values() for name in bucket]
df3 = df3[c]
An alternative that involves extracting the prefixes and then calling sorted on the index.
# https://stackoverflow.com/a/46839182/4909087
# Rank every column by where its prefix (the text before '_') first appears
# in c, then by its full name so that '_1' comes before '_2'. Splitting on
# '_' rather than taking the first character also handles longer names.
p = [s.split('_')[0] for s in c]
c = sorted(c, key=lambda x: (p.index(x.split('_')[0]), x))
# The merged frame in this example is df3; there is no df.
df3 = df3[c]

Resources