In python pandas why can't I add more levels to a MultiIndex? - python-3.x

I would like to create DataFrames that have three levels. Why does the following function not work twice?
# Rebuilds `dataframe` with its columns nested under one new top-level label, `multi_index_name`.
def superGroup(dataframe=None,multi_index_name=None):
# The new columns are the product of [multi_index_name] and the existing column index;
# the values and the row index are reused unchanged.
out_dataframe = pd.DataFrame(dataframe.values,index=dataframe.index,columns=pd.MultiIndex.from_product([[multi_index_name],dataframe.columns]))
return out_dataframe
# One-column frame holding three random values.
ran = pd.DataFrame(np.random.rand(3),columns=["Random"])
# First call: the input has a single-level column index.
ran2 = superGroup(ran,"Hello")
# Second call: the input columns are already the MultiIndex produced by the first call.
superGroup(ran2,"World")#Does not work
>>>[Out]: NotImplementedError: isnull is not defined for MultiIndex

Here is a solution I figured out after spending way too much time on this problem. Hope that it helps those out there that have had the same problem.
def superGroup(dataframe=None, new_level=None):
    """Return a copy of ``dataframe`` with its columns nested under ``new_level``.

    Works for frames whose columns are a plain index (of any Index subclass)
    as well as frames whose columns are already a MultiIndex, so it can be
    applied repeatedly to add one level at a time.

    Parameters
    ----------
    dataframe : DataFrame
        Frame whose columns should receive an additional outermost level.
    new_level : str
        Label used for every column in the new outermost level.

    Returns
    -------
    out_df : DataFrame
        New frame with the same values and row index; each column key is
        the original key prefixed with ``new_level``.
    """
    columns = dataframe.columns
    # Prefix every existing column with the new label. Building the index
    # from per-column arrays (rather than from the product of the levels)
    # keeps each label attached to its own data column, even when the
    # existing columns are unsorted or are not a full Cartesian product.
    arrays = [[new_level] * len(columns)]
    # A plain Index reports nlevels == 1, so this loop covers both cases.
    arrays.extend(columns.get_level_values(i) for i in range(columns.nlevels))
    new_columns = pd.MultiIndex.from_arrays(arrays)
    out_df = pd.DataFrame(dataframe.values, index=dataframe.index, columns=new_columns)
    return out_df

Related

Passing DF from function

I wrote a function that builds a DataFrame inside it, and I want to use that DataFrame afterwards, either outside the function or in another function. How can I do this without the name being unrecognized?
Thanks a lot :)
The code:
# Asks for a file via a dialog, reads it with pd.read_excel and shows the rows in a ttk.Treeview.
def DisplayDataFrame():
file_path = filedialog.askopenfilename()
df1 = pd.read_excel(file_path)
cols = list(df1.columns)
# `root` is not defined in this snippet; presumably the Tk root window — confirm.
tree = ttk.Treeview(root)
tree.pack()
tree["columns"] = cols
# Configure one tree column and one heading per DataFrame column, both with anchor "w".
for i in cols:
tree.column(i, anchor="w")
tree.heading(i, text=i, anchor='w')
# Every row is inserted at position 0, with the row's index label as the item text.
for index, row in df1.iterrows():
tree.insert("", 0, text=index, values=list(row))
# NOTE(review): df1.index is called here as if it were a method — confirm this is intended.
option = df1.index()
# Nothing is returned: df1 and option exist only inside this function.
Do you mean you want to use df1 from your DisplayDataFrame() in other functions? If so, you can add return df1 to your function, like this:
def DisplayDataFrame():
'''
your original code that defines df1
'''
# Hand the frame back to the caller instead of leaving it local to the function.
return df1
# Bind the returned frame to a name so it can be passed to other functions.
dataframe = DisplayDataFrame()
Then you can reuse the dataframe in other functions.

Optimizing Pyspark UDF on large data

I am trying to optimize this code that creates a dummy when the column's value (of a pyspark dataframe) is in [categories].
When run on 100K rows, it takes about 30 seconds. In my case I have around 20M rows, which will take a lot of time.
# Adds a 0/1 column named str(top_name) that is 1 where the value of `col_name` is in `categories`,
# then selects the columns in options["lst_tmp_col"] (when given) followed by the new column.
def create_dummy(dframe,col_name,top_name,categories,**options):
lst_tmp_col = []
if 'lst_tmp_col' in options:
lst_tmp_col = options["lst_tmp_col"]
# The membership test runs as a Python lambda wrapped in a UDF that returns IntegerType.
udf = UserDefinedFunction(lambda x: 1 if x in categories else 0, IntegerType())
# The frame with the new column is cached before the final select.
dframe = dframe.withColumn(str(top_name), udf(col(col_name))).cache()
dframe = dframe.select(lst_tmp_col+ [str(top_name)])
return dframe
In other words, how do I optimize this function and cut down the total time, given the volume of my data? And how can I make sure that this function does not iterate over my data?
Appreciate your suggestions. Thanks
You don't need a UDF for encoding the categories. You can use isin:
import pyspark.sql.functions as F
def create_dummy(dframe, col_name, top_name, categories, **options):
    """Flag rows whose ``col_name`` value is in ``categories`` with a 0/1 column.

    The flag column is named ``str(top_name)``. The returned frame contains
    the columns listed in ``options["lst_tmp_col"]`` (none when that option
    is absent) followed by the flag column.
    """
    dummy_name = str(top_name)
    kept_columns = options["lst_tmp_col"] if 'lst_tmp_col' in options else []
    # Built-in column expression: membership test cast to an integer flag.
    is_member = F.col(col_name).isin(categories).cast("int")
    flagged = dframe.withColumn(dummy_name, is_member).cache()
    return flagged.select(kept_columns + [dummy_name])

multiple nested functions output

I'm trying to get the result of multiple functions as nested functions from a dataframe
For example, 2 functions:
# Stores the row-wise maximum of 'end_value_carr' and 'arr' in df['carr'] and returns the same df.
def carr(df):
df['carr'] = df[['end_value_carr','arr']].max(axis=1)
return df
# Parses three columns with pd.to_datetime and overwrites each with its .dt.date values; returns the same df.
def date(df):
df['date_id'] = pd.to_datetime(df['date_id']).dt.date
df['renewal_date'] = pd.to_datetime(df['renewal_date']).dt.date
df['next_renewal_date'] = pd.to_datetime(df['next_renewal_date']).dt.date
return df
When I use each one separately I get the right output
However, trying to have them nested in one function gives me a NoneType:
# Copies `data`, defines two helper functions, and returns a frame.
def cleanup(data):
df = data.copy()
# carr and date are only defined below; no line in this snippet calls either of them.
def carr(df):
df['carr'] = df[['end_value_carr','arr']].max(axis=1)
return df
def date(df):
df['date_id'] = pd.to_datetime(df['date_id']).dt.date
df['renewal_date'] = pd.to_datetime(df['renewal_date']).dt.date
df['next_renewal_date'] = pd.to_datetime(df['next_renewal_date']).dt.date
return df
# NOTE(review): indentation was lost; presumably this final return belongs to cleanup itself — confirm.
return df
Appreciate your help!
Thanks
Define all three functions separately
def carr(df):
    """Write the row-wise maximum of ``end_value_carr`` and ``arr`` to ``df['carr']``.

    The frame is modified in place and the same object is returned.
    """
    candidates = df[['end_value_carr', 'arr']]
    df['carr'] = candidates.max(axis=1)
    return df
def date(df):
    """Convert the three date columns of ``df`` to ``datetime.date`` values.

    ``date_id``, ``renewal_date`` and ``next_renewal_date`` are parsed with
    ``pd.to_datetime`` and overwritten, in that order. The frame is modified
    in place and the same object is returned.
    """
    for column in ('date_id', 'renewal_date', 'next_renewal_date'):
        parsed = pd.to_datetime(df[column])
        df[column] = parsed.dt.date
    return df
Call the first two functions in your third one.
def cleanup(data):
    """Return a processed copy of ``data``, leaving ``data`` itself untouched.

    The copy is passed through ``carr`` first and ``date`` second.
    """
    return date(carr(data.copy()))
Then you can call your cleanup function, which will call carr and date on its own.
df = cleanup(df)

How to apply a function fastly on the list of DataFrame in Python?

I have a list of DataFrames with equal length of columns and rows but different values, such as
data = [df1, df2,df3.... dfn] .
How can I apply a function to each DataFrame in the list data? I used the following code, but it does not work:
data = [df1, def2,df3.... dfn]
# Adds a 'loc_max' column of zeros and sets it to 1 where 'q_value' is >= both of its neighbours.
def maxloc(data):
data['loc_max'] = np.zeros(len(data))
# The range excludes the first and last positions, since each entry is compared with i-1 and i+1.
for i in range(1,len(data)-1): #from the second value on
if data['q_value'][i] >= data['q_value'][i-1] and data['q_value'][i] >= data['q_value'][i+1]:
# NOTE(review): assignment through chained indexing (data[...][i]); may not update `data` itself — confirm.
data['loc_max'][i] = 1
return data
df_list = [df.pipe(maxloc) for df in data]
Seems to me the problem is in your maxloc() function as this code works.
I added also the maximum value in the return of maxloc.
from random import randrange
import pandas as pd
def maxloc(data_frame):
    """Find the maximum of the ``'Value'`` column.

    Returns
    -------
    tuple
        ``(label, value)`` where ``label`` is the index label reported by
        ``idxmax`` and ``value`` is the column entry at that label.
    """
    values = data_frame['Value']
    peak_label = values.idxmax(0)
    return peak_label, values[peak_label]
# create test list of data-frames
# Five frames, each with ten random integers below 100 in a 'Value' column.
data = [
    pd.DataFrame({'Value': [randrange(100) for _ in range(10)]}, index=(range(10)))
    for _ in range(5)
]
# maxloc yields an (index, value) pair per frame.
df_list = [frame.pipe(maxloc) for frame in data]
for position, (index, value) in enumerate(df_list):
    print(f"Data-frame {position:02d}: maximum = {value} at position {index}")

How do I add a dynamic list of variable to the command pd.concat

I am using python3 and pandas to create a script that will:
Be dynamic across different dataset lengths(rows) and unique values - completed
Take unique values from column A and create separate dataframes as variables for each unique entry - completed
Add totals to the bottom of each dataframe - completed
Concatenate the separate dataframes back together - incomplete
The issue is that I am unable to find a way to build a list of the variables in use and pass them as arguments to pd.concat.
The sample dataset. The dataset may have more unique BrandFlavors or less which is why the script must be flexible and dynamic.
Script:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore')
# Load the workbook and sort by the 'This' column, largest first.
excel_file = ('testfile.xlsx')
df = pd.read_excel(excel_file)
df = df.sort_values(by='This', ascending=False)
# Unique values of the first column decide how the data is split.
colarr = df.columns.values
arr = df[colarr[0]].unique()
# Store one sub-frame per unique value under the global names var0, var1, ...
for i in range(len(arr)):
globals()['var%s' % i] = df.loc[df[colarr[0]] == arr[i]]
# For every non-empty sub-frame, append a 'Total' row with the column sums and the percentage change.
for i in range(len(arr)):
if globals()['var%s' % i].empty:
''
else:
globals()['var%s' % i] = globals()['var%s' % i].append({'BrandFlavor':'Total',
'This':globals()['var%s' % i]['This'].sum(),
'Last':globals()['var%s' % i]['Last'].sum(),
'Diff':globals()['var%s' % i]['Diff'].sum(),
'% Chg':globals()['var%s' % i]['Diff'].sum()/globals()['var%s' % i]['Last'].sum() * 100}, ignore_index=True)
# Missing '% Chg' values become 0; any other missing value becomes a single space.
globals()['var%s' % i]['% Chg'].fillna(0, inplace=True)
globals()['var%s' % i].fillna(' ', inplace=True)
I have tried this below, however the list is a series of strings
# Collects the strings 'var0', 'var1', ... — the names only, not the DataFrames stored under them.
vararr = []
count = 0
for x in range(len(arr)):
vararr.append('var' + str(count))
count = count + 1
# vararr is wrapped in another list here, so concat is handed a list containing a list of strings.
df = pd.concat([vararr])
pd.concat does not recognize a string. I tried to build a class with an arg defined but had the same issue.
The desired outcome would be a code snippet that generated a list of variables that matched the ones created by lines 9/10 and could be referenced by pd.concat([list, of, vars, here]). It must be dynamic. Thank you
Just fixing the issue at hand, you shouldn't use globals to make variables, that is not considered good practice. Your code should work with some minor modifications.
import pandas as pd
import warnings
warnings.simplefilter(action='ignore')
# Load the workbook and sort by the 'This' column, largest first.
excel_file = ('testfile.xlsx')
df = pd.read_excel(excel_file)
df = df.sort_values(by='This', ascending=False)
def good_dfs(dataframe):
    """Return ``dataframe`` with a 'Total' row appended.

    The totals row holds the sums of the ``This``, ``Last`` and ``Diff``
    columns, ``'Total'`` in ``BrandFlavor``, and the percentage change
    (``Diff`` sum divided by ``Last`` sum, times 100) in ``Pct Change``.
    Missing ``Pct Change`` values are filled with ``0.0`` and every other
    missing value with a single space.

    Parameters
    ----------
    dataframe : DataFrame
        Frame with ``This``, ``Last`` and ``Diff`` columns.

    Returns
    -------
    DataFrame
        The input itself when it is empty; otherwise a new frame with a
        fresh 0..n index. The input frame is not modified.
    """
    if dataframe.empty:
        return dataframe

    last = dataframe['Last'].sum()
    diff = dataframe['Diff'].sum()
    total_row = pd.DataFrame([{
        'BrandFlavor': 'Total',
        'This': dataframe['This'].sum(),
        'Last': last,
        'Diff': diff,
        'Pct Change': diff / last * 100,
    }])
    # Concatenation returns a new frame, so the result must be kept; the
    # totals row would otherwise be lost and 'Pct Change' would not exist.
    result = pd.concat([dataframe, total_row], ignore_index=True)
    # Assign the filled column back instead of filling in place, so that a
    # frame sliced out of a larger one is never written through.
    result['Pct Change'] = result['Pct Change'].fillna(0.0)
    return result.fillna(' ')
colarr = df.columns.values
arr = df[colarr[0]].unique()
# One sub-frame per distinct value of the first column.
dfs = [df.loc[df[colarr[0]] == value] for value in arr]
# Run every sub-frame through good_dfs, then stitch the results back together.
final_dfs = list(map(good_dfs, dfs))
final_df = pd.concat(final_dfs)
Although I will say, there are far easier ways to accomplish what you want without doing all of this, however that can be a separate question.

Resources