How to pass a Pandas dataframe as argument to function through apply - python-3.x

I have a custom function as below to do something.
def f(x):
    """Set column 'A' of the group frame ``x`` to '123' and return ``x``.

    The frame is modified in place; the same object is returned so the
    function can be used with ``groupby(...).apply``.
    """
    x['A'] = '123'
    return x
df = df.groupby(level=0).apply(f)
Now, I would like to change the function as
def f(x):
    """Set column 'A' of ``x`` to '123', tag ``df2`` and return ``x``.

    ``df2`` is not a parameter: it is looked up in the enclosing (module)
    scope, and its 'name' column is overwritten on every call.
    """
    x['A'] = '123'
    df2['name'] = 'ABC'  # side effect on the module-level df2
    return x
How to pass the dataframe df2 as an argument to apply?
Does it work? df = df.groupby(level=0).apply(f, args = df2)

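df = df.groupby(level=0).apply(f, args = df2) - this will give the error "TypeError: f() got an unexpected keyword argument 'args'", because apply forwards extra keyword arguments straight to f.
Correct solution: remove args and pass df2 positionally (f must then accept it as a second parameter, i.e. def f(x, df2)); this solves the error.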
df = df.groupby(level=0).apply(f, df2)

Related

Passing DF from function

I wrote a function which builds a df inside it, and I want to use that df afterwards outside the function or in another function. How can I do it without the name not being recognized?
Thanks a lot :)
The code:
def DisplayDataFrame():
    """Ask the user for an Excel file and show its rows in a Treeview.

    Uses ``filedialog``, ``pd``, ``ttk`` and ``root`` from the enclosing
    scope. The frame is only held in the local ``df1``; nothing is returned.
    """
    file_path = filedialog.askopenfilename()
    df1 = pd.read_excel(file_path)
    cols = list(df1.columns)

    tree = ttk.Treeview(root)
    tree.pack()
    tree["columns"] = cols
    for i in cols:
        tree.column(i, anchor="w")
        tree.heading(i, text=i, anchor='w')

    # Each row is inserted at position 0, i.e. above the previously added one.
    for index, row in df1.iterrows():
        tree.insert("", 0, text=index, values=list(row))

    # `index` is an attribute, not a method: `df1.index()` raises TypeError.
    option = df1.index
Do you mean you want to use df1 from your DisplayDataFrame() in other functions? If so, add return df1 at the end of your function like this:
def DisplayDataFrame():
    """Build ``df1`` as in the original function and return it to the caller."""
    # ... your original code that defines df1 goes here ...
    return df1
dataframe = DisplayDataFrame()
Then you can reuse the dataframe in other functions.

multiple nested functions output

I'm trying to get the result of multiple functions as nested functions from a dataframe
For example, 2 functions:
def carr(df):
    """Add a 'carr' column: the row-wise max of 'end_value_carr' and 'arr'.

    ``df`` is modified in place and also returned.
    """
    df['carr'] = df[['end_value_carr', 'arr']].max(axis=1)
    return df
def date(df):
    """Convert the three date columns of ``df`` to ``datetime.date`` values.

    'date_id', 'renewal_date' and 'next_renewal_date' are parsed with
    ``pd.to_datetime`` and reduced to their date part. ``df`` is modified in
    place and also returned.
    """
    for column in ('date_id', 'renewal_date', 'next_renewal_date'):
        df[column] = pd.to_datetime(df[column]).dt.date
    return df
When I use each one separately I get the right output
However, trying to have them nested in one function gives me a NoneType:
def cleanup(data):
    """Return a copy of ``data``.

    NOTE: as laid out here, the nested ``carr`` and ``date`` helpers are only
    defined; nothing in this function calls them, so the copy is returned
    without either transformation applied.
    """
    df = data.copy()

    def carr(df):
        df['carr'] = df[['end_value_carr', 'arr']].max(axis=1)
        return df

    def date(df):
        df['date_id'] = pd.to_datetime(df['date_id']).dt.date
        df['renewal_date'] = pd.to_datetime(df['renewal_date']).dt.date
        df['next_renewal_date'] = pd.to_datetime(df['next_renewal_date']).dt.date
        return df

    return df
Appreciate your help!
Thanks
Define all three functions separately
def carr(df):
    """Store the row-wise max of 'end_value_carr' and 'arr' in a 'carr' column.

    The frame passed in is updated in place and returned.
    """
    df['carr'] = df[['end_value_carr', 'arr']].max(axis=1)
    return df
def date(df):
    """Reduce 'date_id', 'renewal_date' and 'next_renewal_date' to dates.

    Each column is parsed with ``pd.to_datetime`` and replaced by its
    ``.dt.date`` values. The frame is updated in place and returned.
    """
    for column in ('date_id', 'renewal_date', 'next_renewal_date'):
        df[column] = pd.to_datetime(df[column]).dt.date
    return df
Call the first two functions in your third one.
def cleanup(data):
    """Apply ``carr`` and then ``date`` to a copy of ``data`` and return it.

    Working on ``data.copy()`` keeps the caller's frame untouched, since both
    helpers write their columns in place.
    """
    df = data.copy()
    df = carr(df)
    df = date(df)
    return df
Then you can call your cleanup function, which will call carr and date on its own.
df = cleanup(df)

How to resolve the error dataframe not callable

Count = df.groupby("user_id", as_index=False).count()
Mean = df.groupby("user_id", as_index=False).mean()
# The per-user counts live in `Count`; lowercase `count` is not defined.
dfMerged = pd.merge(df, Count, how='right', on=['user_id'])
dfMerged()  # TypeError: 'DataFrame' object is not callable
dfMerged is not a function.
When you create like this:
def dfMerged():
    """Print a fixed message; shows what a callable named dfMerged looks like."""
    print('This is function')


dfMerged()  # prints: This is function
Then it's a function.
Count = df.groupby("user_id", as_index=False).count()
Mean = df.groupby("user_id", as_index=False).mean()
# Merge against `Count` (the frame built above); lowercase `count` is undefined.
dfMerged = pd.merge(df, Count, how='right', on=['user_id'])
dfMerged.head(10)  # a DataFrame is inspected through its methods, not called

Can we use keyword arguments in UDF

My question is whether we can use keyword arguments with a UDF in PySpark, as I did below. The conv method has a keyword argument conv_type, which by default is assigned to a specific type of formatter; however, I want to specify a different formatter in some places. This does not get through in the udf because of the keyword argument. Is there a different approach to using a keyword argument here?
from datetime import datetime as dt, timedelta as td,date
tpid_date_dict = {'69': '%d/%m/%Y', '62': '%Y/%m/%d', '70201': '%m/%d/%y', '66': '%d.%m.%Y', '11': '%d-%m-%Y', '65': '%Y-%m-%d'}
def date_formatter_based_on_id(column, date_format):
    """Re-render a 'YYYY-MM-DD' value using ``date_format``.

    ``column`` is converted with ``str()`` before parsing, so anything whose
    string form is 'YYYY-MM-DD' is accepted.

    Raises:
        ValueError: if ``str(column)`` does not match '%Y-%m-%d'.
    """
    return dt.strptime(str(column), '%Y-%m-%d').strftime(date_format)
def generic_date_formatter(column, date_format):
    """Parse ``column`` with ``date_format`` and return it as 'YYYY-MM-DD'.

    ``column`` is converted with ``str()`` before parsing.

    Raises:
        ValueError: if ``str(column)`` does not match ``date_format``.
    """
    return dt.strptime(str(column), date_format).strftime('%Y-%m-%d')
def conv(column, id, conv_type=date_formatter_based_on_id):
    """Format ``column`` with the date format registered for ``id``.

    The format string is looked up in ``tpid_date_dict`` and handed to
    ``conv_type`` together with ``column``.

    Returns:
        ``None`` when ``column`` is falsy; the value produced by ``conv_type``
        on success; ``column`` unchanged when ``id`` has no registered format
        or ``conv_type`` raises.
    """
    try:
        date_format = tpid_date_dict[id]
    except KeyError:
        print("Key value not found!")
        date_format = None

    if not column:
        return None
    if date_format is None:
        # No format to apply: return the value untouched instead of relying on
        # an unbound-variable error being swallowed further down.
        return column
    try:
        return conv_type(column, date_format)
    except Exception:
        # Deliberate best effort: conv_type is pluggable, so any failure falls
        # back to the raw value.
        return column
conv_func = functions.udf(conv, StringType())
date_formatted = renamed_cols.withColumn("check_in_std",
conv_func(functions.col("check_in"), functions.col("id"),
generic_date_formatter))
So the problem is with the last statement (date_formatted = renamed_cols.withColumn("check_in_std",
conv_func(functions.col("check_in"), functions.col("id"),
generic_date_formatter))),
since the third argument, generic_date_formatter, is meant for the keyword parameter conv_type.
On trying this I get following error:
AttributeError: 'function' object has no attribute '_get_object_id'
Unfortunately you cannot use udf with keyword arguments. UserDefinedFunction.__call__ is defined with positional arguments only:
def __call__(self, *cols):
    """Call ``self._judf.apply`` on ``cols`` and wrap the result in a Column.

    Only positional arguments are accepted (``*cols``); the signature has no
    ``**kwargs``. Each entry is passed through ``_to_java_column``.
    """
    judf = self._judf
    sc = SparkContext._active_spark_context
    return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
but the problem you have is not really related to keyword arguments. You get exception because generic_date_formatter is not a Column object but a function.
You can create udf dynamically:
def conv(conv_type=date_formatter_based_on_id):
    """Build a StringType UDF whose formatter is fixed to ``conv_type``.

    The formatter is captured by the closure, so the resulting UDF only takes
    the two column arguments (value and id).
    """
    def _(column, id):
        try:
            date_format = tpid_date_dict[id]
        except KeyError:
            print("Key value not found!")
            date_format = None

        if not column:
            return None
        if date_format is None:
            # Unknown id: return the value untouched rather than depending on
            # an unbound-variable error being swallowed below.
            return column
        try:
            return conv_type(column, date_format)
        except Exception:
            # Best effort: any formatter failure falls back to the raw value.
            return column

    # Qualified like the other calls on this page (functions.udf / functions.col).
    return functions.udf(_, StringType())
which can be called:
conv(generic_date_formatter)(functions.col("check_in"), functions.col("id"))
Check Passing a data frame column and external list to udf under withColumn for details.

pandas dataframe output need to be a string instead of a list

I have a requirement that the result value should be a string. But when I calculate the maximum value of dataframe it gives the result as a list.
import pandas as pd
def answer_one():
    """Return the index label of the max '# Summer' value, wrapped in a list.

    Reads the module-level ``df``. The surrounding ``[...]`` is what makes the
    result a list rather than the bare label.
    """
    df_copy = [df['# Summer'].idxmax()]
    return (df_copy)
df = pd.read_csv('olympics.csv', index_col=0, skiprows=1)

# Give the medal columns readable names: a leading '01'/'02'/'03' becomes
# Gold/Silver/Bronze (the first four characters are dropped), and a leading
# '№' becomes '#'. All renames are collected and applied in one call.
medal_names = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
new_names = {}
for col in df.columns:
    if col[:2] in medal_names:
        new_names[col] = medal_names[col[:2]] + col[4:]
    elif col[:1] == '№':
        new_names[col] = '#' + col[1:]
df.rename(columns=new_names, inplace=True)

# Raw string: '\s' and '\(' are regex escapes, not string escapes.
names_ids = df.index.str.split(r'\s\(')
df.index = names_ids.str[0] # the [0] element is the country name (new index)
df['ID'] = names_ids.str[1].str[:3] # the [1] element is the abbreviation or ID (take first 3 characters from that)
df = df.drop('Totals')
df.head()
answer_one()
But here answer_one() gives me a list as output and not a string. Can someone tell me how this can be converted to a string, or how I can get the answer directly from the dataframe as a string? I don't want to convert the list to a string using str(df_copy).
Your first solution would be, as @juanpa.arrivillaga put it, to not wrap it in a list. Your function becomes:
def answer_one():
    """Return the index label of the row with the largest '# Summer' value.

    Reads the module-level ``df``; the label is returned as is, not in a list.
    """
    return df['# Summer'].idxmax()
>>> 1
Another thing you might not be expecting: idxmax() returns the index of the max, not the max itself. Perhaps you want to do:
def answer_one():
    """Return the largest value in the '# Summer' column of the module-level ``df``."""
    return df['# Summer'].max()
>>> 30
Since you don't want to do str(df_copy) you can do df_copy.astype(str) instead.
Here is how I would write your function:
def get_max_as_string(data, column_name):
""" Return Max Value from a column as a string."""
return data[column_name].max().astype(str)
get_max_as_string(df, '# Summer')
>>> '30'

Resources