Dynamic Columns .withColumn Python DataFrame - python-3.x

I want to apply .withColumn dynamically on my Spark DataFrame, with the column names coming from a list:
from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType

def get_dtype(dataframe, colname):
    return [dtype for name, dtype in dataframe.dtypes if name == colname][0]

def get_matches(dataframe):
    return [x for x in dataframe.columns if get_dtype(dataframe, x) == 'tinyint']

matches = get_matches(srcpartyaddressDF)
matches
The above code gives me the list of columns whose datatype is 'tinyint'.
Result:
Out[67]: ['verified_flag', 'standard_flag', 'overseas_flag', 'active']
Now I want to apply the cast below to each column in the matches list, dynamically:
partyaddressDF = srcpartyaddressDF \
    .withColumn("verified_flag", col("verified_flag").cast(BooleanType())) \
    .withColumn("standard_flag", col("standard_flag").cast(BooleanType())) \
    .withColumn("overseas_flag", col("overseas_flag").cast(BooleanType())) \
    .withColumn("active", col("active").cast(BooleanType()))
How can this be achieved in Python 3?

You can do something like this:
# the import is only needed on Python 3; on Python 2 reduce is a builtin
from functools import reduce

def do_cast(df, cl):
    return df.withColumn(cl, col(cl).cast(BooleanType()))

matches = ['verified_flag', 'standard_flag', 'overseas_flag', 'active']
partyaddressDF = reduce(do_cast, matches, srcpartyaddressDF)
Basically, reduce takes the initial value (srcpartyaddressDF) and applies do_cast to it with the first item from the list (a column name), then takes the second value from the list and applies do_cast to the result of the first call, then the third value, and so on.
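If you prefer not to use reduce, a plain loop that rebinds the DataFrame variable is equivalent (a minimal sketch, assuming srcpartyaddressDF and matches exist as above):
from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType

partyaddressDF = srcpartyaddressDF
for cl in matches:
    # each iteration returns a new DataFrame with one more column cast to boolean
    partyaddressDF = partyaddressDF.withColumn(cl, col(cl).cast(BooleanType()))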

Related

Spark keeping words in column that match a list

I currently have a list and a Spark dataframe:
['murder', 'violence', 'flashback', 'romantic', 'cult', 'revenge', 'psychedelic', 'comedy', 'suspenseful', 'good versus evil']
I am having a tough time figuring out a way to create a new column in the dataframe that takes the first matching word from the tags column for each row and puts it in the newly created column for that row.
For example, let's say the first row in the tags column has only "murder" in it; I would want that to show in the new column. Then, if the next row had "boring", "silly" and "cult" in it, I would want it to show "cult" in the new column, since it matches the list. If the last row in the tags column had "revenge" and "cult" in it, I would want it to show only "revenge", since it's the first word that matches the list.
from pyspark.sql import functions as F
df = spark.createDataFrame([('murder',), ('boring silly cult',), ('revenge cult',)], ['tags'])
mylist = ['murder', 'violence', 'flashback', 'romantic', 'cult', 'revenge', 'psychedelic', 'comedy', 'suspenseful', 'good versus evil']
pattern = '|'.join([f'({x})' for x in mylist])
df = df.withColumn('first_from_list', F.regexp_extract('tags', pattern, 0))
df.show()
# +-----------------+---------------+
# | tags|first_from_list|
# +-----------------+---------------+
# | murder| murder|
# |boring silly cult| cult|
# | revenge cult| revenge|
# +-----------------+---------------+
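Note that regexp_extract returns an empty string when nothing in the pattern matches, so rows with no tag from the list simply end up with '' in the new column (a small check, assuming the same df schema and pattern as above):
df2 = spark.createDataFrame([('boring silly',)], ['tags'])
df2 = df2.withColumn('first_from_list', F.regexp_extract('tags', pattern, 0))
# first_from_list is '' for this row, since no word matches the list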
You could use a PySpark UDF (User Defined Function).
First, let's write a Python function that finds the first match between a list (in this case the list you provided) and a string, that is, the value of the tags column:
def find_first_match(tags):
    first_match = ''
    genres = ['murder', 'violence', 'flashback', 'romantic', 'cult', 'revenge', 'psychedelic', 'comedy', 'suspenseful', 'good versus evil']
    for tag in tags.split():
        for genre in genres:
            if tag == genre:
                # return immediately so the first match is kept, not the last one
                return tag
    return first_match
Then, we need to convert this function into a PySpark udf so that we can use it in combination with the .withColumn() operation:
from pyspark.sql.functions import udf

find_first_matchUDF = udf(lambda z: find_first_match(z))
Now we can apply the udf function to generate a new column. Assuming df is the name of your DataFrame:
from pyspark.sql.functions import col
new_df = df.withColumn("first_match", find_first_matchUDF(col("tags")))
This approach only works if all tags in your tags column are separated by white spaces.
P.S.
You can avoid the second step by using the @udf decorator:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def find_first_match(tags):
    first_match = ''
    genres = ['murder', 'violence', 'flashback', 'romantic', 'cult', 'revenge', 'psychedelic', 'comedy', 'suspenseful', 'good versus evil']
    for tag in tags.split():
        for genre in genres:
            if tag == genre:
                # return immediately so the first match is kept, not the last one
                return tag
    return first_match
new_df = df.withColumn("first_match", find_first_match(col("tags")))
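As a quick sanity check (assuming the same toy df from the first answer), the new column should contain the first listed word per row:
new_df.show()
# expected: 'murder' -> murder, 'boring silly cult' -> cult, 'revenge cult' -> revenge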

How to average groups of columns

Given the following pandas dataframe:
I am trying to get to point b (shown in image 2), where I want to use the row 'class' to identify column names and average the columns that share the same class. I have been trying to use setdefault to create a dictionary, but I am not having much luck. I aim to achieve the final result shown in fig 2.
Since this is a representative example (the actual dataframe is huge), please let me know of a loop-based approach if possible.
Any help or pointers in the right direction is immensely appreciated.
Imports and Test DataFrame
import pandas as pd
from string import ascii_lowercase # for test data
import numpy as np # for test data
np.random.seed(365)
df = pd.DataFrame(np.random.rand(5, 6) * 1000, columns=list(ascii_lowercase[:6]))
df.index.name = 'Class'
a b c d e f
Class
0 941.455743 641.602705 684.610467 588.562066 543.887219 368.070913
1 766.625774 305.012427 442.085972 110.443337 438.373785 752.615799
2 291.626250 885.722745 996.691261 486.568378 349.410194 151.412764
3 891.947611 773.542541 780.213921 489.000349 532.862838 189.855095
4 958.551868 882.662907 86.499676 243.609553 279.726092 215.662172
Create a DataFrame of column pair means
# use list slicing to select even and odd columns
even_cols = df.columns[0::2]
odd_cols = df.columns[1::2]
# zip the two lists into pairs
# zip creates tuples, but pandas requires a list of columns, so we map the tuples to lists
col_pairs = list(map(list, zip(even_cols, odd_cols)))
# in a list comprehension iterate through each column pair, get the mean, and concat the results into a dataframe
df_means = pd.concat([df[pairs].mean(axis=1) for pairs in col_pairs], axis=1)
# in a list comprehension create column header names with a string join
df_means.columns = [' & '.join(pair) for pair in col_pairs]
# display(df_means)
a & b c & d e & f
Class
0 791.529224 636.586267 455.979066
1 535.819101 276.264655 595.494792
2 588.674498 741.629819 250.411479
3 832.745076 634.607135 361.358966
4 920.607387 165.054615 247.694132
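If you want the pair means alongside the original columns rather than in a separate frame, you can concatenate the two (a small sketch using the df and df_means built above):
df_combined = pd.concat([df, df_means], axis=1)
# df_combined now has the original columns a..f plus 'a & b', 'c & d', 'e & f'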
Try this:
df['A B'] = df[['A', 'B']].mean(axis=1)
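If you have many such pairs, the same idea can be looped over the pairs (a sketch; the pairs below are hypothetical placeholders for your actual column names):
pairs = [('A', 'B'), ('C', 'D')]  # hypothetical column pairs
for left, right in pairs:
    df[f'{left} {right}'] = df[[left, right]].mean(axis=1)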

Converting Pandas DataFrame OrderedSet column into list

I have a Pandas DataFrame; one column is an OrderedSet, like this:
df
OrderedSetCol
0 OrderedSet([1721754, 3622558, 2550234, 2344034, 8550040])
This is:
from ordered_set import OrderedSet
I am just trying to convert this column into a list:
df['OrderedSetCol_list'] = df['OrderedSetCol'].apply(lambda x: ast.literal_eval(str("\'" + x.replace('OrderedSet(','').replace(')','') + "\'")))
The code executes successfully, but my column type is still str and not list:
type(df.loc[0]['OrderedSetCol_list'])
str
What am I doing wrong?
EDIT: My OrderedSetCol is also a string column, as I am reading a file from disk which was originally saved from an OrderedSet column.
Expected Output:
[1721754, 3622558, 2550234, 2344034, 8550040]
You can apply the list constructor, just as you would with the OrderedSet itself:
df = pd.DataFrame({'OrderedSetCol':[OrderedSet([1721754, 3622558, 2550234, 2344034, 8550040])]})
df.OrderedSetCol.apply(list)
Output:
[1721754, 3622558, 2550234, 2344034, 8550040]
If your column is of string dtype:
df.OrderedSetCol.str.findall(r'\d+')
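Note that str.findall returns lists of strings; if you need the integers shown in the expected output, one extra step converts them (a sketch, assuming the string-typed column described in the edit):
df['OrderedSetCol_list'] = (
    df.OrderedSetCol.str.findall(r'\d+')
      .apply(lambda xs: [int(x) for x in xs])
)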

Creating score column in Pyspark data frame using jellyfish package

I have a data frame like this
df = [id1, id2, name1, name2, address1, address2, DOB1, DOB2]
I would like to get the Jaro-Winkler score (in a new column) for column1 and column2 in the PySpark DataFrame. I am trying to use the jellyfish Python package.
Thanks
This response stems mainly from a similar question on Stack Overflow here. In their example, they investigate how null values can be dealt with when running a jellyfish string comparison.
You'll want to set up a UDF call to utilize the parallel processing powers of pyspark. See code below:
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
import jellyfish
# define the user defined function (UDF)
@udf(DoubleType())
def jaro_winkler(s1, s2):
    return jellyfish.jaro_winkler(s1, s2)
# to create a new column
df = df.withColumn('new_column',jaro_winkler(col('column1'),col('column2')))
# to show top 20 results
df.select('new_column').show()
For similar functionality with the option to deal with null values, I would suggest altering your function as follows:
@udf(DoubleType())
def jaro_winkler(s1, s2):
    if s1 is None or s2 is None:
        # return a float, since the UDF is declared with DoubleType
        out = 0.0
    else:
        out = jellyfish.jaro_winkler(s1, s2)
    return out
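One caveat: newer releases of the jellyfish package renamed this function, so depending on your installed version you may need jaro_winkler_similarity instead; a hedged compatibility sketch:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

try:
    from jellyfish import jaro_winkler_similarity as jw  # newer jellyfish versions
except ImportError:
    from jellyfish import jaro_winkler as jw  # older jellyfish versions

@udf(DoubleType())
def jaro_winkler_score(s1, s2):
    if s1 is None or s2 is None:
        return 0.0
    return float(jw(s1, s2))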

subtract mean from pyspark dataframe

I'm trying to calculate the average for each column in a dataframe and subtract it from each element in the column. I've created a function that attempts to do that, but when I try to implement it using a UDF, I get an error: 'float' object has no attribute 'map'. Any ideas on how I can create such a function? Thanks!
def normalize(data):
    average = data.map(lambda x: x[0]).sum() / data.count()
    out = data.map(lambda x: (x - average))
    return out

mapSTD = udf(normalize, IntegerType())
dats = data.withColumn('Normalized', mapSTD('Fare'))
In your example the problem is with the UDF, which cannot be applied to the whole DataFrame. A UDF can be applied only to a single row, but Spark also allows implementing UDAFs (User Defined Aggregate Functions) that work on the whole DataFrame.
To solve your problem you can use the function below:
from pyspark.sql.functions import mean

def normalize(df, column):
    average = df.agg(mean(df[column]).alias("mean")).collect()[0]["mean"]
    return df.select(df[column] - average)
Use it like this:
normalize(df, "Fare")
Please note that the above only works on a single column, but it is possible to implement something more generic:
def normalize(df, columns):
    selectExpr = []
    for column in columns:
        average = df.agg(mean(df[column]).alias("mean")).collect()[0]["mean"]
        selectExpr.append(df[column] - average)
    return df.select(selectExpr)
use it like:
normalize(df, ["col1", "col2"])
This works, but you need to run an aggregation for each column, so with many columns performance could be an issue. It is possible, however, to generate just one aggregate expression:
def normalize(df, columns):
    aggExpr = []
    for column in columns:
        aggExpr.append(mean(df[column]).alias(column))
    averages = df.agg(*aggExpr).collect()[0]
    selectExpr = []
    for column in columns:
        selectExpr.append(df[column] - averages[column])
    return df.select(selectExpr)
Adding onto Piotr's answer: if you need to keep the existing dataframe and add normalized columns with aliases, the function can be modified as follows:
def normalize(df, columns):
    aggExpr = []
    for column in columns:
        aggExpr.append(mean(df[column]).alias(column))
    averages = df.agg(*aggExpr).collect()[0]
    selectExpr = ['*']
    for column in columns:
        selectExpr.append((df[column] - averages[column]).alias('normalized_' + column))
    return df.select(selectExpr)
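A quick usage sketch (the column names here are placeholders for your own numeric columns):
df_norm = normalize(df, ["Fare", "Age"])
# df_norm keeps all original columns plus normalized_Fare and normalized_Age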
