Generate n columns in a dataframe based on multiple values - apache-spark

I have created a dataframe like this from a table:
from pyspark.sql.functions import collect_list

df = spark.sql("select * from test")  # it has 2 columns: id and name
df2 = df.groupby('id').agg(collect_list('name').alias('name'))
df2.show()
+----------+----------------------------------+
|        id|                              name|
+----------+----------------------------------+
|44038:4572|                [0032477212299451]|
|44038:5439|[00324772, 0032477, 003247, 00324]|
|44038:4429|                [0032477212299308]|
+----------+----------------------------------+
Up to here it's correct: for one id I can store multiple names (values).
Now, when I try to create dynamic columns in the dataframe based on those values, it does not work:
df3 = df2.select([df2.id] + [df2.name[i] for i in range(length)])
Output:
+----------+----------------+
|        id|         name[0]|
+----------+----------------+
|44038:4572|0032477212299451|
|44038:5439|        00324772|
|44038:4429|0032477212299308|
+----------+----------------+
Expected output in dataframe:
+----------+----------------+-------+-------+-------+
|        id|         name[0]|name[1]|name[2]|name[3]|
+----------+----------------+-------+-------+-------+
|44038:4572|0032477212299451|   null|   null|   null|
|44038:5439|        00324772|0032477| 003247|  00324|
|44038:4429|0032477212299308|   null|   null|   null|
+----------+----------------+-------+-------+-------+
And then I have to replace the nulls with 0.

You might be better off doing pivot instead of collect_list:
from pyspark.sql import functions as F, Window

df2 = (df.withColumn('rn', F.row_number().over(Window.partitionBy('id').orderBy(F.desc('name'))))
         .groupBy('id')
         .pivot('rn')
         .agg(F.first('name'))
         .fillna("0")
      )
df2.show()
+----------+----------------+-------+------+-----+
|        id|               1|      2|     3|    4|
+----------+----------------+-------+------+-----+
|44038:4572|0032477212299451|      0|     0|    0|
|44038:5439|        00324772|0032477|003247|00324|
|44038:4429|0032477212299308|      0|     0|    0|
+----------+----------------+-------+------+-----+
If you want pretty column names, you can do
df3 = df2.toDF('id', *[f'name{i}' for i in range(len(df2.columns) - 1)])
df3.show()
+----------+----------------+-------+------+-----+
|        id|           name0|  name1| name2|name3|
+----------+----------------+-------+------+-----+
|44038:4572|0032477212299451|      0|     0|    0|
|44038:5439|        00324772|0032477|003247|00324|
|44038:4429|0032477212299308|      0|     0|    0|
+----------+----------------+-------+------+-----+
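Alternatively, if you want to keep the collect_list result from the question, a minimal sketch (assuming df2 has the columns id and name as shown above; max_len and df3 are just illustrative names) is to compute the maximum array length first, index into the array, and fill the missing positions with 0:
from pyspark.sql import functions as F

# assumption: df2 is the collect_list output from the question (columns: id, name)
max_len = df2.select(F.max(F.size('name'))).first()[0]

df3 = df2.select(
    'id',
    *[F.coalesce(F.col('name')[i], F.lit('0')).alias(f'name{i}') for i in range(max_len)]
)
df3.show()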

Related

How to set the value of a Pyspark column based on two conditions of the value of another column

Say I have a dataframe:
+---+---+---+
| id|foo|bar|
+---+---+---+
|  1|baz|  0|
|  2|baz|  0|
|  3|333|  2|
|  4|444|  1|
+---+---+---+
I want to set the 'foo' column to a value depending on the value of bar.
If bar is 2: set the value of foo for that row to 'X',
else if bar is 1: set the value of foo for that row to 'Y'
And if neither condition is met, leave the foo value as it is.
pyspark.when seems like the closest method, but that doesn't seem to work based on another column's value.
when can work with other columns. You can use F.col to get the value of the other column and provide an appropriate condition:
import pyspark.sql.functions as F

df2 = df.withColumn(
    'foo',
    F.when(F.col('bar') == 2, 'X')
     .when(F.col('bar') == 1, 'Y')
     .otherwise(F.col('foo'))
)
df2.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+
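If you prefer, the same logic can also be written as a SQL CASE expression via F.expr; a small equivalent sketch on the same df:
import pyspark.sql.functions as F

# equivalent to the chained F.when above, expressed as CASE WHEN
df2 = df.withColumn(
    'foo',
    F.expr("CASE WHEN bar = 2 THEN 'X' WHEN bar = 1 THEN 'Y' ELSE foo END")
)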
We can solve this using when or a UDF in Spark to insert a new column based on a condition.
Create Sample DataFrame:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('AddConditionalColumn').getOrCreate()
data = [(1,"baz",0),(2,"baz",0),(3,"333",2),(4,"444",1)]
columns = ["id","foo","bar"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3|333| 2|
| 4|444| 1|
+---+---+---+
Using when:
from pyspark.sql.functions import when

df2 = df.withColumn("foo", when(df.bar == 2, "X")
                           .when(df.bar == 1, "Y")
                           .otherwise(df.foo))
df2.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+
Using UDF:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

def executeRule(value):
    if value == 2:
        return 'X'
    elif value == 1:
        return 'Y'
    else:
        # neither condition matched: the UDF only receives bar,
        # so it returns the bar value itself rather than the original foo
        return value

# Converting the function to a UDF
ruleUDF = F.udf(executeRule, StringType())

df3 = df.withColumn("foo", ruleUDF("bar"))
df3.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1| 0| 0|
| 2| 0| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+
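As the output above shows, rows where neither condition matches end up with the bar value (0) instead of keeping "baz", because the UDF only sees bar. If you want the UDF variant to behave like the when version, here is a sketch that passes both columns in (executeRule2, ruleUDF2 and df4 are just illustrative names):
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

def executeRule2(foo, bar):
    if bar == 2:
        return 'X'
    elif bar == 1:
        return 'Y'
    # keep the original foo when neither condition matches
    return foo

ruleUDF2 = F.udf(executeRule2, StringType())
df4 = df.withColumn("foo", ruleUDF2("foo", "bar"))
df4.show()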

How to create an index column, every 7 occurrences in pyspark

I have a pyspark dataframe that looks like this:
import pandas as pd
foo = pd.DataFrame({'date_col':['2010-02-27','2010-01-20','2010-01-20','2010-01-21','2010-01-21','2010-02-21','2010-02-22','2010-02-23','2010-02-24','2010-02-25','2010-02-26','2010-01-20','2010-01-21','2010-02-20'], 'group':['a','a','a','a','a','a','a','a','a','a','a','b','b','b']})
I would like to create a week column: an index that increases every 7 ordered unique values of date_col, within each group.
The resulting dataframe should look like this:
foo = pd.DataFrame({'date_col':['2010-02-27','2010-01-20','2010-01-20','2010-01-21','2010-01-21','2010-02-21','2010-02-22','2010-02-23','2010-02-24','2010-02-25','2010-02-26','2010-01-20','2010-01-21','2010-02-20'], 'group':['a','a','a','a','a','a','a','a','a','a','a','b','b','b'],
'week':[2,2,1,1,1,1,1,1,1,1,1,1,1,1]})
Any ideas how I could do that in pyspark?
UPDATE
Some more explanation of the logic.
Basically the operation could be split into the following steps:
Order foo on date_col, grouped by group
Create a temp_index column, which ranks date_col within each group
Create a week column, which is the integer division of temp_index by 7
You can use dense_rank and divide the rank by 7. You need to subtract 1 before dividing because in SQL, ranks start from 1 rather than 0.
from pyspark.sql import functions as F, Window

df2 = df.withColumn(
    'week',
    (
        (F.dense_rank().over(Window.partitionBy('group').orderBy('date_col')) - 1) / 7
    ).cast('int')
)
df2.show()
+----------+-----+----+
| date_col|group|week|
+----------+-----+----+
|2010-01-20| b| 0|
|2010-01-21| b| 0|
|2010-02-20| b| 0|
|2010-01-20| a| 0|
|2010-01-20| a| 0|
|2010-01-21| a| 0|
|2010-01-21| a| 0|
|2010-02-21| a| 0|
|2010-02-22| a| 0|
|2010-02-23| a| 0|
|2010-02-24| a| 0|
|2010-02-25| a| 0|
|2010-02-26| a| 1|
|2010-02-27| a| 1|
+----------+-----+----+
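Note that the expected output in the question starts the week index at 1 rather than 0; if that is what you want, a minimal tweak of the same expression (same df, same imports as above) is to add 1 after the cast:
from pyspark.sql import functions as F, Window

df2 = df.withColumn(
    'week',
    ((F.dense_rank().over(Window.partitionBy('group').orderBy('date_col')) - 1) / 7).cast('int') + 1
)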

Pyspark: Dropping columns with no distinct values only using transformations [duplicate]

Related question: How to drop columns which have same values in all rows via pandas or spark dataframe?
So I have a pyspark dataframe, and I want to drop the columns where all values are the same in all rows while keeping other columns intact.
However the answers in the above question are only for pandas. Is there a solution for pyspark dataframe?
Thanks
You can apply the countDistinct() aggregation function on each column to get the count of distinct values per column. A column with count=1 has only one value across all rows.
from pyspark.sql.functions import col, countDistinct

# apply countDistinct on each column
col_counts = df.agg(*(countDistinct(col(c)).alias(c) for c in df.columns)).collect()[0].asDict()
# select the columns with count=1 in a list
cols_to_drop = [c for c in df.columns if col_counts[c] == 1]
# drop the selected columns
df.drop(*cols_to_drop).show()
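One caveat: countDistinct ignores nulls, so a column that is entirely null reports a distinct count of 0 and would slip past the == 1 check even though it holds no information. If you want to drop such columns as well, a small tweak of the snippet above:
# also treat all-null columns (distinct count 0) as droppable
cols_to_drop = [c for c in df.columns if col_counts[c] <= 1]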
You can use the approx_count_distinct function to count the number of distinct elements in a column. In case there is just one distinct value, remove the corresponding column.
Creating the DataFrame
from pyspark.sql.functions import approx_count_distinct
myValues = [(1,2,2,0),(2,2,2,0),(3,2,2,0),(4,2,2,0),(3,1,2,0)]
df = sqlContext.createDataFrame(myValues,['value1','value2','value3','value4'])
df.show()
+------+------+------+------+
|value1|value2|value3|value4|
+------+------+------+------+
| 1| 2| 2| 0|
| 2| 2| 2| 0|
| 3| 2| 2| 0|
| 4| 2| 2| 0|
| 3| 1| 2| 0|
+------+------+------+------+
Counting the number of distinct elements and converting the result into a dictionary.
count_distinct_df = df.select([approx_count_distinct(x).alias(x) for x in df.columns])
count_distinct_df.show()
+------+------+------+------+
|value1|value2|value3|value4|
+------+------+------+------+
| 4| 2| 1| 1|
+------+------+------+------+
dict_of_columns = count_distinct_df.toPandas().to_dict(orient='list')
dict_of_columns
{'value1': [4], 'value2': [2], 'value3': [1], 'value4': [1]}
# Store the column names that have just 1 distinct value in a list.
distinct_columns=[k for k,v in dict_of_columns.items() if v == [1]]
distinct_columns
['value3', 'value4']
Drop the columns having only one distinct value
df=df.drop(*distinct_columns)
df.show()
+------+------+
|value1|value2|
+------+------+
| 1| 2|
| 2| 2|
| 3| 2|
| 4| 2|
| 3| 1|
+------+------+

Do Spark/Parquet partitions maintain ordering?

If I partition a data set, will it be in the correct order when I read it back? For example, consider the following pyspark code:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# read a csv
df = sql_context.read.csv(input_filename)
# add a hash column
hash_udf = udf(lambda customer_id: hash(customer_id) % 4, IntegerType())
df = df.withColumn('hash', hash_udf(df['customer_id']))
# write out to parquet
df.write.parquet(output_path, partitionBy=['hash'])
# read back the file
df2 = sql_context.read.parquet(output_path)
I am partitioning on a customer_id bucket. When I read back the whole data set, are the partitions guaranteed to be merged back together in the original insertion order?
Right now, I'm not so sure, so I'm adding a sequence column:
df = df.withColumn('seq', monotonically_increasing_id())
However, I don't know if this is redundant.
No, it's not guaranteed. Try it with even a tiny data set:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

df = spark.createDataFrame([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')], ['customer_id', 'name'])
# add a hash column
hash_udf = udf(lambda customer_id: hash(customer_id) % 4, IntegerType())
df = df.withColumn('hash', hash_udf(df['customer_id']))
# write out to parquet
df.write.parquet("test", partitionBy=['hash'], mode="overwrite")
# read back the file
df2 = spark.read.parquet("test")
df.show()
+-----------+----+----+
|customer_id|name|hash|
+-----------+----+----+
| 1| a| 1|
| 2| b| 2|
| 3| c| 3|
| 4| d| 0|
+-----------+----+----+
df2.show()
+-----------+----+----+
|customer_id|name|hash|
+-----------+----+----+
| 2| b| 2|
| 1| a| 1|
| 4| d| 0|
| 3| c| 3|
+-----------+----+----+
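To answer the last part of the question: the seq column is not redundant if you care about the original order. A minimal sketch (reusing output_path and the monotonically_increasing_id idea from the question) of restoring the order when reading back:
from pyspark.sql.functions import monotonically_increasing_id

# tag each row with an increasing id before writing ...
df = df.withColumn('seq', monotonically_increasing_id())
df.write.parquet(output_path, partitionBy=['hash'], mode='overwrite')

# ... and sort by it explicitly when reading back
df2 = spark.read.parquet(output_path).orderBy('seq')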

Saving iteratively to a new DataFrame in Pyspark

I'm performing computations based on 3 different PySpark DataFrames.
This script works in the sense that it performs the computation as it should; however, I struggle to work with the results of that computation properly.
import sys
import numpy as np
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType

sc = SparkContext("local")
sqlContext = SQLContext(sc)
# Dummy Data
df = sqlContext.createDataFrame([[0,1,0,0,0],[1,1,0,0,1],[0,0,1,0,1],[1,0,1,1,0],[1,1,0,0,0]], ['p1', 'p2', 'p3', 'p4', 'p5'])
df.show()
+---+---+---+---+---+
| p1| p2| p3| p4| p5|
+---+---+---+---+---+
| 0| 1| 0| 0| 0|
| 1| 1| 0| 0| 1|
| 0| 0| 1| 0| 1|
| 1| 0| 1| 1| 0|
| 1| 1| 0| 0| 0|
+---+---+---+---+---+
# Values
values = sqlContext.createDataFrame([(0,1,'p1'),(None,1,'p2'),(0,0,'p3'),(None,0, 'p4'),(1,None,'p5')], ('f1', 'f2','index'))
values.show()
+----+----+-----+
| f1| f2|index|
+----+----+-----+
| 0| 1| p1|
|null| 1| p2|
| 0| 0| p3|
|null| 0| p4|
| 1|null| p5|
+----+----+-----+
# Weights
weights = sqlContext.createDataFrame([(4,3,'p1'),(None,1,'p2'),(2,2,'p3'),(None, 3, 'p4'),(3,None,'p5')], ('f1', 'f2','index'))
weights.show()
+----+----+-----+
| f1| f2|index|
+----+----+-----+
| 4| 3| p1|
|null| 1| p2|
| 2| 2| p3|
|null| 3| p4|
| 3|null| p5|
+----+----+-----+
# Function: it sums the vector W at the positions where Row equals V and then divides by the length of the weight vector.
# If there are no matches between Row and V, it outputs 0.
def W_sum(row, v, w):
    if len(w[row == v]) > 0:
        return float(np.sum(w[row == v]) / len(w))
    else:
        return 0.0
For each of the value columns and for each row of the data, the above function is applied.
# We iterate over the columns of Values (except the last one, called index)
for val in values.columns[:-1]:
    # we filter the data to work only with the columns that are defined for the selected value
    defined_col = [i[0] for i in values.where(F.col(val) >= 0).select(values.index).collect()]
    # we select only the useful columns
    df_select = df.select(defined_col)
    # we retrieve the reference values and weights
    V = np.array(values.where(values.index.isin(defined_col)).select(val).collect()).flatten()
    W = np.array(weights.where(weights.index.isin(defined_col)).select(val).collect()).flatten()
    W_sum_udf = F.udf(lambda row: W_sum(row, V, W), FloatType())
    df_select.withColumn(val, W_sum_udf(F.array(*(F.col(x) for x in df_select.columns))))
This gives:
+---+---+---+---+---+---+
| p1| p2| p3| p4| p5| f1|
+---+---+---+---+---+---+
| 0| 1| 0| 0| 0|2.0|
| 1| 1| 0| 0| 1|1.0|
| 0| 0| 1| 0| 1|2.0|
| 1| 0| 1| 1| 0|0.0|
| 1| 1| 0| 0| 0|0.0|
+---+---+---+---+---+---+
It added the column to the sliced DataFrame as I asked it to. The problem is that I would rather collect the results into a new DataFrame that I can access at the end to consult them.
Is it possible to grow (somewhat efficiently) a DataFrame in PySpark as I would with pandas?
Edit, to make my goal clearer:
Ideally I would get a DataFrame with just the computed columns, like this:
+---+---+
| f1| f2|
+---+---+
|2.0|1.0|
|1.0|2.0|
|2.0|0.0|
|0.0|0.0|
|0.0|2.0|
+---+---+
There are some issues with your question...
First, your for loop never keeps its result: the last line calls withColumn but does not assign it to anything, so nothing survives from one iteration to the next (what does it actually produce?).
Assuming that your last line is effectively something like
new_df = df_select.withColumn(val, W_sum_udf(F.array(*(F.col(x) for x in df_select.columns))))
then your problem becomes clearer. Since
values.columns[:-1]
# ['f1', 'f2']
the result of the whole loop would be just
+---+---+---+---+---+
| p1| p2| p3| p4| f2|
+---+---+---+---+---+
| 0| 1| 0| 0|1.0|
| 1| 1| 0| 0|2.0|
| 0| 0| 1| 0|0.0|
| 1| 0| 1| 1|0.0|
| 1| 1| 0| 0|2.0|
+---+---+---+---+---+
i.e. with only the column f2 included (natural, since the results with f1 are simply overwritten).
Now, as I said, assuming that the situation is like this, and that your problem is actually how to have both columns f1 & f2 together rather than in different dataframes, you can just forget df_select and append the columns to your initial df, possibly dropping the unwanted ones afterwards:
init_cols = df.columns
init_cols
# ['p1', 'p2', 'p3', 'p4', 'p5']

new_df = df

for val in values.columns[:-1]:
    # we filter the data to work only with the columns that are defined for the selected value
    defined_col = [i[0] for i in values.where(F.col(val) >= 0).select(values.index).collect()]
    # we retrieve the reference values and weights
    V = np.array(values.where(values.index.isin(defined_col)).select(val).collect()).flatten()
    W = np.array(weights.where(weights.index.isin(defined_col)).select(val).collect()).flatten()
    W_sum_udf = F.udf(lambda row: W_sum(row, V, W), FloatType())
    new_df = new_df.withColumn(val, W_sum_udf(F.array(*(F.col(x) for x in defined_col))))  # change here

# drop the initial columns:
for i in init_cols:
    new_df = new_df.drop(i)
The resulting new_df will be:
+---+---+
| f1| f2|
+---+---+
|2.0|1.0|
|1.0|2.0|
|2.0|0.0|
|0.0|0.0|
|0.0|2.0|
+---+---+
UPDATE (after comment): To force the division in your W_sum function to be a float under Python 2, use:
from __future__ import division
(In Python 3, / is already true division, so this import is not needed there.)
new_df now will be:
+---------+----+
| f1| f2|
+---------+----+
| 2.0| 1.5|
|1.6666666|2.25|
|2.3333333|0.75|
| 0.0|0.75|
|0.6666667|2.25|
+---------+----+
with f2 exactly as it should be according to your comment.
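Equivalently, here is a small sketch of making the division explicit inside W_sum, so the result does not depend on the __future__ import (same function as above, only the cast moved):
import numpy as np

def W_sum(row, v, w):
    matches = w[row == v]
    if len(matches) > 0:
        # cast the numerator to float so the division is always floating-point
        return float(np.sum(matches)) / len(w)
    return 0.0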
