Explode multiple array columns with variable lengths - apache-spark

How can I explode multiple array columns with variable lengths and potential nulls?
My input data looks like this:
+----+------------+--------------+--------------------+
|col1|        col2|          col3|                col4|
+----+------------+--------------+--------------------+
|   1|[id_1, id_2]|  [tim, steve]|       [apple, pear]|
|   2|[id_3, id_4]|       [jenny]|           [avocado]|
|   3|        null|[tommy, megan]| [apple, strawberry]|
|   4|        null|          null|[banana, strawberry]|
+----+------------+--------------+--------------------+
I need to explode this such that:
Array items with the same index are mapped to the same row
If there is only 1 entry in a column, it applies to every exploded row
If an array is null, it applies to every row
My output should look like this:
+----+----+-----+----------+
|col1|col2|col3 |col4      |
+----+----+-----+----------+
|1   |id_1|tim  |apple     |
|1   |id_2|steve|pear      |
|2   |id_3|jenny|avocado   |
|2   |id_4|jenny|avocado   |
|3   |null|tommy|apple     |
|3   |null|megan|strawberry|
|4   |null|null |banana    |
|4   |null|null |strawberry|
+----+----+-----+----------+
I have been able to achieve this using the following code, but I feel like there must be a more straightforward approach:
df = spark.createDataFrame(
    [
        (1, ["id_1", "id_2"], ["tim", "steve"], ["apple", "pear"]),
        (2, ["id_3", "id_4"], ["jenny"], ["avocado"]),
        (3, None, ["tommy", "megan"], ["apple", "strawberry"]),
        (4, None, None, ["banana", "strawberry"])
    ],
    ["col1", "col2", "col3", "col4"]
)
df.createOrReplaceTempView("my_table")
spark.sql("""
with cte as (
    SELECT
        col1,
        col2,
        col3,
        col4,
        greatest(size(col2), size(col3), size(col4)) as max_array_len
    FROM my_table
), arrays_extended as (
    select
        col1,
        case
            when col2 is null then array_repeat(null, max_array_len)
            else col2
        end as col2,
        case
            when size(col3) = 1 then array_repeat(col3[0], max_array_len)
            when col3 is null then array_repeat(null, max_array_len)
            else col3
        end as col3,
        case
            when size(col4) = 1 then array_repeat(col4[0], max_array_len)
            when col4 is null then array_repeat(null, max_array_len)
            else col4
        end as col4
    from cte
), arrays_zipped as (
    select *, explode(arrays_zip(col2, col3, col4)) as zipped
    from arrays_extended
)
select
    col1,
    zipped.col2,
    zipped.col3,
    zipped.col4
from arrays_zipped
""").show(truncate=False)

After you get max_array_len, just use the sequence function to iterate through the array indexes, transform each index into a struct, and then explode the resulting array of structs with inline_outer; see the SQL below:
spark.sql("""
with cte as (
    SELECT
        col1,
        col2,
        col3,
        col4,
        greatest(size(col2), size(col3), size(col4)) as max_array_len
    FROM my_table
)
SELECT inline_outer(
    transform(
        sequence(0, max_array_len-1), i -> (
            col1 as col1,
            col2[i] as col2,
            coalesce(col3[i], col3[0]) as col3,             /* fill null with the first array item of col3 */
            coalesce(col4[i], element_at(col4, -1)) as col4 /* fill null with the last array item of col4 */
        )
    )
)
FROM cte
""").show()
+----+----+-----+----------+
|col1|col2| col3|      col4|
+----+----+-----+----------+
|   1|id_1|  tim|     apple|
|   1|id_2|steve|      pear|
|   2|id_3|jenny|   avocado|
|   2|id_4|jenny|   avocado|
|   3|null|tommy|     apple|
|   3|null|megan|strawberry|
|   4|null| null|    banana|
|   4|null| null|strawberry|
+----+----+-----+----------+
A similar question here.

You can use inline_outer in conjunction with selectExpr, and additionally coalesce (first non-null value) to handle the null arrays and size mismatches between the different arrays.
Data Preparation
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType

inp_data = [
    (1, ['id_1', 'id_2'], ['tim', 'steve'], ['apple', 'pear']),
    (2, ['id_3', 'id_4'], ['jenny'], ['avocado']),
    (3, None, ['tommy', 'megan'], ['apple', 'strawberry']),
    (4, None, None, ['banana', 'strawberry'])
]
inp_schema = StructType([
    StructField('col1', IntegerType(), True),
    StructField('col2', ArrayType(StringType(), True)),
    StructField('col3', ArrayType(StringType(), True)),
    StructField('col4', ArrayType(StringType(), True))
])
sparkDF = spark.createDataFrame(data=inp_data, schema=inp_schema)
sparkDF.show(truncate=False)
+----+------------+--------------+--------------------+
|col1|col2        |col3          |col4                |
+----+------------+--------------+--------------------+
|1   |[id_1, id_2]|[tim, steve]  |[apple, pear]       |
|2   |[id_3, id_4]|[jenny]       |[avocado]           |
|3   |null        |[tommy, megan]|[apple, strawberry] |
|4   |null        |null          |[banana, strawberry]|
+----+------------+--------------+--------------------+
Inline Outer
sparkDF.selectExpr("col1",
                   """inline_outer(arrays_zip(
                          coalesce(col2, array()),
                          coalesce(col3, array()),
                          coalesce(col4, array())
                      ))"""
                   ).show(truncate=False)
+----+----+-----+----------+
|col1|0   |1    |2         |
+----+----+-----+----------+
|1   |id_1|tim  |apple     |
|1   |id_2|steve|pear      |
|2   |id_3|jenny|avocado   |
|2   |id_4|null |null      |
|3   |null|tommy|apple     |
|3   |null|megan|strawberry|
|4   |null|null |banana    |
|4   |null|null |strawberry|
+----+----+-----+----------+
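The zipped fields come back named 0, 1 and 2 because arrays_zip only keeps the original names for plain column references, not for expressions such as coalesce(...). As a hedged variation on the above (not part of the original answer), you can coalesce in a first selectExpr and then zip the already-named columns to keep col2/col3/col4 as field names; the size mismatches are still padded with nulls, as in the output above:
(sparkDF
    .selectExpr("col1",
                "coalesce(col2, array()) as col2",
                "coalesce(col3, array()) as col3",
                "coalesce(col4, array()) as col4")
    .selectExpr("col1", "inline_outer(arrays_zip(col2, col3, col4))")
    .show(truncate=False))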

You can use a UDF:
from pyspark.sql import functions as F, types as T

cols_of_interest = [c for c in df.columns if c != 'col1']

@F.udf(returnType=T.ArrayType(T.ArrayType(T.StringType())))
def get_sequences(*cols):
    """Equivalent of arrays_zip, but handles arrays of different lengths.
    For arrays shorter than the maximum length, the last element is repeated.
    """
    # Get the length of the longest array in the row
    max_len = max(map(len, filter(lambda x: x, cols)))
    return list(zip(*[
        # Create a list for each column with a length equal to max_len.
        # If the original column has fewer elements than needed, repeat the last one.
        # None values are replaced with a list of Nones of length max_len.
        [c[min(i, len(c) - 1)] for i in range(max_len)] if c else [None] * max_len
        for c in cols
    ]))

df2 = (
    df
    .withColumn('temp', F.explode(get_sequences(*cols_of_interest)))
    .select('col1',
            *[F.col('temp').getItem(i).alias(c) for i, c in enumerate(cols_of_interest)])
)
df2 is the following DataFrame:
+----+----+-----+----------+
|col1|col2| col3|      col4|
+----+----+-----+----------+
|   1|id_1|  tim|     apple|
|   1|id_2|steve|      pear|
|   2|id_3|jenny|   avocado|
|   2|id_4|jenny|   avocado|
|   3|null|tommy|     apple|
|   3|null|megan|strawberry|
|   4|null| null|    banana|
|   4|null| null|strawberry|
+----+----+-----+----------+

I used your logic and shortened it a little.
import pyspark.sql.functions as func

arrcols = ['col2', 'col3', 'col4']

data_sdf. \
    selectExpr(*['coalesce({0}, array()) as {0}'.format(c) if c in arrcols else c for c in data_sdf.columns]). \
    withColumn('max_size', func.greatest(*[func.size(c) for c in arrcols])). \
    selectExpr('col1',
               *['flatten(array({0}, array_repeat(element_at({0}, -1), max_size-size({0})))) as {0}'.format(c) for c in arrcols]
               ). \
    withColumn('arrzip', func.arrays_zip(*arrcols)). \
    selectExpr('col1', 'inline(arrzip)'). \
    orderBy('col1', 'col2'). \
    show()
# +----+----+-----+----------+
# |col1|col2| col3|      col4|
# +----+----+-----+----------+
# |   1|id_1|  tim|     apple|
# |   1|id_2|steve|      pear|
# |   2|id_3|jenny|   avocado|
# |   2|id_4|jenny|   avocado|
# |   3|null|megan|strawberry|
# |   3|null|tommy|     apple|
# |   4|null| null|    banana|
# |   4|null| null|strawberry|
# +----+----+-----+----------+
Approach steps:
fill nulls with empty arrays, and take the maximum size across all the array columns
add elements to the arrays that are smaller in size than the others
I took the last element of the array and used array_repeat on it (similar to your approach)
the number of repetitions is calculated by checking the max size against the size of the array being worked on (max_size-size({0}))
with the aforementioned steps, every array column now has the same number of elements, which enables you to zip them (arrays_zip) and explode (using the inline() SQL function)
The list comprehension in the second selectExpr generates the following:
['flatten(array({0}, array_repeat(element_at({0}, -1), max_size-size({0})))) as {0}'.format(c) for c in arrcols]
# ['flatten(array(col2, array_repeat(element_at(col2, -1), max_size-size(col2)))) as col2',
# 'flatten(array(col3, array_repeat(element_at(col3, -1), max_size-size(col3)))) as col3',
# 'flatten(array(col4, array_repeat(element_at(col4, -1), max_size-size(col4)))) as col4']
If it helps, here are the optimized logical plan and physical plan that Spark generated:
== Optimized Logical Plan ==
Generate inline(arrzip#363), [1], false, [col2#369, col3#370, col4#371]
+- Project [col1#0L, arrays_zip(flatten(array(coalesce(col2#1, []), array_repeat(element_at(coalesce(col2#1, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col2#1, []), true))))), flatten(array(coalesce(col3#2, []), array_repeat(element_at(coalesce(col3#2, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col3#2, []), true))))), flatten(array(coalesce(col4#3, []), array_repeat(element_at(coalesce(col4#3, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col4#3, []), true))))), col2, col3, col4) AS arrzip#363]
+- Filter (size(arrays_zip(flatten(array(coalesce(col2#1, []), array_repeat(element_at(coalesce(col2#1, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col2#1, []), true))))), flatten(array(coalesce(col3#2, []), array_repeat(element_at(coalesce(col3#2, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col3#2, []), true))))), flatten(array(coalesce(col4#3, []), array_repeat(element_at(coalesce(col4#3, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col4#3, []), true))))), col2, col3, col4), true) > 0)
+- LogicalRDD [col1#0L, col2#1, col3#2, col4#3], false
== Physical Plan ==
Generate inline(arrzip#363), [col1#0L], false, [col2#369, col3#370, col4#371]
+- *(1) Project [col1#0L, arrays_zip(flatten(array(coalesce(col2#1, []), array_repeat(element_at(coalesce(col2#1, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col2#1, []), true))))), flatten(array(coalesce(col3#2, []), array_repeat(element_at(coalesce(col3#2, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col3#2, []), true))))), flatten(array(coalesce(col4#3, []), array_repeat(element_at(coalesce(col4#3, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col4#3, []), true))))), col2, col3, col4) AS arrzip#363]
+- *(1) Filter (size(arrays_zip(flatten(array(coalesce(col2#1, []), array_repeat(element_at(coalesce(col2#1, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col2#1, []), true))))), flatten(array(coalesce(col3#2, []), array_repeat(element_at(coalesce(col3#2, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col3#2, []), true))))), flatten(array(coalesce(col4#3, []), array_repeat(element_at(coalesce(col4#3, []), -1, false), (greatest(size(coalesce(col2#1, []), true), size(coalesce(col3#2, []), true), size(coalesce(col4#3, []), true)) - size(coalesce(col4#3, []), true))))), col2, col3, col4), true) > 0)
+- *(1) Scan ExistingRDD[col1#0L,col2#1,col3#2,col4#3]

Related

How to create a Spark dataframe from one of the columns in an existing dataframe

Requirements:
I want to create a dataframe out of one column of an existing dataframe. That column's value is a list of multiple JSON objects.
Problem:
Since the JSON does not have a fixed schema, I wasn't able to use the from_json function, since it needs a schema up front to parse the column.
Example
| Column A | Column B |
| 1 | [{"id":"123","phone":"124"}] |
| 3 | [{"id":"456","phone":"741"}] |
Expected output:
| id | phone|
| 123 | 124 |
| 456 | 741 |
Any thoughts on this?
Try using Spark SQL to explode the "Column B" Array
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

spark = SparkSession.builder.appName("Test_app").getOrCreate()

input_data = [
    (1, [{"id": "123", "phone": "124"}]),
    (3, [{"id": "456", "phone": "741"}])
]
schema = StructType([
    StructField("Column A", IntegerType(), True),
    StructField("Column B", ArrayType(StructType([
        StructField("id", StringType(), True),
        StructField("phone", StringType(), True)
    ])), True)
])

df = spark.createDataFrame(input_data, schema)

# Backticks are needed in selectExpr because the column names contain spaces
df_exploded = df.selectExpr("`Column A`", "explode(`Column B`) as e") \
    .select("e.id", "e.phone")

df_exploded.show()
The output is below:
+---+-----+
| id|phone|
+---+-----+
|123| 124|
|456| 741|
+---+-----+
Alternatively, convert the column into an RDD and then read it as JSON. For testing I have removed the id element from the second row.
input_data = [
    (1, [{"id": "123", "phone": "124"}]),
    (3, [{"phone": "741"}])
]
df = spark.createDataFrame(input_data, ["ColA", "ColB"])

spark.read.json(df.rdd.map(lambda r: r.ColB)).show()
+----+-----+
| id|phone|
+----+-----+
| 123| 124|
|null| 741|
+----+-----+
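If the elements of ColB come back as Python dicts rather than JSON strings (which depends on how the schema was inferred), it can be safer to serialize them explicitly before handing the RDD to the JSON reader. A small hedged variation, assuming ColB is a list of plain dicts:
import json

# Serialize each ColB value into a proper JSON string before parsing
spark.read.json(df.rdd.map(lambda r: json.dumps(r.ColB))).show()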

PySpark Schema structure to read nested data

I am getting the following error:
ValueError: field col4: Length of object (1) does not match with length of fields (2)
The data is in this format.
[
["N","S","3",null,null],
["N","P","4",[{"key1":"val1","key2":"val2"}],null],
["N","I","5",null,[{"key1":"val1","key2":"val2"}]],
["N","S","3",null,null]
]
The schema I have defined is the following:
schema = StructType([
    StructField("col1", StringType(), True),
    StructField("col2", StringType(), True),
    StructField("col3", StringType(), True),
    StructField("col4",
        StructType([
            StructField("key1", StringType(), True),
            StructField("key2", StringType(), True)
        ])
    ),
    StructField("col5",
        StructType([
            StructField("key1", StringType(), True),
            StructField("key2", StringType(), True)
        ])
    )
])
Please help me identify how I can read data in this format.
Welcome to the StackOverflow community.
Coming to your question, first you need to replace null with None, as null is not a keyword in either Python or PySpark (unless you are using Spark SQL).
Now, regarding your schema - you need to define it with ArrayType wherever there is a complex or list column structure. Inside that, you again need to specify StructType, because within your list there is a dictionary with key and value pairs.
See the structure below to visualize it better -
data = [["N","S","3",None,None], ["N","P","4",[{"key1":"val1","key2":"val2"}],None], ["N","I","5",None,[{"key1":"val1","key2":"val2"}]], ["N","S","3",None, None] ]
You need to convert this to an RDD as below -
data_rdd = sc.parallelize(data)
Once your RDD is created, you need to create your dataframe using the schema I explained above -
from pyspark.sql.types import *

schema = StructType([
    StructField("col1", StringType(), True),
    StructField("col2", StringType(), True),
    StructField("col3", StringType(), True),
    StructField("col4",
        ArrayType(
            StructType([StructField("key1", StringType(), True),
                        StructField("key2", StringType(), True)])
        )
    ),
    StructField("col5",
        ArrayType(
            StructType([StructField("key1", StringType(), True),
                        StructField("key2", StringType(), True)])
        )
    )
])
df = spark.createDataFrame(data=data_rdd, schema=schema)
Output
df.show()
+----+----+----+--------------+--------------+
|col1|col2|col3|          col4|          col5|
+----+----+----+--------------+--------------+
|   N|   S|   3|          null|          null|
|   N|   P|   4|[{val1, val2}]|          null|
|   N|   I|   5|          null|[{val1, val2}]|
|   N|   S|   3|          null|          null|
+----+----+----+--------------+--------------+
Additionally, if you need the key and value as separate columns for both col4 and col5, you need to create the schema as below -
schema = StructType([
    StructField("col1", StringType(), True),
    StructField("col2", StringType(), True),
    StructField("col3", StringType(), True),
    StructField("col4",
        ArrayType(
            MapType(StringType(), StringType())
        )
    ),
    StructField("col5",
        ArrayType(
            MapType(StringType(), StringType())
        )
    )
])
from pyspark.sql.functions import *

df = spark.createDataFrame(data=sc.parallelize(data), schema=schema)
df.show(truncate=False)
#Input dataframe output -
+----+----+----+------------------------------+------------------------------+
|col1|col2|col3|col4                          |col5                          |
+----+----+----+------------------------------+------------------------------+
|N   |S   |3   |null                          |null                          |
|N   |P   |4   |[{key1 -> val1, key2 -> val2}]|null                          |
|N   |I   |5   |null                          |[{key1 -> val1, key2 -> val2}]|
|N   |S   |3   |null                          |null                          |
+----+----+----+------------------------------+------------------------------+
Finally, explode these columns col4 and col5 as below -
(df.withColumn('explode_col4', explode_outer( col('col4')))
.withColumn('explode_col5', explode_outer( col('col5')))
.select("col1", "col2", "col3", (explode_outer( col('explode_col4') ).alias('col4_key', 'col4_value')) , "explode_col5")
.select("col1", "col2", "col3", "col4_key", "col4_value", (explode_outer( col('explode_col5') ).alias('col5_key', 'col5_value')))
).show(truncate=False)
Output
+----+----+----+--------+----------+--------+----------+
|col1|col2|col3|col4_key|col4_value|col5_key|col5_value|
+----+----+----+--------+----------+--------+----------+
|N   |S   |3   |null    |null      |null    |null      |
|N   |P   |4   |key1    |val1      |null    |null      |
|N   |P   |4   |key2    |val2      |null    |null      |
|N   |I   |5   |null    |null      |key1    |val1      |
|N   |I   |5   |null    |null      |key2    |val2      |
|N   |S   |3   |null    |null      |null    |null      |
+----+----+----+--------+----------+--------+----------+

Pyspark join with functions and difference between timestamps

I am trying to join 2 tables of user events. I want to join table_a with table_b by user id (uid), and only when the difference between the timestamps is smaller than 5s (5000ms).
Here is what I am doing:
table_a = (
    table_a
    .join(
        table_b,
        table_a.uid == table_b.uid
        & abs(table_b.b_timestamp - table_a.a_timestamp) < 5000
        & table_a.a_timestamp.isNotNull(),
        how = 'left'
    )
)
I am getting 2 errors:
Error 1)
ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
Error 2, if I remove the 2nd condition on the join and leave only the 1st and 3rd:
org.apache.spark.sql.AnalysisException: cannot resolve '(`uid` AND (`a_timestamp` IS NOT NULL))' due to data type mismatch: differing types in '(`uid` AND (`a_timestamp` IS NOT NULL))' (string and boolean).;;
Any help is much appreciated!
You just need parentheses around each filtering condition. For example, the following works:
from pyspark.sql.functions import abs  # Spark's abs, so it operates on Column expressions

df1 = spark.createDataFrame([
    (1, 20),
    (1, 21),
    (1, 25),
    (1, 30),
    (2, 21),
], ['id', 'val'])
df2 = spark.createDataFrame([
    (1, 21),
    (2, 30),
], ['id', 'val'])

df1.join(
    df2,
    (df1.id == df2.id)
    & (abs(df1.val - df2.val) < 5)
).show()
# +---+---+---+---+
# | id|val| id|val|
# +---+---+---+---+
# | 1| 20| 1| 21|
# | 1| 21| 1| 21|
# | 1| 25| 1| 21|
# +---+---+---+---+
But without parens:
df1.join(
df2,
df1.id == df2.id
& abs(df1.val - df2.val) < 5
).show()
# ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
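Applied to the join from the question, the corrected condition would look roughly like the sketch below (the column and table names are taken from the question; pyspark.sql.functions.abs is used so the absolute value is computed on Column expressions):
from pyspark.sql import functions as F

table_a = table_a.join(
    table_b,
    (table_a.uid == table_b.uid)
    & (F.abs(table_b.b_timestamp - table_a.a_timestamp) < 5000)
    & (table_a.a_timestamp.isNotNull()),
    how='left'
)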

Replace null values with other Dataframe in PySpark

I have some data with products (DF); however, some don't have a description. I have an Excel file with the descriptions of some of them (loaded as Map). Now I would like to fill the missing values in DF with those from Map, keeping the rows that already have a description untouched, using PySpark.
DF
Id | Desc
01 | 'desc1'
02 | null
03 | 'desc3'
04 | null
Map
Key | Value
2 | 'desc2'
4 | 'desc4'
Output
Id | Desc
1 | 'desc1'
2 | 'desc2'
3 | 'desc3'
4 | 'desc4'
Thanks in advance
You'll want to make sure the DF.Id field and the Map.Key field have the same type and values (currently, they don't look like it, given the leading zeros), then do a left join, and then select the desired columns with a coalesce(). My PySpark is a bit rusty, so I'll provide the solution in Scala. The logic should be the same.
val df = Seq(
  (1, "desc1"),
  (2, null),
  (3, "desc3"),
  (4, null)
).toDF("Id", "Desc")

val map = Seq(
  (2, "desc2"),
  (4, "desc4")
).toDF("Key", "Value")

df.show()
map.show()

df.join(map, df("Id") === map("Key"), "left")
  .select(
    df("Id"),
    coalesce(df("Desc"), $"Value").as("Desc")
  )
  .show()
Yields:
+---+-----+
| Id| Desc|
+---+-----+
| 1|desc1|
| 2| null|
| 3|desc3|
| 4| null|
+---+-----+
+---+-----+
|Key|Value|
+---+-----+
| 2|desc2|
| 4|desc4|
+---+-----+
+---+-----+
| Id| Desc|
+---+-----+
| 1|desc1|
| 2|desc2|
| 3|desc3|
| 4|desc4|
+---+-----+
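For reference, a rough PySpark equivalent of the same join-and-coalesce logic (a sketch, assuming a SparkSession named spark and the column names from the question):
from pyspark.sql import functions as F

df = spark.createDataFrame(
    [(1, "desc1"), (2, None), (3, "desc3"), (4, None)], ["Id", "Desc"]
)
map_df = spark.createDataFrame(
    [(2, "desc2"), (4, "desc4")], ["Key", "Value"]
)

(df.join(map_df, df["Id"] == map_df["Key"], "left")
   .select(df["Id"], F.coalesce(df["Desc"], map_df["Value"]).alias("Desc"))
   .show())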
In PySpark, with the help of a UDF:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([StructField("Index", IntegerType(), True),
                     StructField("Desc", StringType(), True)])
DF = sc.parallelize([(1, "desc1"), (2, None), (3, "desc3"), (4, None)]).toDF(schema)

myMap = {
    2: "desc2",
    4: "desc4"
}
myMapBroadcasted = sc.broadcast(myMap)

@udf(StringType())
def fillNone(Index, Desc):
    if Desc is None:
        if Index in myMapBroadcasted.value:
            return myMapBroadcasted.value[Index]
    return Desc

DF.withColumn('Desc', fillNone(col('Index'), col('Desc'))).show()
It's hard to know the cardinality of the datasets that you've provided... some examples of how that might change a solution here are:
If "DF" and "Map" have overlapping Desc... how should we prioritize which table has the "right" description?
Does the final dataframe that you are looking to create need to be fully inclusive of a list of ID's or descriptions? Do either of these dataframes have the full list? This could also change the solution.
I've made some assumptions so that you can determine for yourself what is the right approach here:
I'm assuming that "DF" contains the whole list of IDs
I'm assuming that "Map" only has a subset of IDs and is not wholly inclusive of the broader set of IDs that exist within "DF"
I'm using PySpark here:
DF = DF.na.drop() # we'll eliminate the missing values from the parent dataframe
DF_Output = DF.join(Map, on = "ID", how = 'outer')
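As written, that outer join still leaves Desc and Value as separate columns (and the key columns are named differently). A hedged sketch of one way to finish it, reusing the already-filtered DF from above and assuming the Id/Key values are of comparable types:
from pyspark.sql import functions as F

DF_Output = (
    DF.join(Map, DF["Id"] == Map["Key"], "outer")
      .select(
          F.coalesce(DF["Id"], Map["Key"]).alias("Id"),
          F.coalesce(DF["Desc"], Map["Value"]).alias("Desc"),
      )
)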
We can divide DF into two dataframes, operate on them separately, and then union them:
val df = Seq(
  (1, "desc1"),
  (2, null),
  (3, "desc3"),
  (4, null)
).toDF("Id", "Desc")

val Map = Seq(
  (2, "desc2"),
  (4, "desc4")
).toDF("Key", "Value")

val nullDF = df.where(df("Desc").isNull)
val nonNullDF = df.where(df("Desc").isNotNull)

val joinedWithKeyDF = nullDF.drop("Desc")
  .join(Map, nullDF("Id") === Map("Key"))
  .withColumnRenamed("Value", "Desc")
  .drop("Key")

val outputDF = joinedWithKeyDF.union(nonNullDF)

How could a PySpark RDD linear list be converted to a DataFrame?

I'd like to convert a linear list to a dataframe.
i.e. given the following list,
a = ["a1", "a2", "a3", "b1", "b2", "b3", "c1", "c2", "c3"]
Expected result is,
+--------------------+
| col1 | col2 | col3 |
+--------------------+
| a1 | a2 | a3 |
| b1 | b2 | b3 |
| c1 | c2 | c3 |
+--------------------+
I tried the following but got an error.
from pyspark.sql.types import *

a = ["a1", "a2", "a3", "b1", "b2", "b3", "c1", "c2", "c3"]
rdd = sc.parallelize(a)
schema = StructType([
    StructField("a", StringType(), True),
    StructField("b", StringType(), True),
    StructField("c", StringType(), True)
])
df = sqlContext.createDataFrame(rdd, schema)
df.show()
The last show() statement fails with the error "Job aborted due to stage failure".
Could someone please tell me the solution?
Thanks.
Based on your comment, I presume that you start with the rdd and not the list.
I further assume that you are determining order based on the index of the rdd. If these assumptions are correct, you can use zipWithIndex() to add a row number to each record.
Then divide the row number by 3 (use integer division) to group every 3 consecutive records. Next use groupByKey() to aggregate the records with the same key into a tuple.
Finally, drop the key and call toDF()
rdd.zipWithIndex()\
    .map(lambda row: (row[1]//3, row[0]))\
    .groupByKey()\
    .map(lambda row: tuple(row[1]))\
    .toDF(["a", "b", "c"])\
    .show()
#+---+---+---+
#| a| b| c|
#+---+---+---+
#| a1| a2| a3|
#| c1| c2| c3|
#| b1| b2| b3|
#+---+---+---+
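One caveat, not covered in the answer above: groupByKey guarantees neither the order of the keys nor, in general, the order of the values within a group, which is why the c row can appear before the b row in that output. A hedged variation that carries the position along and sorts explicitly:
(rdd.zipWithIndex()
    .map(lambda kv: (kv[1] // 3, (kv[1] % 3, kv[0])))           # (group_id, (position, value))
    .groupByKey()
    .mapValues(lambda vals: tuple(v for _, v in sorted(vals)))  # restore position order inside each group
    .sortByKey()                                                # keep the groups in their original order
    .values()
    .toDF(["a", "b", "c"])
    .show())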
Here is a way that should hopefully meet your criteria
import pyspark.sql.functions as f

# First get a 1-column DF
df = spark.createDataFrame(sc.parallelize(a).map(lambda x: [x]), schema=['col'])

# Split each value into a letter and a number, e.g. 'a1' --> ['a', '1']
df = df.withColumn('letter', f.split('col', '').getItem(0))
df = df.withColumn('number', f.split('col', '').getItem(1))

# Now pivot to get what you want (dropping extraneous columns and ordering
# to get the exact output)
output = (df.groupBy('letter')
            .pivot('number')
            .agg(f.first('col'))
            .select([f.col(column).alias('col%s' % column) for column in ['1', '2', '3']])
            .orderBy('col1')
            .drop('letter'))
