How to zip two column in pyspark? [duplicate] - python-3.x

This question already has answers here:
How to zip two array columns in Spark SQL
(4 answers)
Closed 2 years ago.
I use: Python 3.6 and PySpark 2.3.0. In the following example I have only two items in item, but there can be more fields, such as first_name, last_name, city.
I have a data frame with the following schema:
|-- email: string (nullable = true)
| -- item: struct(nullable=true)
| | -- item: array(nullable=true)
| | | -- element: struct(containsNull=true)
| | | | -- data: string(nullable=true)
| | | | -- fieldid: string(nullable=true)
| | | | -- fieldname: string(nullable=true)
| | | | -- fieldtype: string(nullable=true)
This is my output:
+-----+-----------------------------------------------------------------------------------------+
|email|item |
+-----+-----------------------------------------------------------------------------------------+
|x |[[[Gmail, 32, Email Client, dropdown], [Device uses Proxy Server, 33, Device, dropdown]]]|
|y |[[[IE, 32, Email Client, dropdown], [Personal computer, 33, Device, dropdown]]] |
+-----+-----------------------------------------------------------------------------------------+
I want to transform this data frame to:
+-----+-------------------------------------+
|email|Email Client|Device |
+-----+-------------------------------------+
|x |Gmail |Device uses Proxy Server|
|y |IE |Personal computer |
+-----+-------------------------------------+
I do some transformations:
# Replace the outer `item` struct with its inner `item` array
df = df.withColumn('item', df.item.item)
# Collect the `fieldname` of every array element (the future column names)
df = df.withColumn('column_names', df.item.fieldname)
# Collect the `data` of every array element (the future column values)
df = df.withColumn('column_values', df.item.data)
And now my output is:
+-----+----------------------+---------------------------------+
|email|column_names |column_values |
+-----+----------------------+---------------------------------+
|x |[Email Client, Device]|[Gmail, Device uses Proxy Server]|
|y |[Email Client, Device]|[IE, Personal computer] |
+-----+----------------------+---------------------------------+
From here, I want a way to zip these two columns.

You asked how to zip the arrays, but you can actually get to your desired output without the intermediate steps of creating the column_names and column_values columns.
Use the getItem() function to grab the desired values by index:
import pyspark.sql.functions as f

# `item.data` is the array of `data` values; pick its entries by position
data_values = f.col('item.data')
df = df.select(
    'email',
    data_values.getItem(0).alias('Email Client'),
    data_values.getItem(1).alias('Device'),
)
df.show(truncate=False)
#+-----+------------+------------------------+
#|email|Email Client|Device |
#+-----+------------+------------------------+
#|x |Gmail |Device uses Proxy Server|
#|y |IE |Personal computer |
#+-----+------------+------------------------+
This assumes that the Email Client field is always at index 0 and Device is at index 1.
If you can't assume that the fields are always in the same order in each row, another option is to create a map from the values in the column_names and column_values using pyspark.sql.functions.create_map().
This function takes a:
list of column names (string) or list of Column expressions that [are] grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).
We iterate over the items in column_names and column_values to create a list of the pairs, and then use list(chain.from_iterable(...)) to flatten the list.
After the list is made, you can select the field by name.
from itertools import chain

# first create a map type column called 'map'
# create_map() takes a flat sequence key1, value1, key2, value2, ...,
# so build the [name, value] pair for each index and flatten the pairs.
key_value_columns = list(
    chain.from_iterable(
        [f.col('column_names').getItem(i), f.col('column_values').getItem(i)]
        for i in range(2)
    )
)
# Keep the result: select() returns a new DataFrame, and the lookups by key
# that follow need the 'map' column to exist on df.
df = df.select(
    'email',
    f.create_map(key_value_columns).alias('map')
)
df.show(truncate=False)
#+-----+--------------------------------------------------------------+
#|email|map |
#+-----+--------------------------------------------------------------+
#|x |Map(Email Client -> Gmail, Device -> Device uses Proxy Server)|
#|y |Map(Email Client -> IE, Device -> Personal computer) |
#+-----+--------------------------------------------------------------+
# now select the fields by key, aliasing each extracted value with its key
wanted_keys = ("Email Client", "Device")
df = df.select(
    'email',
    *[f.col('map').getField(key).alias(key) for key in wanted_keys]
)
This assumes that there will always be at least 2 elements in each array.
If you wanted to zip lists of arbitrary length, you would have to use a udf.
from pyspark.sql.types import ArrayType, StringType

# define the udf
# Each zipped element is itself a [name, value] list, so the declared return
# type must be an array of string arrays rather than an array of strings.
zip_lists = f.udf(
    lambda x, y: [list(z) for z in zip(x, y)],
    ArrayType(ArrayType(StringType()))
)
# use the udf to zip the lists
df.select(
    'email',
    zip_lists(f.col('column_names'), f.col('column_values')).alias('zipped')
).show(truncate=False)
#+-----+-----------------------------------------------------------+
#|email|zipped |
#+-----+-----------------------------------------------------------+
#|x |[[Email Client, Gmail], [Device, Device uses Proxy Server]]|
#|y |[[Email Client, IE], [Device, Personal computer]] |
#+-----+-----------------------------------------------------------+
Or you could use a udf to create the map:
from pyspark.sql.types import MapType, StringType

# Pair each column name with the value at the same position and return the
# pairs as a string -> string map.
make_map = f.udf(
    lambda x, y: dict(zip(x, y)),
    MapType(StringType(), StringType())
)
df.select(
    'email',
    make_map(f.col('column_names'), f.col('column_values')).alias('map')
).show(truncate=False)
#+-----+--------------------------------------------------------------+
#|email|map |
#+-----+--------------------------------------------------------------+
#|x |Map(Device -> Device uses Proxy Server, Email Client -> Gmail)|
#|y |Map(Device -> Personal computer, Email Client -> IE) |
#+-----+--------------------------------------------------------------+

Related

How does Spark SQL implement the group by aggregate

How does Spark SQL implement the group by aggregate? I want to group by name field and based on the latest data to get the latest salary. How to write the SQL
The data is:
+-------+------|+---------|
// | name |salary|date |
// +-------+------|+---------|
// |AA | 3000|2022-01 |
// |AA | 4500|2022-02 |
// |BB | 3500|2022-01 |
// |BB | 4000|2022-02 |
// +-------+------+----------|
The expected result is:
+-------+------|
// | name |salary|
// +-------+------|
// |AA | 4500|
// |BB | 4000|
// +-------+------+
Assuming that the dataframe is registered as a temporary view named tmp, first use the row_number window function to assign a row number (rn) within each group (name), ordered by date descending, and then keep only the rows with rn = 1.
# Number the rows of each name by date, newest first, then keep row 1 only
sql = """
select name, salary from
(select *, row_number() over (partition by name order by date desc) as rn
from tmp)
where rn = 1
"""
# Run the query against the `tmp` view and print the full cell contents
df = spark.sql(sql)
df.show(truncate=False)
First convert your string to a date.
Convert the date to a Unix timestamp (a numeric representation of the date, so you can use max).
Use first as an aggregate
function that retrieves a salary for each group. (It takes the first value it sees in the group, so on a date tie it could pull either one; note that it is not tied to the row that holds the maximum date.)
:
from pyspark.sql import functions as F

simpleData = [("James","Sales","NY",90000,34,'2022-02-01'),
    ("Michael","Sales","NY",86000,56,'2022-02-01'),
    ("Robert","Sales","CA",81000,30,'2022-02-01'),
    ("Maria","Finance","CA",90000,24,'2022-02-01'),
    ("Raman","Finance","CA",99000,40,'2022-03-01'),
    ("Scott","Finance","NY",83000,36,'2022-04-01'),
    ("Jen","Finance","NY",79000,53,'2022-04-01'),
    ("Jeff","Marketing","CA",80000,25,'2022-04-01'),
    ("Kumar","Marketing","NY",91000,50,'2022-05-01')
]
schema = ["employee_name","name","state","salary","age","updated"]
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

# The chain is parenthesised so it can legally span several lines, and the
# Spark functions are taken from the F namespace so that max() is Spark's
# aggregate rather than Python's builtin.
(
    df.withColumn(
        "dateUpdated",
        # string -> date -> seconds since the epoch, so the dates are numeric
        F.unix_timestamp(F.to_date(F.col("updated"), "yyyy-MM-dd")),
    )
    .groupBy("name")
    .agg(
        F.max("dateUpdated"),
        # NOTE: first() takes a salary from some row of the group; it is not
        # linked to the row that carries the maximum date.
        F.first("salary").alias("Salary"),
    )
    .show()
)
+---------+----------------+------+
| name|max(dateUpdated)|Salary|
+---------+----------------+------+
| Sales| 1643691600| 90000|
| Finance| 1648785600| 90000|
|Marketing| 1651377600| 80000|
+---------+----------------+------+
My usual trick is to "zip" date and salary together (the order of the two depends on which one you want to sort by first)
from pyspark.sql import functions as F
# Pack date and salary into one array per row and take the per-name maximum;
# the date comes first in the array, and the output below shows the pair with
# the latest date being kept. Index 1 of that pair is the salary.
# NOTE(review): array() needs one common element type, so the salary is
# presumably compared and returned as a string here - confirm.
(df
.groupBy('name')
.agg(F.max(F.array('date', 'salary')).alias('max_date_salary'))
.withColumn('max_salary', F.col('max_date_salary')[1])
.show()
)
+----+---------------+----------+
|name|max_date_salary|max_salary|
+----+---------------+----------+
| AA|[2022-02, 4500]| 4500|
| BB|[2022-02, 4000]| 4000|
+----+---------------+----------+

How to filter text after some stop word?

I have a text. From each line I want to filter everything after some stop word. For example :
stop_words=['with','is', '/']
One of the rows is:
senior manager with experience
I want to remove everything after with (including with) so the output is:
senior manager
I have big-data and am working with Spark in Python.
You can find the location of the stop words using instr, and get a substring up to that location.
import pyspark.sql.functions as F
# Words at which each line should be cut off
stop_words = ['with', 'is', '/']
# One-column sample frame; toDF('col') names that column `col`
df = spark.createDataFrame([
['senior manager with experience'],
['is good'],
['xxx//'],
['other text']
]).toDF('col')
df.show(truncate=False)
+------------------------------+
|col |
+------------------------------+
|senior manager with experience|
|is good |
|xxx // |
|other text |
+------------------------------+
# Position of each stop word inside `col`; the original treats 0 as "not found"
hit_positions = [F.instr('col', word) for word in stop_words]
# Smallest position among the words that were found (misses become null)
earliest_hit = F.least(*[F.when(pos != 0, pos) for pos in hit_positions])
# If no stop word was found, cut just past the end so the whole string is kept
cut_at = F.coalesce(earliest_hit, F.length('col') + 1)
df2 = df.withColumn('idx', cut_at).selectExpr('trim(substring(col, 1, idx-1)) col')
df2.show()
+--------------+
| col|
+--------------+
|senior manager|
| |
| xxx|
| other text|
+--------------+
You can use udf and get index of first occurrence of stop word in col, then again using one more udf, you can substring col message.
import org.apache.spark.sql.functions.udf

// Stop words to cut at; the snippet used this name without defining it.
val stop_words = List("with", "is", "/")

val df = List("senior manager with experience", "is good", "xxx//", "other text").toDF("col")

// Index of the earliest stop word in the value, or the value's length when none occurs.
val index_udf = udf((col_value: String) => {
  val result = for (elem <- stop_words; if col_value.contains(elem)) yield col_value.indexOf(elem)
  if (result.isEmpty) col_value.length else result.min
})

// Keep everything before the given index.
val substr_udf = udf((elem: String, index: Int) => elem.substring(0, index))

// Compute the cut index, cut the text, and expose the result under the original column name.
val df3 = df.withColumn("index", index_udf($"col")).withColumn("substr_message", substr_udf($"col", $"index")).select($"substr_message").withColumnRenamed("substr_message", "col")
df3.show()
+---------------+
| col|
+---------------+
|senior manager |
| |
| xxx|
| other text|
+---------------+

Trouble spliting a column into more columns on Pyspark

I'm having trouble splitting a dataframe's column into more columns in PySpark:
I have a list of lists and I want to transform it into a dataframe, each value in one column.
What I have tried:
I created a dataframe from this list:
[['COL-4560', 'COL-9655', 'NWG-0610', 'D81-3754'],
['DLL-7760', 'NAT-9885', 'PED-0550', 'MAR-0004', 'LLL-5554']]
Using this code:
from pyspark.sql import Row
# Row factory with two fields: col1 (the row id) and col2 (the list of codes)
R = Row('col1', 'col2')
# use enumerate to add the ID column
df_from_list = spark.createDataFrame([R(i, x) for i, x in enumerate(recs_list)])
The result I got is:
+----+--------------------+
|col1| col2|
+----+--------------------+
| 0|[COL-4560, COL-96...|
| 1|[DLL-7760, NAT-98...|
+----+--------------------+
I want to separate the values by comma into columns, so I tried:
from pyspark.sql import functions as F
# NOTE: this is the statement that raises the AnalysisException quoted below:
# split() requires a string column, but col2 is already an array<string>.
df2 = df_from_list.select('col1', F.split('col2', ', ').alias('col2'))
# If you don't know the number of columns:
# take the size of every array, then the largest size over all rows
df_sizes = df2.select(F.size('col2').alias('col2'))
df_max = df_sizes.agg(F.max('col2'))
nb_columns = df_max.collect()[0][0]
# one output column per array position, up to the largest size
df_result = df2.select('col1', *[df2['col2'][i] for i in range(nb_columns)])
df_result.show()
But I get an error on this line df2 = df_from_list.select('col1', F.split('col2', ', ').alias('col2')):
AnalysisException: cannot resolve 'split(`col2`, ', ', -1)' due to data type mismatch: argument 1 requires string type, however, '`col2`' is of array<string> type.;;
My ideal final output would be like this:
+----------+----------+----------+----------+----------+
| SKU | REC_01 | REC_02 | REC_03 | REC_04 |
+----------+----------+----------+----------+----------+
| COL-4560 | COL-9655 | NWG-0610 | D81-3754 | null |
| DLL-7760 | NAT-9885 | PED-0550 | MAR-0004 | LLL-5554 |
+---------------------+----------+----------+----------+
Some rows may have four values, but some may have more or fewer, so I don't know the exact number of columns the final dataframe will have.
Does anyone have any idea of what is happening? Thank you very much in advance.
The col2 column of df_from_list is already an array type, so there is no need to split it (split works on string columns, and here we have an array column).
Here are the steps that will work for you.
from pyspark.sql import Row
from pyspark.sql import functions as F

recs_list = [['COL-4560', 'COL-9655', 'NWG-0610', 'D81-3754'],
             ['DLL-7760', 'NAT-9885', 'PED-0550', 'MAR-0004', 'LLL-5554']]
R = Row('col1', 'col2')
# use enumerate to add the ID column
df_from_list = spark.createDataFrame([R(i, x) for i, x in enumerate(recs_list)])
# col2 is already an array column, so no split() is needed
df2 = df_from_list
# If you don't know the number of columns:
# take the size of every array, then the largest size over all rows
df_sizes = df2.select(F.size('col2').alias('col2'))
df_max = df_sizes.agg(F.max('col2'))
nb_columns = df_max.collect()[0][0]
# Build the header from the computed width instead of hard-coding five names,
# so toDF() always receives exactly as many names as there are columns:
# SKU, REC_01, REC_02, ...
cols = (['SKU'] + ['REC_{:02d}'.format(i) for i in range(1, nb_columns)])[:nb_columns]
# One output column per array position; shorter rows yield null (see output)
df_result = df2.select(*[df2['col2'][i] for i in range(nb_columns)]).toDF(*cols)
df_result.show()
#+--------+--------+--------+--------+--------+
#| SKU| REC_01| REC_02| REC_03| REC_04|
#+--------+--------+--------+--------+--------+
#|COL-4560|COL-9655|NWG-0610|D81-3754| null|
#|DLL-7760|NAT-9885|PED-0550|MAR-0004|LLL-5554|
#+--------+--------+--------+--------+--------+

Printing a list of dictionaries as a table

How can I format the below data into tabular form using Python ?
Is there any way to print/write the data as per the expected format ?
[{"itemcode":null,"productname":"PKS543452","value_2018":null},
{"itemcode":null,"productname":"JHBG6%&9","value_2018":null},
{"itemcode":null,"productname":"VATER3456","value_2018":null},
{"itemcode":null,"productname":"ACDFER3434","value_2018":null}]
Expected output:
|itemcode | Productname | Value_2018 |
|null |PKS543452|null|
|null |JHBG6%&9|null|
|null |VATER3456|null|
|null |ACDFER3434|null|
You can use pandas to generate a dataframe from the list of dictionaries:
import pandas as pd

# The pasted data uses the bare name `null`; bind it to a string so it evaluates
null = "null"
product_names = ("PKS543452", "JHBG6%&9", "VATER3456", "ACDFER3434")
# Same records as the pasted list: only the product name differs per row
lst = [
    {"itemcode": null, "productname": name, "value_2018": null}
    for name in product_names
]
# A list of dicts maps directly onto rows, with the dict keys as columns
df = pd.DataFrame(lst)
print(df)
Output:
itemcode productname value_2018
0 null PKS543452 null
1 null JHBG6%&9 null
2 null VATER3456 null
3 null ACDFER3434 null
This makes it easy to manipulate data in the table later on. Otherwise, you can print your desired output using built-in string methods:
# Header row built from the keys of the first record
col_names = '|' + ' | '.join(lst[0].keys()) + '|'
print(col_names)
# One line per record, using the same separators as the header
for dic in lst:
    row = '|' + ' | '.join(dic.values()) + '|'
    print(row)
Output:
|itemcode | productname | value_2018|
|null | PKS543452 | null|
|null | JHBG6%&9 | null|
|null | VATER3456 | null|
|null | ACDFER3434 | null|
You can try like this as well (without using pandas). I have commented each and every line in code itself so don't forget to read them.
Note: The list/array that you have pasted is either the result of json.dumps() (in Python, a text) or a copied API response (JSON).
null is from JavaScript and the pasted list/array is not a valid Python list but it can be considered as text and converted back to Python list using json.loads(). In this case, null will be converted to None.
And that's why to form the wanted o/p we need another check like "null" if d[key] is None else d[key].
import json

# `null` is JSON (JavaScript) syntax, not Python, so the data is kept as text
json_text = """[{"itemcode":null,"productname":"PKS543452","value_2018":null},
{"itemcode":null,"productname":"JHBG6%&9","value_2018":null},
{"itemcode":null,"productname":"VATER3456","value_2018":null},
{"itemcode":null,"productname":"ACDFER3434","value_2018":null}]"""

# Parse the text into a list of dicts; JSON `null` becomes Python `None`
records = json.loads(json_text)

# Take the column order from the first record and reuse it for every row, so
# each value lands under its own header even if a later dict orders its keys
# differently. Hard-code `keys` instead if a fixed order is wanted.
keys = list(records[0].keys())

# Header line
header = '|' + ' | '.join(keys) + " |"

# Body lines: None is rendered as "null"; str() lets non-string values
# (numbers, booleans) be joined as well instead of raising a TypeError.
rows = [
    '| ' + "|".join("null" if d[key] is None else str(d[key]) for key in keys) + "|"
    for d in records
]

# Will contain the rows (text), header first
texts = [header] + rows

# Print all rows (including header) separated by newline '\n'
answer = '\n'.join(texts)
print(answer)
Output
|itemcode | productname | value_2018 |
| null|PKS543452|null|
| null|JHBG6%&9|null|
| null|VATER3456|null|
| null|ACDFER3434|null|

Pyspark DataFrame: find difference between two DataFrames (values and column names)

I have 100+ columns in total in my dataframe.
I am trying to compare two data frames and find the unmatched records, together with the name of the column that differs.
The code below gives me the expected output, but when I run it for 100+ columns the job gets aborted.
I am doing this to find errors in an SCD Type 2 delta process.
from pyspark.sql.types import *
from pyspark.sql.functions import *
# First sample frame
d2 = sc.parallelize([("A1", 500,1005) ,("A2", 700,10007)])
dataFrame1 = sqlContext.createDataFrame(d2, ["ID", "VALUE1", "VALUE2"])
# Second sample frame: identical except VALUE1 of A1 (600 instead of 500)
d2 = sc.parallelize([("A1", 600,1005),("A2", 700,10007)])
dataFrame2 = sqlContext.createDataFrame(d2, ["ID", "VALUE1", "VALUE2"])
# Key column and the single key value to compare
key_id_col_name="ID"
key_id_value="A1"
# Single-column check: (ID, VALUE1) rows of dataFrame1 that are absent from dataFrame2
dataFrame1.select("ID","VALUE1").subtract(dataFrame2.select("ID",col("VALUE1").alias("value"))).show()
def unequalColumnValuesTwoDF(dataFrame1,dataFrame2,key_id_col_name,key_id_value):
    """Return the values of one key in dataFrame1 that are absent from dataFrame2.

    Both frames are filtered to the rows whose ``key_id_col_name`` equals
    ``key_id_value``. For every other column present in both frames, the
    (key, value) rows of ``dataFrame1`` that do not appear in ``dataFrame2``
    are collected and tagged with the column name.

    Args:
        dataFrame1: Frame whose values are reported.
        dataFrame2: Frame to compare against.
        key_id_col_name: Name of the key column present in both frames.
        key_id_value: Key value to restrict the comparison to.

    Returns:
        A DataFrame with the columns KEY_ID, VALUE and COL_NAME.

    Raises:
        ValueError: If the frames share no column other than the key column.
    """
    from functools import reduce

    dataFrame1 = dataFrame1.where(dataFrame1[key_id_col_name] == key_id_value)
    dataFrame2 = dataFrame2.where(dataFrame2[key_id_col_name] == key_id_value)

    # Shared non-key columns, kept in dataFrame1's order so the result is
    # deterministic (a set intersection has no defined order).
    shared = set(dataFrame2.columns)
    col_names = [
        name for name in dataFrame1.columns
        if name in shared and name != key_id_col_name
    ]
    if not col_names:
        # Previously this fell through to an unbound local variable.
        raise ValueError(
            "no columns other than %r are shared by the two DataFrames"
            % (key_id_col_name,)
        )

    def _key_value_pairs(frame, col_name):
        # Project a frame onto (KEY_ID, VALUE) for a single column.
        return frame.select(
            col(key_id_col_name).alias("KEY_ID"),
            col(col_name).alias("VALUE"),
        )

    def _column_diff(col_name):
        # Rows of dataFrame1 missing from dataFrame2 for one column, tagged
        # with that column's name.
        return (
            _key_value_pairs(dataFrame1, col_name)
            .subtract(_key_value_pairs(dataFrame2, col_name))
            .withColumn("COL_NAME", lit(col_name))
        )

    # Stack the per-column differences into a single frame.
    return reduce(
        lambda stacked, nxt: stacked.union(nxt),
        (_column_diff(name) for name in col_names),
    )
# Compare the two sample frames for the selected key and display the differences
res_df = unequalColumnValuesTwoDF(dataFrame1,dataFrame2,key_id_col_name,key_id_value)
res_df.show()
>>> dataFrame1.show()
+---+------+------+
| ID|VALUE1|VALUE2|
+---+------+------+
| A1| 500| 1005|
| A2| 700| 10007|
+---+------+------+
>>> dataFrame2.show()
+---+------+------+
| ID|VALUE1|VALUE2|
+---+------+------+
| A1| 600| 1005|
| A2| 700| 10007|
+---+------+------+
>>> res_df.show()
+------+-----+--------+
|KEY_ID|VALUE|COL_NAME|
+------+-----+--------+
| A1| 500| VALUE1|
+------+-----+--------+
Please suggest any other way.
Here is another approach:
Join the two DataFrames using the ID column.
Then for each row, create a new column which contains the columns for which there is a difference.
Create this new column as a key-value pair map using pyspark.sql.functions.create_map().1
The key for the map will be the column name.
Using pyspark.sql.functions.when(), set the value to the corresponding value in dataFrame1 (as it seems like that is what you want from your example) if there is a difference between the two DataFrames. Otherwise, we set the value to None.
Use pyspark.sql.functions.explode() on the map column, and filter out any rows where the difference is null (that is, keep only the non-null differences) using pyspark.sql.functions.isnull().
Select the columns you want and rename using alias().
Example:
# reduce() is not a builtin on Python 3, so it has to be imported
from functools import reduce

import pyspark.sql.functions as f

# Every column except the join key is compared
columns = [c for c in dataFrame1.columns if c != 'ID']
dataFrame1.alias('r').join(dataFrame2.alias('l'), on='ID')\
    .withColumn(
        'diffs',
        # Map of column name -> dataFrame1 value where the frames differ, else null
        f.create_map(
            *reduce(
                list.__add__,
                [
                    [
                        f.lit(c),
                        f.when(
                            f.col('r.'+c) != f.col('l.'+c),
                            f.col('r.'+c)
                        ).otherwise(None)
                    ]
                    for c in columns
                ]
            )
        )
    )\
    .select([f.col('ID'), f.explode('diffs')])\
    .where(~f.isnull(f.col('value')))\
    .select(
        f.col('ID').alias('KEY_ID'),
        f.col('value').alias('VALUE'),
        f.col('key').alias('COL_NAME')
    )\
    .show(truncate=False)
#+------+-----+--------+
#|KEY_ID|VALUE|COL_NAME|
#+------+-----+--------+
#|A1 |500 |VALUE1 |
#+------+-----+--------+
Notes
1 The syntax *reduce(list.__add__, [[f.lit(c), ...] for c in columns]) as the argument to create_map() is some python-fu that helps create the map dynamically.
create_map() expects an even number of arguments- it assumes that the first argument in every pair is the key and the second is the value. In order to put the arguments in that order, the list comprehension yields a list for each iteration. We reduce this list of lists into a flat list using list.__add__.
Finally the * operator is used to unpack the list.
Here is the intermediate output, which may make the logic clearer:
# reduce() is not a builtin on Python 3, so it has to be imported
from functools import reduce

# Same join and map construction, stopped before explode() to show the map itself
dataFrame1.alias('r').join(dataFrame2.alias('l'), on='ID')\
    .withColumn(
        'diffs',
        f.create_map(
            *reduce(
                list.__add__,
                [
                    [
                        f.lit(c),
                        f.when(
                            f.col('r.'+c) != f.col('l.'+c),
                            f.col('r.'+c)
                        ).otherwise(None)
                    ]
                    for c in columns
                ]
            )
        )
    )\
    .select('ID', 'diffs').show(truncate=False)
#+---+-----------------------------------+
#|ID |diffs |
#+---+-----------------------------------+
#|A2 |Map(VALUE1 -> null, VALUE2 -> null)|
#|A1 |Map(VALUE1 -> 500, VALUE2 -> null) |
#+---+-----------------------------------+

Resources