I have a Spark dataframe that looks something like this:
columns = ["object_type", "object_name"]
data = [("galaxy", "andromeda,milky way,condor,andromeda"),
("planet", "mars,jupiter,venus,mars,saturn,venus,earth,mars,venus,earth"),
("star", "mira,sun,altair,sun,sirius,rigel,mira,sirius,aldebaran"),
("natural satellites", "moon,io,io,elara,moon,kale,titan,kale,phobos,titan,europa")]
init_df = spark.createDataFrame(data).toDF(*columns)
init_df.show(truncate = False)
+------------------+-----------------------------------------------------------+
|object_type |object_name |
+------------------+-----------------------------------------------------------+
|galaxy |andromeda,milky way,condor,andromeda |
|planet |mars,jupiter,venus,mars,saturn,venus,earth,mars,venus,earth|
|star |mira,sun,altair,sun,sirius,rigel,mira,sirius,aldebaran |
|natural satellites|moon,io,io,elara,moon,kale,titan,kale,phobos,titan,europa |
+------------------+-----------------------------------------------------------+
I need to create a new column with the most frequent words from the object_name column using PySpark.
Conditions:
if there is one dominant word in the row (mode = 1), then choose this word as the most frequent (like "andromeda" in the first row)
if there are two dominant words in the row that occur an equal number of times (mode = 2), then select both of these words (like "mars" and "venus" in the second row - they each occur 3 times, while the rest of the words are less common)
if there are three dominant words in the row that occur an equal number of times, then pick all three of these words (like "mira", "sun" and "sirius", which each occur 2 times, while the rest of the words occur only once)
if there are four or more dominant words in the row that occur an equal number of times (like in the fourth row), then set the "many objects" flag.
Expected output:
+------------------+-----------------------------------------------------------+---------------+
|object_type       |object_name                                                |most_frequent  |
+------------------+-----------------------------------------------------------+---------------+
|galaxy            |andromeda,milky way,condor,andromeda                       |andromeda      |
|planet            |mars,jupiter,venus,mars,saturn,venus,earth,mars,venus,earth|mars,venus     |
|star              |mira,sun,altair,sun,sirius,rigel,mira,sirius,aldebaran     |mira,sun,sirius|
|natural satellites|moon,io,io,elara,moon,kale,titan,kale,phobos,titan,europa  |many objects   |
+------------------+-----------------------------------------------------------+---------------+
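To make the rule concrete, here is the per-row logic in plain Python (just an illustration of what I want, not a Spark solution; the function name is mine):
from collections import Counter

def most_frequent_label(object_name):
    # object_name: the comma-separated string from a single row
    counts = Counter(object_name.split(","))
    top = max(counts.values())
    winners = [name for name, cnt in counts.items() if cnt == top]
    return "many objects" if len(winners) >= 4 else ",".join(winners)

print(most_frequent_label("andromeda,milky way,condor,andromeda"))  # andromeda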
I'll be very grateful for any advice!
You can try this,
from pyspark.sql import functions as F

res_df = init_df.withColumn("list_obj", F.split(F.col("object_name"),",")) \
.withColumn("most_frequent", F.udf(lambda x: ', '.join([mitem[1] for mitem in zip((x.count(item) for item in set(x)),set(x)) if mitem[0] == max((x.count(item) for item in set(x)))]))(F.col("list_obj"))) \
.drop("list_obj")
res_df.show(truncate=False)
+------------------+-----------------------------------------------------------+---------------------+
|object_type |object_name |most_frequent |
+------------------+-----------------------------------------------------------+---------------------+
|galaxy |andromeda,milky way,condor,andromeda |andromeda |
|planet |mars,jupiter,venus,mars,saturn,venus,earth,mars,venus,earth|venus, mars |
|star |mira,sun,altair,sun,sirius,rigel,mira,sirius,aldebaran |sirius, mira, sun |
|natural satellites|moon,io,io,elara,moon,kale,titan,kale,phobos,titan,europa |moon, kale, titan, io|
+------------------+-----------------------------------------------------------+---------------------+
EDIT:
According to OP's suggestion, we can achieve their desired output by doing something like this,
from pyspark.sql.types import *
res_df = init_df.withColumn("list_obj", F.split(F.col("object_name"),",")) \
.withColumn("most_frequent", F.udf(lambda x: [mitem[1] for mitem in zip((x.count(item) for item in set(x)),set(x)) if mitem[0] == max((x.count(item) for item in set(x)))], ArrayType(StringType()))(F.col("list_obj"))) \
.withColumn("most_frequent", F.when(F.size(F.col("most_frequent")) >= 4, F.lit("many objects")).otherwise(F.concat_ws(", ", F.col("most_frequent")))) \
.drop("list_obj")
res_df.show(truncate=False)
+------------------+-----------------------------------------------------------+-----------------+
|object_type |object_name |most_frequent |
+------------------+-----------------------------------------------------------+-----------------+
|galaxy |andromeda,milky way,condor,andromeda |andromeda |
|planet |mars,jupiter,venus,mars,saturn,venus,earth,mars,venus,earth|venus, mars |
|star |mira,sun,altair,sun,sirius,rigel,mira,sirius,aldebaran |sirius, mira, sun|
|natural satellites|moon,io,io,elara,moon,kale,titan,kale,phobos,titan,europa |many objects |
+------------------+-----------------------------------------------------------+-----------------+
Try this:
from pyspark.sql import functions as psf
from pyspark.sql.window import Window
columns = ["object_type", "object_name"]
data = [("galaxy", "andromeda,milky way,condor,andromeda"),
("planet", "mars,jupiter,venus,mars,saturn,venus,earth,mars,venus,earth"),
("star", "mira,sun,altair,sun,sirius,rigel,mira,sirius,aldebaran"),
("natural satellites", "moon,io,io,elara,moon,kale,titan,kale,phobos,titan,europa")]
init_df = spark.createDataFrame(data).toDF(*columns)
# unpivot the object name and count
df_exp = init_df.withColumn('object_name_exp', psf.explode(psf.split('object_name',',')))
df_counts = df_exp.groupBy('object_type', 'object_name_exp').count()
window_spec = Window.partitionBy('object_type').orderBy(psf.col('count').desc())
df_ranked = df_counts.withColumn('rank', psf.dense_rank().over(window_spec))
# rank the counts, keeping the top ranked object names
df_top_ranked = df_ranked.filter(psf.col('rank')==psf.lit(1)).drop('count')
# count the number of top ranked object names
df_top_counts = df_top_ranked.groupBy('object_type', 'rank').count()
# join these back to the original object names
df_with_counts = df_top_ranked.join(df_top_counts, on='object_type', how='inner')
# implement the rules whether to retain the reference to the object name or state 'many objects'
df_most_freq = df_with_counts.withColumn('most_frequent'
, psf.when(psf.col('count')<=psf.lit(3), psf.col('object_name_exp')).otherwise(psf.lit('many objects'))
)
# collect the object names retained back into an array and de-duplicate them
df_results = df_most_freq.groupBy('object_type').agg(psf.array_distinct(psf.collect_list('most_frequent')).alias('most_frequent'))
# show output
df_results.show()
+------------------+-------------------+
| object_type| most_frequent|
+------------------+-------------------+
| galaxy| [andromeda]|
|natural satellites| [many objects]|
| planet| [mars, venus]|
| star|[sirius, mira, sun]|
+------------------+-------------------+
I have a PySpark data frame with a string column(URL) and all records look in the following way
ID URL
1 https://app.xyz.com/inboxes/136636/conversations/2686735685
2 https://app.xyz.com/inboxes/136636/conversations/2938415796
3 https://app.drift.com/inboxes/136636/conversations/2938419189
I want to basically extract the number after conversations/ from URL column using regex into another column.
I tried the following code but it doesn't give me any results.
df1 = df.withColumn('CONV_ID', split(convo_influ_new['URL'], '(?<=conversations/).*').getItem(0))
Expected:
ID URL CONV_ID
1 https://app.xyz.com/inboxes/136636/conversations/2686735685 2686735685
2 https://app.xyz.com/inboxes/136636/conversations/2938415796 2938415796
3 https://app.drift.com/inboxes/136636/conversations/2938419189 2938419189
Result:
ID URL CONV_ID
1 https://app.xyz.com/inboxes/136636/conversations/2686735685 https://app.xyz.com/inboxes/136636/conversations/2686735685
2 https://app.xyz.com/inboxes/136636/conversations/2938415796 https://app.xyz.com/inboxes/136636/conversations/2938415796
3 https://app.drift.com/inboxes/136636/conversations/2938419189 https://app.drift.com/inboxes/136636/conversations/2938419189
Not sure what's happening here. I tried the regex in different online regex tester tools and it highlights the part I want, but it never works in PySpark. I tried different PySpark functions like f.split, regexp_extract and regexp_replace, but none of them work.
If your URLs always have that form, you can actually just use substring_index to get the last path element:
import pyspark.sql.functions as F
df1 = df.withColumn("CONV_ID", F.substring_index("URL", "/", -1))
df1.show(truncate=False)
#+---+-------------------------------------------------------------+----------+
#|ID |URL |CONV_ID |
#+---+-------------------------------------------------------------+----------+
#|1 |https://app.xyz.com/inboxes/136636/conversations/2686735685 |2686735685|
#|2 |https://app.xyz.com/inboxes/136636/conversations/2938415796 |2938415796|
#|3 |https://app.drift.com/inboxes/136636/conversations/2938419189|2938419189|
#+---+-------------------------------------------------------------+----------+
You can use regexp_extract instead:
import pyspark.sql.functions as F
df1 = df.withColumn(
'CONV_ID',
F.regexp_extract('URL', 'conversations/(.*)', 1)
)
df1.show()
+---+--------------------+----------+
| ID| URL| CONV_ID|
+---+--------------------+----------+
| 1|https://app.xyz.c...|2686735685|
| 2|https://app.xyz.c...|2938415796|
| 3|https://app.drift...|2938419189|
+---+--------------------+----------+
Or if you want to use split, you don't need to specify .*. You just need to specify the pattern used for splitting.
import pyspark.sql.functions as F
df1 = df.withColumn(
'CONV_ID',
F.split('URL', '(?<=conversations/)')[1] # just using 'conversations/' should also be enough
)
df1.show()
+---+--------------------+----------+
| ID| URL| CONV_ID|
+---+--------------------+----------+
| 1|https://app.xyz.c...|2686735685|
| 2|https://app.xyz.c...|2938415796|
| 3|https://app.drift...|2938419189|
+---+--------------------+----------+
I have a Dataframe with 20 columns and I want to update one particular column (whose data is null) with the data extracted from another column and do some formatting. Below is a sample input
+------------------------+----+
|col1 |col2|
+------------------------+----+
|This_is_111_222_333_test|NULL|
|This_is_111_222_444_test|3296|
|This_is_555_and_666_test|NULL|
|This_is_999_test |NULL|
+------------------------+----+
and my output should be like below
+------------------------+-----------+
|col1 |col2 |
+------------------------+-----------+
|This_is_111_222_333_test|111,222,333|
|This_is_111_222_444_test|3296 |
|This_is_555_and_666_test|555,666 |
|This_is_999_test |999 |
+------------------------+-----------+
Here is the code I have tried; it works only when the numbers are contiguous. Could you please help me with a solution?
df.withColumn("col2",when($"col2".isNull,regexp_replace(regexp_replace(regexp_extract($"col1","([0-9]+_)+",0),"_",","),".$","")).otherwise($"col2")).show(false)
I can do this by creating a UDF, but I am wondering whether it is possible with Spark built-in functions. My Spark version is 2.2.0.
Thank you in advance.
A UDF is a good choice here. Performance is similar to that of the withColumn approach given in the OP (see benchmark below), and it works even if the numbers are not contiguous, which is one of the issues mentioned in the OP.
import org.apache.spark.sql.functions.udf
import scala.util.Try
def getNums = (c: String) => {
c.split("_").map(n => Try(n.toInt).getOrElse(0)).filter(_ > 0)
}
I recreated your data as follows
val data = Seq(("This_is_111_222_333_test", null.asInstanceOf[Array[Int]]),
("This_is_111_222_444_test",Array(3296)),
("This_is_555_666_test",null.asInstanceOf[Array[Int]]),
("This_is_999_test",null.asInstanceOf[Array[Int]]))
.toDF("col1","col2")
data.createOrReplaceTempView("data")
Register the UDF and run it in a query
spark.udf.register("getNums",getNums)
spark.sql("""select col1,
case when size(col2) > 0 then col2 else getNums(col1) end new_col
from data""").show
Which returns
+--------------------+---------------+
| col1| new_col|
+--------------------+---------------+
|This_is_111_222_3...|[111, 222, 333]|
|This_is_111_222_4...| [3296]|
|This_is_555_666_test| [555, 666]|
| This_is_999_test| [999]|
+--------------------+---------------+
Performance was tested with a larger data set created as follows:
val bigData = (0 to 1000).map(_ => data union data).reduce( _ union _)
bigData.createOrReplaceTempView("big_data")
With that, the solution given in the OP was compared to the UDF solution and found to be about the same.
// With UDF
spark.sql("""select col1,
case when size(col2) > 0 then col2 else getNums(col1) end new_col
from big_data""").count
/// OP solution:
bigData.withColumn("col2",when($"col2".isNull,regexp_replace(regexp_replace(regexp_extract($"col1","([0-9]+_)+",0),"_",","),".$","")).otherwise($"col2")).count
Here is another way; please check the performance. Note that this uses a Spark SQL higher-order function (filter with a lambda) and array_join, which require Spark 2.4 or later.
import org.apache.spark.sql.functions.expr

df.withColumn("col2", expr("coalesce(col2, array_join(filter(split(col1, '_'), x -> CAST(x as INT) IS NOT NULL), ','))"))
  .show(false)
+------------------------+-----------+
|col1 |col2 |
+------------------------+-----------+
|This_is_111_222_333_test|111,222,333|
|This_is_111_222_444_test|3296 |
|This_is_555_666_test |555,666 |
|This_is_999_test |999 |
+------------------------+-----------+
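If you are on PySpark rather than Scala, the same SQL expression can be passed to expr there as well. A minimal sketch, assuming a DataFrame df with the same col1/col2 columns and Spark 2.4+:
from pyspark.sql.functions import expr

df = df.withColumn(
    "col2",
    expr("coalesce(col2, array_join(filter(split(col1, '_'), x -> CAST(x AS INT) IS NOT NULL), ','))")
)
df.show(truncate=False)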
How can I format the below data into tabular form using Python?
Is there any way to print/write the data in the expected format?
[{"itemcode":null,"productname":"PKS543452","value_2018":null},
{"itemcode":null,"productname":"JHBG6%&9","value_2018":null},
{"itemcode":null,"productname":"VATER3456","value_2018":null},
{"itemcode":null,"productname":"ACDFER3434","value_2018":null}]
Expected output:
|itemcode | Productname | Value_2018 |
|null |PKS543452|null|
|null |JHBG6%&9|null|
|null |VATER3456|null|
|null |ACDFER3434|null|
You can use pandas to generate a dataframe from the list of dictionaries:
import pandas as pd
null = "null"
lst = [{"itemcode":null,"productname":"PKS543452","value_2018":null},
{"itemcode":null,"productname":"JHBG6%&9","value_2018":null},
{"itemcode":null,"productname":"VATER3456","value_2018":null},
{"itemcode":null,"productname":"ACDFER3434","value_2018":null}]
df = pd.DataFrame.from_dict(lst)
print(df)
Output:
itemcode productname value_2018
0 null PKS543452 null
1 null JHBG6%&9 null
2 null VATER3456 null
3 null ACDFER3434 null
This makes it easy to manipulate the data in the table later on. Otherwise, you can print your desired output using built-in string methods:
col_names = '|' + ' | '.join(lst[0].keys()) + '|'
print(col_names)
for dic in lst:
row = '|' + ' | '.join(dic.values()) + '|'
print(row)
Output:
|itemcode | productname | value_2018|
|null | PKS543452 | null|
|null | JHBG6%&9 | null|
|null | VATER3456 | null|
|null | ACDFER3434 | null|
You can try it like this as well (without using pandas). I have commented every line in the code itself, so don't forget to read them.
Note: the list/array that you have pasted is either the result of json.dumps() (in Python, a string) or a copied API response (JSON).
null comes from JavaScript, and the pasted list/array is not a valid Python list, but it can be treated as text and converted back to a Python list using json.loads(). In this case, null is converted to None.
That is why, to produce the wanted output, we need an extra check like "null" if d[key] is None else d[key].
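For instance, here is a quick standalone illustration of that null to None conversion (made-up key, just for demonstration):
import json

print(json.loads('{"itemcode": null}'))  # {'itemcode': None}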
import json
# `null` is used in JavaScript (JSON comes from JavaScript), so I kept the pasted data as a string
json_text = """[{"itemcode":null,"productname":"PKS543452","value_2018":null},
{"itemcode":null,"productname":"JHBG6%&9","value_2018":null},
{"itemcode":null,"productname":"VATER3456","value_2018":null},
{"itemcode":null,"productname":"ACDFER3434","value_2018":null}]"""
# Will contain the rows (text)
texts = []
# Converting to original list object, `null`(JavaScript) will transform to `None`(Python)
l = json.loads(json_text)
# Obtain keys (note that, in older Python versions, dictionaries are unordered)
# So it is important to get the keys once for consistent iteration over all dictionaries in the list
# A column might then end up in a different position, but the related data will still line up with its header
# If you wish you can hard-code the `keys`; here I am getting them using `l[0].keys()`
keys = l[0].keys()
# Form header and add to `texts` list
header = '|' + ' | '.join(keys) + " |"
texts.append(header)
# Form body (rows) and append to `texts` list
rows = ['| ' + "|".join(["null" if d[key] is None else d[key] for key in keys]) + "|" for d in l]
texts.extend(rows)
# Print all rows (including header) separated by newline '\n'
answer = '\n'.join(texts)
print(answer)
Output
|itemcode | productname | value_2018 |
| null|PKS543452|null|
| null|JHBG6%&9|null|
| null|VATER3456|null|
| null|ACDFER3434|null|
I have 100+ columns in total in my dataframe.
I am trying to compare two data frames and find the unmatched records along with the column name.
I get the output below with this code, but when I run it for 100+ columns the job gets aborted.
I am doing this to find errors in an SCD Type 2 delta process.
from pyspark.sql.types import *
from pyspark.sql.functions import *
d2 = sc.parallelize([("A1", 500,1005) ,("A2", 700,10007)])
dataFrame1 = sqlContext.createDataFrame(d2, ["ID", "VALUE1", "VALUE2"])
d2 = sc.parallelize([("A1", 600,1005),("A2", 700,10007)])
dataFrame2 = sqlContext.createDataFrame(d2, ["ID", "VALUE1", "VALUE2"])
key_id_col_name="ID"
key_id_value="A1"
dataFrame1.select("ID","VALUE1").subtract(dataFrame2.select("ID",col("VALUE1").alias("value"))).show()
def unequalColumnValuesTwoDF(dataFrame1,dataFrame2,key_id_col_name,key_id_value):
chk_fst=True
dataFrame1 = dataFrame1.where(dataFrame1[key_id_col_name] == key_id_value)
dataFrame2 = dataFrame2.where(dataFrame2[key_id_col_name] == key_id_value)
col_names = list(set(dataFrame1.columns).intersection(dataFrame2.columns))
col_names.remove(key_id_col_name)
for col_name in col_names:
if chk_fst == True:
df_tmp = dataFrame1.select(col(key_id_col_name).alias("KEY_ID"),col(col_name).alias("VALUE")).subtract(dataFrame2.select(col(key_id_col_name).alias("KEY_ID"),col(col_name).alias("VALUE"))).withColumn("COL_NAME",lit(col_name))
chk_fst = False
else:
df_tmp = df_tmp.unionAll(dataFrame1.select(col(key_id_col_name).alias("KEY_ID"),col(col_name).alias("VALUE")).subtract(dataFrame2.select(col(key_id_col_name).alias("KEY_ID"),col(col_name).alias("VALUE"))).withColumn("COL_NAME",lit(col_name)))
return df_tmp
res_df = unequalColumnValuesTwoDF(dataFrame1,dataFrame2,key_id_col_name,key_id_value)
res_df.show()
>>> dataFrame1.show()
+---+------+------+
| ID|VALUE1|VALUE2|
+---+------+------+
| A1| 500| 1005|
| A2| 700| 10007|
+---+------+------+
>>> dataFrame2.show()
+---+------+------+
| ID|VALUE1|VALUE2|
+---+------+------+
| A1| 600| 1005|
| A2| 700| 10007|
+---+------+------+
>>> res_df.show()
+------+-----+--------+
|KEY_ID|VALUE|COL_NAME|
+------+-----+--------+
| A1| 500| VALUE1|
+------+-----+--------+
Please suggest any other way.
Here is another approach:
Join the two DataFrames using the ID column.
Then for each row, create a new column which contains the columns for which there is a difference.
Create this new column as a key-value pair map using pyspark.sql.functions.create_map().1
The key for the map will be the column name.
Using pyspark.sql.functions.when(), set the value to the corresponding value in dataFrame1 (as it seems like that is what you want from your example) if there is a difference between the two DataFrames. Otherwise, we set the value to None.
Use pyspark.sql.functions.explode() on the map column, and use pyspark.sql.functions.isnull() to filter out any rows where the value is null (i.e., where there is no difference).
Select the columns you want and rename using alias().
Example:
import pyspark.sql.functions as f
from functools import reduce  # needed on Python 3, where reduce is no longer a builtin
columns = [c for c in dataFrame1.columns if c != 'ID']
dataFrame1.alias('r').join(dataFrame2.alias('l'), on='ID')\
.withColumn(
'diffs',
f.create_map(
*reduce(
list.__add__,
[
[
f.lit(c),
f.when(
f.col('r.'+c) != f.col('l.'+c),
f.col('r.'+c)
).otherwise(None)
]
for c in columns
]
)
)
)\
.select([f.col('ID'), f.explode('diffs')])\
.where(~f.isnull(f.col('value')))\
.select(
f.col('ID').alias('KEY_ID'),
f.col('value').alias('VALUE'),
f.col('key').alias('COL_NAME')
)\
.show(truncate=False)
#+------+-----+--------+
#|KEY_ID|VALUE|COL_NAME|
#+------+-----+--------+
#|A1 |500 |VALUE1 |
#+------+-----+--------+
Notes
1 The syntax *reduce(list.__add__, [[f.lit(c), ...] for c in columns]) as the argument to create_map() is some python-fu that helps create the map dynamically.
create_map() expects an even number of arguments: it assumes that the first argument in every pair is the key and the second is the value. To put the arguments in that order, the list comprehension yields a [key, value] list for each iteration, and we reduce this list of lists into a flat list using list.__add__.
Finally the * operator is used to unpack the list.
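As a quick standalone illustration of that flattening/unpacking trick (plain Python, with made-up values just for demonstration):
from functools import reduce

columns = ['VALUE1', 'VALUE2']             # hypothetical column names
pairs = [[c, c.lower()] for c in columns]  # [['VALUE1', 'value1'], ['VALUE2', 'value2']]
flat = reduce(list.__add__, pairs)         # ['VALUE1', 'value1', 'VALUE2', 'value2']
print(flat)
# create_map(*flat) would then receive these elements as alternating key/value arguments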
Here is the intermediate output, which may make the logic clearer:
dataFrame1.alias('r').join(dataFrame2.alias('l'), on='ID')\
.withColumn(
'diffs',
f.create_map(
*reduce(
list.__add__,
[
[
f.lit(c),
f.when(
f.col('r.'+c) != f.col('l.'+c),
f.col('r.'+c)
).otherwise(None)
]
for c in columns
]
)
)
)\
.select('ID', 'diffs').show(truncate=False)
#+---+-----------------------------------+
#|ID |diffs |
#+---+-----------------------------------+
#|A2 |Map(VALUE1 -> null, VALUE2 -> null)|
#|A1 |Map(VALUE1 -> 500, VALUE2 -> null) |
#+---+-----------------------------------+