How to make a join between DataFrames in PySpark - python-3.x

I have two DataFrames, DF1 and DF2. The content of each DataFrame is as follows:
df1:

line_item_usage_account_id  line_item_unblended_cost  name
100000000001                12.05                     account1
200000000001                52                        account2
300000000003                12.03                     account3

df2:

accountname  accountproviderid  clustername  app_pmo  app_costcenter
account1     100000000001       cluster1     111111   11111111
account2     200000000001       cluster2     222222   22222222
I need to join on the fields df1.line_item_usage_account_id and df2.accountproviderid.
When both fields hold the same ID, the line_item_unblended_cost column from DF1 must be included in the result.
And when a line_item_usage_account_id value from DF1 is not present in the accountproviderid column of DF2, the DF1 row must still be appended, as follows:
accountname  accountproviderid  clustername  app_pmo  app_costcenter  line_item_unblended_cost
account1     100000000001       cluster1     111111   11111111        12.05
account2     200000000001       cluster2     222222   22222222        52
account3     300000000003       NA           NA       NA              12.03
The account3 row was appended at the end of the new DataFrame, with the DF2 columns filled with "NA".
Any help is appreciated; thanks in advance.

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df1 = spark.createDataFrame([
[100000000001, 12.05, 'account1'],
[200000000001, 52.00, 'account2'],
[300000000003, 12.03, 'account3']],
schema=['line_item_usage_account_id', 'line_item_unblended_cost', 'name' ])
df1.show()
df1.printSchema()
df2 = spark.createDataFrame([
['account1', 100000000001, 'cluster1', 111111, 11111111],
['account2', 200000000001, 'cluster2', 222222, 22222222]],
schema=['accountname', 'accountproviderid', 'clustername', 'app_pmo', 'app_costcenter'])
df2.printSchema()
df2.show()
cols = ['name', 'line_item_usage_account_id', 'clustername', 'app_pmo',
        'app_costcenter', 'line_item_unblended_cost']

# A left outer join keeps every df1 row; rows with no match in df2 get nulls for the df2 columns
resDF = (df1
    .join(df2, df1.line_item_usage_account_id == df2.accountproviderid, "leftouter")
    .select(*cols)
    .withColumnRenamed('name', 'accountname')
    .withColumnRenamed('line_item_usage_account_id', 'accountproviderid')
    .orderBy('accountname'))
resDF.printSchema()
# |-- accountname: string (nullable = true)
# |-- accountproviderid: long (nullable = true)
# |-- clustername: string (nullable = true)
# |-- app_pmo: long (nullable = true)
# |-- app_costcenter: long (nullable = true)
# |-- line_item_unblended_cost: double (nullable = true)
resDF.show()
# +-----------+-----------------+-----------+-------+--------------+------------------------+
# |accountname|accountproviderid|clustername|app_pmo|app_costcenter|line_item_unblended_cost|
# +-----------+-----------------+-----------+-------+--------------+------------------------+
# | account1| 100000000001| cluster1| 111111| 11111111| 12.05|
# | account2| 200000000001| cluster2| 222222| 22222222| 52.0|
# | account3| 300000000003| null| null| null| 12.03|
# +-----------+-----------------+-----------+-------+--------------+------------------------+
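If you want the literal "NA" values shown in the question instead of nulls, one option is to cast the numeric df2-side columns to string and fill the gaps; a minimal sketch, reusing the resDF built above:

from pyspark.sql import functions as F

# clustername is already a string; app_pmo and app_costcenter are longs,
# so cast them to string before filling with the literal "NA"
filledDF = (resDF
    .withColumn('app_pmo', F.col('app_pmo').cast('string'))
    .withColumn('app_costcenter', F.col('app_costcenter').cast('string'))
    .fillna('NA', subset=['clustername', 'app_pmo', 'app_costcenter']))
filledDF.show()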

Related

spark schema difference in partitions

I have to read data from a path which is partitioned by region.
The US region has columns a, b, c, d, e.
The EUR region has only a, b, c, d.
When I read data from the path and do a printSchema, I see only a, b, c, d; 'e' is missing.
Is there any way to handle this situation, like column 'e' automatically getting populated with null for the EUR data?
You can use the mergeSchema option, which should do exactly what you are looking for, as long as columns with the same name have the same type.
Example:
spark.read.option("mergeSchema", "true").format("parquet").load(...)
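For illustration, a minimal end-to-end sketch (the /tmp/regions path and the toy rows are hypothetical) showing mergeSchema filling the missing column with null:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# US partition has column e, EUR partition does not
spark.createDataFrame([(1, 2, 3, 4, 5)], ['a', 'b', 'c', 'd', 'e']) \
    .write.mode('overwrite').parquet('/tmp/regions/region=US')
spark.createDataFrame([(6, 7, 8, 9)], ['a', 'b', 'c', 'd']) \
    .write.mode('overwrite').parquet('/tmp/regions/region=EUR')

df = spark.read.option("mergeSchema", "true").parquet('/tmp/regions')
df.printSchema()  # a, b, c, d, e, region -- e is null for the EUR rows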
Once you read the data from the path, you can check whether the data frame contains column 'e'. If it does not, you can add it with a default value, which is None in this case.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder \
    .appName('example') \
    .getOrCreate()

df = spark.createDataFrame(data=data, schema=columns)  # data/columns: however you build or load your DataFrame

if 'e' not in df.columns:
    df = df.withColumn('e', lit(None))
You can collect all the possible columns from both datasets, then fill None for any column that is not available in a given dataset:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df_ab = (spark
.sparkContext
.parallelize([
('a1', 'b1'),
('a2', 'b2'),
])
.toDF(['a', 'b'])
)
df_ab.show()
# +---+---+
# | a| b|
# +---+---+
# | a1| b1|
# | a2| b2|
# +---+---+
df_abcd = (spark
.sparkContext
.parallelize([
('a3', 'b3', 'c3', 'd3'),
('a4', 'b4', 'c4', 'd4'),
])
.toDF(['a', 'b', 'c', 'd'])
)
df_abcd.show()
# +---+---+---+---+
# | a| b| c| d|
# +---+---+---+---+
# | a3| b3| c3| d3|
# | a4| b4| c4| d4|
# +---+---+---+---+
unique_columns = list(set(df_ab.columns + df_abcd.columns))
# ['d', 'b', 'a', 'c']
for col in unique_columns:
    if col not in df_ab.columns:
        df_ab = df_ab.withColumn(col, F.lit(None))
    if col not in df_abcd.columns:
        df_abcd = df_abcd.withColumn(col, F.lit(None))
df_ab.printSchema()
# root
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- d: null (nullable = true)
# |-- c: null (nullable = true)
df_ab.show()
# +---+---+----+----+
# | a| b| d| c|
# +---+---+----+----+
# | a1| b1|null|null|
# | a2| b2|null|null|
# +---+---+----+----+
df_abcd.printSchema()
# root
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- c: string (nullable = true)
# |-- d: string (nullable = true)
df_abcd.show()
# +---+---+---+---+
# | a| b| c| d|
# +---+---+---+---+
# | a3| b3| c3| d3|
# | a4| b4| c4| d4|
# +---+---+---+---+
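Once both DataFrames share the same set of columns, they can be stacked; a small sketch continuing from the df_ab and df_abcd above:

# unionByName matches columns by name, so the differing column order does not matter;
# Spark resolves the NullType columns added with F.lit(None) against the string columns
df_all = df_ab.unionByName(df_abcd)
df_all.show()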
I used pyspark and Spark SQL. Hope this implementation helps you get the idea. Spark provides an environment to use SQL, and Spark SQL is very convenient for this kind of thing.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType


class getData(object):
    """docstring for getData"""

    def __init__(self):
        self.sparkSession1 = SparkSession.builder.appName('YourProjectName').getOrCreate()

    def get_data(self, n):
        spark = self.sparkSession1

        data2 = [("region 1", "region 2", "region 3", "region 4"),
                 ("region 5", "region 6", "region 7", "region 8")]
        schema = StructType([
            StructField("a", StringType(), True),
            StructField("b", StringType(), True),
            StructField("c", StringType(), True),
            StructField("d", StringType(), True)
        ])

        data3 = [("EU region 1", "EU region 2", "EU region 3"),
                 ("EU region 5", "EU region 6", "EU region 7")]
        schema3 = StructType([
            StructField("a", StringType(), True),
            StructField("b", StringType(), True),
            StructField("c", StringType(), True)
        ])

        df = spark.createDataFrame(data=data2, schema=schema)
        df.createOrReplaceTempView("USRegion")
        sqlDF = spark.sql("SELECT * FROM USRegion")
        sqlDF.show(n=n)

        df1 = spark.createDataFrame(data=data3, schema=schema3)
        df1.createOrReplaceTempView("EURegion")
        sqlDF1 = spark.sql("SELECT * FROM EURegion")
        sqlDF1.show(n=n)

        # EURegion has no column d, so project an empty string as d before the union
        sql_union_df = spark.sql(
            "SELECT a, b, c, d FROM USRegion "
            "UNION ALL "
            "SELECT a, b, c, '' AS d FROM EURegion")
        sql_union_df.show(n=n)
        return sql_union_df


# call the class
conn = getData()
# call the method implemented inside the class
conn.get_data(10)
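As an aside, if you are on Spark 3.1 or later, DataFrame.unionByName with allowMissingColumns=True covers this case directly; a minimal sketch with toy data:

# Spark 3.1+: columns missing from one side are filled with null automatically
df_us = spark.createDataFrame([(1, 2, 3, 4, 5)], ['a', 'b', 'c', 'd', 'e'])
df_eur = spark.createDataFrame([(6, 7, 8, 9)], ['a', 'b', 'c', 'd'])
df_all = df_us.unionByName(df_eur, allowMissingColumns=True)
df_all.show()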

Pyspark 'from_json': data frame returns null for all the json values

I have the log lines below, which contain text and a JSON string:
2020-09-24T08:03:01.633Z 11.21.23.1 {"EventTime":"2020-09-24 13:33:01","Hostname":"abc-cde.india.local","Keywords":-1234}
I created a DF for the above logs, as seen below:
+----------+----------+------------------+
|      Date| Source IP|        Event Type|
+----------+----------+------------------+
|2020-09-24|11.21.23.1| {"EventTime":"202|
+----------+----------+------------------+
I created a schema for converting the JSON string to another data frame:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as F

json_schema = StructType([
    StructField("EventTime", StringType()),
    StructField("Hostname", StringType()),
    StructField("Keywords", IntegerType())
])
json_converted_df = df.select(F.from_json(F.col('Event Type'), json_schema).alias("data")).select("data.*")
json_converted_df.show()
but the data frame returns null for all of the JSON values:
+---------+--------+--------+
|EventTime|Hostname|Keywords|
+---------+--------+--------+
|     null|    null|    null|
+---------+--------+--------+
How to resolve this issue?
Works fine for me ...
# Preparation of test dataset
a = [
(
"2020-09-24T08:03:01.633Z",
"11.21.23.1",
'{"EventTime":"2020-09-24 13:33:01","Hostname":"abc-cde.india.local","Keywords":-1234}',
),
]
b = ["Date", "Source IP", "Event Type"]
df = spark.createDataFrame(a, b)
df.show()
#+--------------------+----------+--------------------+
#| Date| Source IP| Event Type|
#+--------------------+----------+--------------------+
#|2020-09-24T08:03:...|11.21.23.1|{"EventTime":"202...|
#+--------------------+----------+--------------------+
df.printSchema()
#root
# |-- Date: string (nullable = true)
# |-- Source IP: string (nullable = true)
# |-- Event Type: string (nullable = true)
# Your code executed
from pyspark.sql.types import *
import pyspark.sql.functions as F
json_schema = StructType(
[
StructField("EventTime", StringType()),
StructField("Hostname", StringType()),
StructField("Keywords", IntegerType()),
]
)
json_converted_df = df.select(
F.from_json(F.col("Event Type"), json_schema).alias("data")
).select("data.*")
json_converted_df.show()
#+-------------------+-------------------+--------+
#| EventTime| Hostname|Keywords|
#+-------------------+-------------------+--------+
#|2020-09-24 13:33:01|abc-cde.india.local| -1234|
#+-------------------+-------------------+--------+
json_converted_df.printSchema()
#root
# |-- EventTime: string (nullable = true)
# |-- Hostname: string (nullable = true)
# |-- Keywords: integer (nullable = true)
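If it still comes back null on your side, one thing worth checking is whether the 'Event Type' column still carries the leading timestamp and IP; from_json returns null whenever the string is not valid JSON. A hypothetical sketch that extracts just the JSON portion of the raw log line first (reusing the json_schema defined above):

import pyspark.sql.functions as F

raw = spark.createDataFrame(
    [('2020-09-24T08:03:01.633Z 11.21.23.1 {"EventTime":"2020-09-24 13:33:01","Hostname":"abc-cde.india.local","Keywords":-1234}',)],
    ["value"],
)

parsed = (raw
    .select(F.regexp_extract("value", r"(\{.*\})", 1).alias("json_str"))
    .select(F.from_json("json_str", json_schema).alias("data"))
    .select("data.*"))
parsed.show(truncate=False)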

Aggregate one column, but show all columns in select

I am trying to show the maximum value of one column while grouping rows by a date column, so I tried this code:
from pyspark.sql.functions import max

maxVal = dfSelect.select('*')\
    .groupBy('DATE')\
    .agg(max('CLOSE'))
But the output looks like this:
+----------+----------+
| DATE|max(CLOSE)|
+----------+----------+
|1987-05-08| 43.51|
|1987-05-29| 39.061|
+----------+----------+
I want output like the below:
+------+---+----------+------+------+------+------+------+---+----------+
|TICKER|PER| DATE| TIME| OPEN| HIGH| LOW| CLOSE|VOL|max(CLOSE)|
+------+---+----------+------+------+------+------+------+---+----------+
| CDG| D|1987-01-02|000000|50.666|51.441|49.896|50.666| 0| 50.666|
| ABC| D|1987-01-05|000000|51.441| 52.02|51.441|51.441| 0| 51.441|
+------+---+----------+------+------+------+------+------+---+----------+
So my question is: how do I change the code so the output has all the columns plus the aggregated 'CLOSE' column?
The schema of my data looks like this:
root
|-- TICKER: string (nullable = true)
|-- PER: string (nullable = true)
|-- DATE: date (nullable = true)
|-- TIME: string (nullable = true)
|-- OPEN: float (nullable = true)
|-- HIGH: float (nullable = true)
|-- LOW: float (nullable = true)
|-- CLOSE: float (nullable = true)
|-- VOL: integer (nullable = true)
|-- OPENINT: string (nullable = true)
If you want the same aggregation for all of the columns in the original dataframe, then you can do something like this:
import pyspark.sql.functions as F
expr = [F.max(coln).alias(coln) for coln in df.columns if 'date' not in coln]  # df is your dataframe
df_res = df.groupby('date').agg(*expr)
If you want different aggregations for different subsets of columns, then you can do something like this:
sub_col1 = []  # define your first subset of columns
sub_col2 = []  # define your second subset of columns
expr1 = [F.max(coln).alias(coln) for coln in sub_col1 if 'date' not in coln]
expr2 = [F.first(coln).alias(coln) for coln in sub_col2 if 'date' not in coln]
expr = expr1 + expr2
df_res = df.groupby('date').agg(*expr)
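For example, with the schema from the question, taking the max of CLOSE and HIGH and the first seen value of the remaining columns per DATE could look like this (the column subsets here are just an illustrative guess):

import pyspark.sql.functions as F

sub_col1 = ['CLOSE', 'HIGH']                                # columns to aggregate with max
sub_col2 = ['TICKER', 'PER', 'TIME', 'OPEN', 'LOW', 'VOL']  # columns to carry along with first

expr1 = [F.max(coln).alias(coln) for coln in sub_col1]
expr2 = [F.first(coln).alias(coln) for coln in sub_col2]
df_res = df.groupby('DATE').agg(*(expr1 + expr2))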
If you want only one of the columns aggregated and added back to your original dataframe, then you can do a self join after aggregating:
df_agg = df.groupby('date').agg(F.max('close').alias('close_agg')).withColumn("dummy", F.lit("dummy"))  # dummy column is a workaround for Spark self-join issues
df_join = df.join(df_agg, on='date', how='left')
Or you can use a window function:
from pyspark.sql import Window
w= Window.partitionBy('date')
df_res = df.withColumn("max_close",F.max('close').over(w))

How to perform calculation in spark dataframe that select from its own dataframe using pyspark

I have a pyspark schema which looks like this:
root
|-- id: string (nullable = true)
|-- long: float (nullable = true)
|-- lat: float (nullable = true)
|-- geohash: string (nullable = true)
|-- neighbors: array (nullable = true)
| |-- element: string (containsNull = true)
The data looks like this:
+---+---------+----------+---------+--------------------+
| id| lat| long|geohash_8| neighbors|
+---+---------+----------+---------+--------------------+
| 0|-6.361755| 106.79653| qqggy1yu|[qqggy1ys, qqggy1...|
| 1|-6.358584|106.793945| qqggy4ky|[qqggy4kw, qqggy4...|
| 2|-6.362967|106.798775| qqggy38m|[qqggy38j, qqggy3...|
| 3|-6.358316| 106.79832| qqggy680|[qqggy4xb, qqggy6...|
| 4| -6.36016| 106.7981| qqggy60j|[qqggy4pv, qqggy6...|
| 5|-6.357476| 106.79842| qqggy68j|[qqggy4xv, qqggy6...|
| 6|-6.360814| 106.79435| qqggy4j3|[qqggy4j1, qqggy4...|
| 7|-6.358231|106.794365| qqggy4t2|[qqggy4t0, qqggy4...|
| 8|-6.357654| 106.79736| qqggy4x7|[qqggy4x5, qqggy4...|
| 9|-6.358781|106.794624| qqggy4mm|[qqggy4mj, qqggy4...|
| 10|-6.357654| 106.79443| qqggy4t7|[qqggy4t5, qqggy4...|
| 11|-6.357079| 106.79443| qqggy4tr|[qqggy4tp, qqggy4...|
| 12|-6.359929| 106.79698| qqggy4pn|[qqggy4ny, qqggy4...|
| 13|-6.358111| 106.79633| qqggy4w9|[qqggy4w3, qqggy4...|
| 14|-6.359685| 106.79607| qqggy4q8|[qqggy4q2, qqggy4...|
| 15|-6.357945|106.794945| qqggy4td|[qqggy4t6, qqggy4...|
| 16|-6.360725|106.795456| qqggy4n4|[qqggy4jf, qqggy4...|
| 17|-6.363701| 106.79653| qqggy1wb|[qqggy1w8, qqggy1...|
| 18| -6.36329|106.794586| qqggy1t7|[qqggy1t5, qqggy1...|
| 19|-6.363304| 106.79429| qqggy1t5|[qqggy1sg, qqggy1...|
+---+---------+----------+---------+--------------------+
I want to calculate, for each id, the distance between its lat/long and the lat/long of each of its neighbors, so that every id ends up with a list of distances in meters to all of its neighbors.
I tried an iterative approach that loops over every row, selects a dataframe, and then computes the haversine distance; however, the performance is awful. I am stuck on how to do this in a functional way in Spark. Can anyone help with suggestions or references?
Updated to address the desire for combinations
If you want to do all the combinations, the steps are basically: associate each neighbor ID with its lat/long, group them together into a single row for each combination set, then compute the distance for all the combinations. Here is example code:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import itertools
schema = StructType([
StructField("id", StringType()),
StructField("lat", FloatType()),
StructField("long", FloatType()),
StructField("geohash_8", StringType()),
StructField("neighbors", ArrayType(StringType()))
])
data = [
("0", 10.0, 11.0, "A", ["B", "C", "D"]),
("1", 12.0, 13.0, "B", ["D"]),
("2", 14.0, 15.0, "C", []),
("3", 16.0, 17.0, "D", [])
]
input_df = spark.createDataFrame(sc.parallelize(data), schema)
# Explode to get a row for each comparison pair
df = input_df.withColumn('neighbor', explode('neighbors')).drop('neighbors')
# Join to get the lat/lon of the neighbor
neighbor_map = input_df.selectExpr('geohash_8 as nid', 'lat as nlat', 'long as nlong')
df = df.join(neighbor_map , col('neighbor') == col('nid'), 'inner').drop('nid')
# Add in rows for the root (geohash_8) records before grouping
root_rows = input_df.selectExpr("id", "lat", "long", "geohash_8", "geohash_8 as neighbor", "lat as nlat", "long as nlong")
df = df.unionAll(root_rows)
# Group by to roll the rows back up but now associating the lat/lon w/ the neighbors
df = df.selectExpr("id", "lat", "long", "geohash_8", "struct(neighbor, nlat, nlong) as neighbors")
df = df.groupBy("id", "lat", "long", "geohash_8").agg(collect_set("neighbors").alias("neighbors"))
# You now have all the data you need in one field, so you can write a python udf to do the combinations
def compute_distance(left_lat, left_lon, right_lat, right_lon):
    # placeholder distance; see the haversine sketch after the output below
    return 10.0

def combinations(neighbors):
    result = []
    for left, right in itertools.combinations(neighbors, 2):
        dist = compute_distance(left['nlat'], left['nlong'], right['nlat'], right['nlong'])
        result.append(Row(left=left['neighbor'], right=right['neighbor'], dist=dist))
    return result

udf_schema = ArrayType(StructType([
    StructField("left", StringType()),
    StructField("right", StringType()),
    StructField("dist", FloatType())
]))
combinations_udf = udf(combinations, udf_schema)
# Finally, apply the UDF
df = df.withColumn('neighbors', combinations_udf(col('neighbors')))
df.printSchema()
df.show()
Which produces this:
root
|-- id: string (nullable = true)
|-- lat: float (nullable = true)
|-- long: float (nullable = true)
|-- geohash_8: string (nullable = true)
|-- neighbors: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- neighbor: string (nullable = true)
| | |-- nlat: float (nullable = true)
| | |-- nlong: float (nullable = true)
+---+----+----+---------+------------------------------------------------------------------------------------+
|id |lat |long|geohash_8|neighbors |
+---+----+----+---------+------------------------------------------------------------------------------------+
|0 |10.0|11.0|A |[[D, C, 10.0], [D, A, 10.0], [D, B, 10.0], [C, A, 10.0], [C, B, 10.0], [A, B, 10.0]]|
|2 |14.0|15.0|C |[] |
|1 |12.0|13.0|B |[[D, B, 10.0]] |
|3 |16.0|17.0|D |[] |
+---+----+----+---------+------------------------------------------------------------------------------------+
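The compute_distance stub above just returns 10.0; a minimal haversine implementation (in meters, assuming nlat/nlong are decimal degrees) that could be dropped in instead:

import math

def compute_distance(left_lat, left_lon, right_lat, right_lon):
    # great-circle (haversine) distance in meters
    earth_radius_m = 6371000.0
    phi1, phi2 = math.radians(left_lat), math.radians(right_lat)
    dphi = math.radians(right_lat - left_lat)
    dlam = math.radians(right_lon - left_lon)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
    return 2 * earth_radius_m * math.asin(math.sqrt(a))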

Select columns that satisfy a condition

I'm running the following notebook in Zeppelin:
%spark.pyspark
l = [('user1', 33, 1.0, 'chess'), ('user2', 34, 2.0, 'tenis'), ('user3', None, None, ''), ('user4', None, 4.0, ' '), ('user5', None, 5.0, 'ski')]
df = spark.createDataFrame(l, ['name', 'age', 'ratio', 'hobby'])
df.show()
root
|-- name: string (nullable = true)
|-- age: long (nullable = true)
|-- ratio: double (nullable = true)
|-- hobby: string (nullable = true)
+-----+----+-----+-----+
| name| age|ratio|hobby|
+-----+----+-----+-----+
|user1| 33| 1.0|chess|
|user2| 34| 2.0|tenis|
|user3|null| null| |
|user4|null| 4.0| |
|user5|null| 5.0| ski|
+-----+----+-----+-----+
from pyspark.sql.functions import count

agg_df = df.select(*[(1.0 - (count(c) / count('*'))).alias(c) for c in df.columns])
agg_df.show()
+----+---+-------------------+-----+
|name|age| ratio|hobby|
+----+---+-------------------+-----+
| 0.0|0.6|0.19999999999999996| 0.0|
+----+---+-------------------+-----+
Now, I want to select from agg_df only the columns whose value is < 0.35. In this case it should return ['name', 'ratio', 'hobby'].
I can't figure out how to do it. Any hint?
You mean values < 0.35? This should do it:
>>> [ key for (key,value) in agg_df.collect()[0].asDict().items() if value < 0.35 ]
['hobby', 'ratio', 'name']
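To then keep only those columns in a DataFrame, you can feed that list into a select; a small sketch building on the snippet above:

cols_below_threshold = [
    key for (key, value) in agg_df.collect()[0].asDict().items() if value < 0.35
]
df.select(*cols_below_threshold).show()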
To replace blank strings with null, use the following udf:
from pyspark.sql.functions import udf
process = udf(lambda x: None if not x else (x if x.strip() else None))
df.withColumn('hobby', process(df.hobby)).show()
+-----+----+-----+-----+
| name| age|ratio|hobby|
+-----+----+-----+-----+
|user1| 33| 1.0|chess|
|user2| 34| 2.0|tenis|
|user3|null| null| null|
|user4|null| 4.0| null|
|user5|null| 5.0| ski|
+-----+----+-----+-----+
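A udf-free alternative (not what the answer above uses, just a sketch) is to do the same blank-to-null replacement with when and trim:

import pyspark.sql.functions as F

df = df.withColumn(
    'hobby',
    F.when(F.trim(F.col('hobby')) == '', F.lit(None)).otherwise(F.col('hobby'))
)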
Here is my attempt at the function I was looking for, based on rogue-one's indications. Not sure if it is the fastest or most optimized:
from pyspark.sql.functions import udf, count
from functools import reduce

def filter_columns(df, threshold=0.35):
    process = udf(lambda x: None if not x else (x if x.strip() else None))  # udf for stripping string values
    string_cols = [c for c in df.columns if df.select(c).dtypes[0][1] == 'string']  # string columns
    new_df = reduce(lambda df, x: df.withColumn(x, process(x)), string_cols, df)  # process all string columns
    agg_df = new_df.select(*[(1.0 - (count(c) / count('*'))).alias(c) for c in new_df.columns])  # fraction of nulls per column
    cols_match_threshold = [key for (key, value) in agg_df.collect()[0].asDict().items() if value < threshold]  # only cols whose value < threshold
    return new_df.select(cols_match_threshold)

filter_columns(df, 0.35).show()
filter_columns(df, 0.35).show()
+-----+-----+
|ratio| name|
+-----+-----+
| 1.0|user1|
| 2.0|user2|
| null|user3|
| 4.0|user4|
| 5.0|user5|
+-----+-----+
