PySpark - grouping_id throws exception

from pyspark.sql import SparkSession
from pyspark.sql.functions import grouping_id
spark = SparkSession.builder.appName("grouping_id_example").getOrCreate()
data = [("John", "M", 25, "London", 5000),
("Jane", "F", 30, "Paris", 6000),
("Jake", "M", 45, "London", 9000),
("Julie", "F", 50, "Berlin", 10000)]
df = spark.createDataFrame(data, ["Name", "Gender", "Age", "City", "Salary"])
grouped_df = df.groupBy("City", "Gender")
grouped_df = grouped_df.agg({"Salary": "sum"})
grouped_df = grouped_df.withColumn("grouping_id", grouping_id(["City", "Gender"]))
This throws:
TypeError: Invalid argument, not a string or column: ['City', 'Gender'] of type <class 'list'>.
For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
Once I change the list to a string, I get
pyspark.sql.utils.AnalysisException: grouping_id() can only be used with GroupingSets/Cube/Rollup;;
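The second error is the real hint: grouping_id() is only defined when the aggregation uses grouping sets, a cube, or a rollup, and it takes column names as separate arguments (or none at all), never a Python list. A minimal sketch of the usual fix, assuming the goal is subtotals over City and Gender (the alias names are just illustrative):
from pyspark.sql import functions as F

# cube() produces all grouping-set combinations of City and Gender,
# which is what grouping_id() needs to be defined
cubed = (df.cube("City", "Gender")
           .agg(F.sum("Salary").alias("sum_salary"),
                F.grouping_id().alias("grouping_id")))
cubed.show()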

Related

plotly: descending order in grouped bar chart

I'm trying to create a nice plot that is sorted by LABEL and then by Value inside each LABEL. If possible, I'd also like to remove the labels at the bottom of the chart, because the legend already explains them.
libraries:
from plotly import graph_objs as go
import plotly.express as px
import pandas as pd
My data looks like this:
df = pd.DataFrame({'LABEL': ['1', '1', '2', '2', '3', '3', '3', '3'],
                   'Cat2': ['a', 'b', 'a', 'b', 'c', 'a', 'e', 'f'],
                   'Value': [3, 2, 1, 4, 1, 3, 4, 1]})
df.sort_values(by=['LABEL', 'Value'], ascending=[True, False], inplace=True)
Here is my try:
COLOR_MAP = {str(i): c for i, c in enumerate(px.colors.qualitative.Light24)}
fig = go.Figure()
for i in df['LABEL'].unique():
    df_ = df[df['LABEL'] == i]
    fig.add_trace(go.Bar(
        x=[df_['LABEL'], df_['Cat2']],
        y=df_['Value'],
        marker=dict(color=COLOR_MAP[i]),
        name=f'{i}'))
fig.update_layout(legend_title='Cat1')
fig.update_layout(xaxis=dict(tickangle=45))
fig.update_layout(xaxis={'categoryorder': 'trace'})  # I tried: 'total descending', 'category descending', 'array'
Result: (screenshot of the current chart omitted)
My expectation: (screenshot of the desired chart omitted)
Thanks in advance!!
It's much simpler in Plotly Express: define a new column in the dataframe that defines x.
from plotly import graph_objs as go
import plotly.express as px
import pandas as pd
df = pd.DataFrame(
    {
        "LABEL": ["1", "1", "2", "2", "3", "3", "3", "3"],
        "Cat2": ["a", "b", "a", "b", "c", "a", "e", "f"],
        "Value": [3, 2, 1, 4, 1, 3, 4, 1],
    }
)
df.sort_values(by=["LABEL", "Value"], ascending=[True, False], inplace=True)
# define a concatenated column for x
df = df.assign(labx=df["LABEL"] + df["Cat2"])
px.bar(
    df,
    x="labx",
    y="Value",
    hover_data=["Cat2"],
    color="LABEL",
    color_discrete_sequence=px.colors.qualitative.Light24,
).update_layout(
    xaxis={"tickmode": "array", "tickvals": df["labx"], "ticktext": df["Cat2"]}
)
Without Plotly Express:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=df["labx"],
        y=df["Value"],
        marker_color=df["LABEL"]
        .map(
            {v: c for v, c in zip(df["LABEL"].unique(), px.colors.qualitative.Light24)}
        )
        .values,
    )
).update_layout(
    xaxis={"tickmode": "array", "tickvals": df["labx"], "ticktext": df["Cat2"]}
)

Aggregation into mean, min and max of a List of Dictionaries using Python 3.7 and not using pandas

I have the following data
data = [{'item': 'A', 'price': '$52'},
{'item': 'B', 'price': '$21'},
{'item': 'C', 'price': '$35'},
{'item': 'A', 'price': '$52'},
{'item': 'A', 'price': '$52'},
{'item': 'B', 'price': '$21'}]
How do I aggregate items A, B and C without using pandas, using only Python 3+? I'd appreciate any links to help me out here.
Expected output is
A $156
B $42
C $35
I have tried the following code:
for key, group in itertools.groupby(sorted_data, lambda item: item['item']):
    print(key, sum([item["item"] for item in group]))
I get the following error:
TypeError: unsupported operand type(s) for +: 'int' and 'str'
Are there any simpler solutions to this:
from collections import defaultdict
data = [
{"item": "A", "price": "$52"},
{"item": "B", "price": "$21"},
{"item": "C", "price": "$35"},
{"item": "A", "price": "$52"},
{"item": "A", "price": "$52"},
{"item": "B", "price": "$21"},
]
result = defaultdict(list)
for d in data:
    result[d["item"]].append(d["price"])
result = {key: f"${sum(int(v[1:]) for v in value)}" for key, value in result.items()}
print(result)
Output:
{'A': '$156', 'B': '$42', 'C': '$35'}
You are trying to sum item instead of price.
price is a string starting with '$', so you can't expect sum to magically handle it (i.e., remove the '$', sum the rest as numbers, then prepend '$' when it's done).
The easiest solution is to manually strip off the first character of price, use sum, then prepend the '$' again when printing.
import itertools

def key_func(d):
    return d['item']

for key, group in itertools.groupby(sorted(data, key=key_func), key=key_func):
    print(key, '$' + str(sum(int(item['price'].lstrip('$')) for item in group)))
outputs
A $156
B $42
C $35
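The question title also asks for mean, min and max; here is a minimal sketch in the same spirit, stripping the '$', grouping, and then using the standard statistics module (the helper names are just illustrative):
import itertools
import statistics

def key_func(d):
    return d['item']

# group the prices (as plain ints) per item
prices_by_item = {
    key: [int(d['price'].lstrip('$')) for d in group]
    for key, group in itertools.groupby(sorted(data, key=key_func), key=key_func)
}
for key, prices in prices_by_item.items():
    print(key, f"mean=${statistics.mean(prices)}", f"min=${min(prices)}", f"max=${max(prices)}")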

How to convert some PySpark dataframe columns into a dict with their column names and combine them into a JSON column?

I have data in the following format, and I want to reshape it with PySpark into two columns ('tag' and 'data').
The 'tag' column values are unique, and the 'data' column values are a JSON string built from the original columns 'date', 'stock', 'price' and 'num':
'stock' and 'price' are combined into the 'A' value, and 'date' and 'num' are combined into the 'B' value.
I haven't found or written a good function to achieve this.
My Spark version is 2.1.0.
original DataFrame
date, stock, price, tag, num
1388534400, GOOG, 50, a, 1
1388534400, FB, 60, b, 2
1388534400, MSFT, 55, c, 3
1388620800, GOOG, 52, d, 4
I expect the output:
new DataFrame
tag| data
'a'| "{'A':{'stock':'GOOG', 'price': 50}, 'B':{'date':1388534400, 'num':1}}"
'b'| "{'A':{'stock':'FB', 'price': 60}, 'B':{'date':1388534400, 'num':2}}"
'c'| "{'A':{'stock':'MSFT', 'price': 55}, 'B':{'date':1388534400, 'num':3}}"
'd'| "{'A':{'stock':'GOOG', 'price': 52}, 'B':{'date':1388620800, 'num':4}}"
from pyspark.sql import SparkSession
from pyspark.sql.functions import create_map
spark = SparkSession.builder.appName("example").getOrCreate()
df = spark.createDataFrame([
(1388534400, "GOOG", 50, 'a', 1),
(1388534400, "FB", 60, 'b', 2),
(1388534400, "MSFT", 55, 'c', 3),
(1388620800, "GOOG", 52, 'd', 4)]
).toDF("date", "stock", "price", 'tag', 'num')
df.show()
tag_cols = {'A':['stock', 'price'], 'B':['date', 'num']}
# todo, change the Dataframe columns format
IIUC, just use pyspark.sql.functions.struct and pyspark.sql.functions.to_json (both should be available in spark 2.1)
from pyspark.sql import functions as F
# skip df initialization
df_new = df.withColumn('A', F.struct('stock', 'price')) \
.withColumn('B', F.struct('date', 'num')) \
.select('tag', F.to_json(F.struct('A', 'B')).alias('data'))
>>> df_new.show(5,0)
+---+-----------------------------------------------------------------+
|tag|data |
+---+-----------------------------------------------------------------+
|a |{"A":{"stock":"GOOG","price":50},"B":{"date":1388534400,"num":1}}|
|b |{"A":{"stock":"FB","price":60},"B":{"date":1388534400,"num":2}} |
|c |{"A":{"stock":"MSFT","price":55},"B":{"date":1388534400,"num":3}}|
|d |{"A":{"stock":"GOOG","price":52},"B":{"date":1388620800,"num":4}}|
+---+-----------------------------------------------------------------+
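If the grouping should follow the tag_cols dict from the question instead of being hard-coded, the same approach can be driven by that dict; a small sketch under that assumption (it reuses df and tag_cols as defined above):
from pyspark.sql import functions as F

tag_cols = {'A': ['stock', 'price'], 'B': ['date', 'num']}
# build one struct column per entry in tag_cols, then wrap them all in to_json
struct_cols = [F.struct(*cols).alias(name) for name, cols in tag_cols.items()]
df_new = df.select('tag', F.to_json(F.struct(*struct_cols)).alias('data'))
df_new.show(5, False)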

Column data value consistency check PySpark SQL

I have two tables with the same column names, the same data, and the same number of rows, but the ordering of the rows might differ. Now I select column A from table_1 and column A from table_2 and compare the values. How can I achieve this using PySpark SQL? Can I do a sha2/md5 checksum and compare?
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark.sql import Row
import pyspark.sql.functions as f
app_name="test"
table1="DB1.department"
table2="DB2.department"
conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)
query1="select * from %s" %(table1)
df1 = sqlContext.sql(query1)
query2="select * from %s" %(table2)
df2 = sqlContext.sql(query2)
df3 = sqlContext.sql("""SELECT a.departmentid FROM DB1.department a FULL JOIN
    DB2.department b ON a.departmentid = b.departmentid WHERE a.departmentid
    IS NULL OR b.departmentid IS NULL""")
df5=sqlContext.sql("select md5(departmentid) from department1")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/spark/python/pyspark/sql/context.py", line 580, in sql
return DataFrame(self._ssql_ctx.sql(sqlQuery), self)
File "/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line
813, in __call__
File "/usr/lib/spark/python/pyspark/sql/utils.py", line 51, in deco
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: u"cannot resolve 'md5(departmentid)'
due to data type mismatch: argument 1 requires binary type, however,
'departmentid' is of bigint type.; line 1 pos 11"
When I tried an md5 checksum, it says it expects a binary type, but departmentid is bigint.
Table1:
departmentid departmentname departmentaddress
1 A Newyork
2 B Newjersey
3 C SanJose
4 D WashingtonDC
5 E Mexico
6 F Delhi
7 G Pune
8 H chennai
Table2:
departmentid departmentname departmentaddress
7 G Pune
8 H chennai
1 A Newyork
2 B Newjersey
3 C SanJose
4 D WashingtonDC
5 E Mexico
6 F Delhi
Here, in table two, only the order of the rows has changed; the data itself stayed the same, so technically these two tables are identical. Unless a new row is added or values are modified, the two tables remain identical. (These tables are just for example and explanation; in reality we deal with big data.)
The simplest solution is:
def is_identical(x, y):
    return (x.count() == y.count()) and (x.subtract(y).count() == 0)
Example data:
df1 = spark.createDataFrame(
[(1, "A", "Newyork"), (2, "B", "Newjersey"),
(3, "C", "SanJose"), (4, "D", "WashingtonDC"), (5, "E", "Mexico"), (6, "F", "Delhi"),
(7, "G", "Pune"), (8, "H", "chennai")],
("departmentid", "departmentname", "departmentadd"))
df2 = spark.createDataFrame(
[(7, "G", "Pune"), (8, "H", "chennai"), (1, "A", "Newyork"), (2, "B", "Newjersey"),
(3, "C", "SanJose"), (4, "D", "WashingtonDC"), (5, "E", "Mexico"), (6, "F", "Delhi")],
("departmentid", "departmentname", "departmentadd"))
df3 = spark.createDataFrame(
[(1, "A", "New York"), (2, "B", "New Jersey"),
(3, "C", "SanJose"), (4, "D", "WashingtonDC"), (5, "E", "Mexico"), (6, "F", "Delhi"),
(7, "G", "Pune"), (8, "H", "chennai")],
("departmentid", "departmentname", "departmentadd"))
df4 = spark.createDataFrame(
[(3, "C", "SanJose"), (4, "D", "WashingtonDC"), (5, "E", "Mexico"), (6, "F", "Delhi")],
("departmentid", "departmentname", "departmentadd"))
Checks:
is_identical(df1, df2)
# True
is_identical(df1, df3)
# False
is_identical(df1, df4)
# False
is_identical(df4, df4)
# True
With an outer join:
from pyspark.sql.functions import col, coalesce, lit
from functools import reduce
from operator import and_
def is_identical_(x, y, keys=("departmentid", )):
    def both_null(c):
        return (col("x.{}".format(c)).isNull() &
                col("y.{}".format(c)).isNull())
    def both_equal(c):
        return coalesce((col("x.{}".format(c)) ==
                         col("y.{}".format(c))), lit(False))
    p = reduce(and_, [both_null(c) | both_equal(c) for c in x.columns if c not in keys])
    return (x.alias("x").join(y.alias("y"), list(keys), "full_outer")
            .where(~p).count() == 0)
You'd get the same result:
is_identical_(df1, df2)
# True
is_identical_(df1, df3)
# False
is_identical_(df1, df4)
# False
is_identical_(df4, df4)
# True
md5 is of no use for you here, because it is not an aggregation function. It computes a checksum for a specific value.
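That said, the type-mismatch error above also points at a workaround if a hash-based comparison is really wanted: md5/sha2 expect string or binary input, so the bigint column has to be cast first. A rough sketch of a per-row hash comparison, assuming df1 and df2 from above (the separator and hash length are arbitrary choices):
from pyspark.sql import functions as F

def row_hashes(df):
    # hash each row: cast every column to string and concatenate before sha2
    concatenated = F.concat_ws("||", *[F.col(c).cast("string") for c in df.columns])
    return df.select(F.sha2(concatenated, 256).alias("row_hash"))

# order-independent comparison of the two sets of row hashes
# (subtract is distinct-based, so duplicate rows would need an extra count check)
identical = (row_hashes(df1).subtract(row_hashes(df2)).count() == 0 and
             row_hashes(df2).subtract(row_hashes(df1)).count() == 0)
print(identical)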

Flatmap a collect_set in pyspark dataframe

I have two dataframes, and I'm using collect_set() in agg after a groupby. What's the best way to flatten the resulting array of arrays after aggregating?
schema = ['col1', 'col2', 'col3', 'col4']
a = [[1, [23, 32], [11, 22], [9989]]]
df1 = spark.createDataFrame(a, schema=schema)
b = [[1, [34], [43, 22], [888, 777]]]
df2 = spark.createDataFrame(b, schema=schema)
df = df1.union(
df2
).groupby(
'col1'
).agg(
collect_set('col2').alias('col2'),
collect_set('col3').alias('col3'),
collect_set('col4').alias('col4')
)
df.collect()
I'm getting this as output:
[Row(col1=1, col2=[[34], [23, 32]], col3=[[11, 22], [43, 22]], col4=[[9989], [888, 777]])]
But, I want this as output:
[Row(col1=1, col2=[23, 32, 34], col3=[11, 22, 43], col4=[9989, 888, 777])]
You can use udf:
from itertools import chain
from pyspark.sql.types import *
from pyspark.sql.functions import udf
flatten = udf(lambda x: list(chain.from_iterable(x)), ArrayType(IntegerType()))
df.withColumn('col2_flat', flatten('col2'))
Without a UDF, I suppose this should work:
from pyspark.sql.functions import array_distinct, flatten
df.withColumn('col2_flat', array_distinct(flatten('col2')))
It will flatten the nested arrays and then deduplicate them. (flatten and array_distinct are available from Spark 2.4.)
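For completeness, a short usage sketch applying the same idea to all three aggregated columns at once (column names as in the example above):
from pyspark.sql.functions import array_distinct, flatten, col

df_flat = df.select(
    'col1',
    *[array_distinct(flatten(col(c))).alias(c) for c in ['col2', 'col3', 'col4']]
)
df_flat.collect()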
