splitting dictionary column into multiple columns in pyspark - apache-spark

Column names:
Production_uint_id, batch_id, items_produced, items_discarded
Data (tab-separated):
P188    gv962    {'scissor': 141, 'paper': 274, 'rock': 218}    {'scissor': 14, 'paper': 135, 'rock': 24}
P258    mr005    {'scissor': 151, 'paper': 143, 'rock': 225}    {'scissor': 24, 'paper': 60, 'rock': 17}
Code:
from pyspark.sql.types import *
sc = spark.sparkContext
production_rdd = sc.textFile("/Production_logs.tsv")
production_parts = production_rdd.map(lambda l: l.split("\t"))
production = production_parts.map(lambda p: (p[0], p[1], p[2], p[3].strip()))
schemaStringProduction = "production_unit_id batch_id items_produced items_discarded"
fieldsProduction = [StructField(field_name, StringType(), True) for field_name in schemaStringProduction.split()]
schemaProduction = StructType(fieldsProduction)
schemaProductionDF = spark.createDataFrame(production, schemaProduction)
I am trying to explode the items_produced column:
exploding = schemaProductionDF.select("production_unit_id", explode("items_produced").alias("item_p", "item_p_count"), "items_discarded")
Getting this error:
pyspark.sql.utils.AnalysisException: u"cannot resolve 'explode(`items_produced`)' due to data type mismatch:
input to function explode should be array or map type, not string;
Please help

explode is a UDTF (table-generating function) that returns a new row for each element of an array or map.
For explode: Explode in PySpark
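As a minimal illustration (a sketch using a made-up map column, not your data), exploding a MapType column yields one row per key/value pair:
from pyspark.sql.functions import explode
# Hypothetical one-row DataFrame with a map column, just to show the shape of
# explode's output: one row per (key, value) pair, aliased here as item/count.
demo = spark.createDataFrame([({'scissor': 1, 'rock': 2},)], ['items'])
demo.select(explode('items').alias('item', 'count')).show()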
For your question, please try the code below:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

rdd1 = sc.textFile(r"D:\MOCK_DATA\*_dict.txt")
lineRDD = rdd1.map(lambda line: line.split("\t"))

header = "Production_uint_id,batch_id,items_produced,items_discarded"
col_name = header.split(',')

# eval() turns the dict-literal strings into real dicts; keep only the counts
flatRDD = lineRDD.map(lambda a: (a[0], a[1], list(eval(a[2]).values()), list(eval(a[3]).values())))

DF1 = flatRDD.toDF(col_name)
DF1.printSchema()
from pyspark.sql import functions as f
DF2 = DF1
lst = 'scissor,paper,rock'
col_lst = 'items_produced,items_discarded'
for col_ele in col_lst.split(","):
    count = 0
    for i in lst.split(','):
        DF2 = DF2.withColumn(col_ele + '.' + i, DF2[col_ele][count])
        count = count + 1
DF2.show()
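Alternatively, here is a sketch (assuming the third and fourth fields really are Python dict literals, as in the sample data) that keeps the dicts as a proper MapType column on the question's schemaProductionDF, after which explode works directly and the original AnalysisException goes away:
import ast
from pyspark.sql.functions import explode, udf
from pyspark.sql.types import MapType, StringType, IntegerType
# Parse the dict-literal string into a real map column.
parse_map = udf(ast.literal_eval, MapType(StringType(), IntegerType()))
mapped = schemaProductionDF.withColumn("items_produced", parse_map("items_produced"))
exploded = mapped.select("production_unit_id",
                         explode("items_produced").alias("item_p", "item_p_count"),
                         "items_discarded")
exploded.show()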

Related

How to use chaining in pyspark?

I have a dataframe called Incito, and the Supplier Inv No column of that data frame consists of comma-separated values. I need to recreate the data frame by appropriately repeating those comma-separated values, and I am using the following Python code for that. Can I convert this into PySpark? Is it possible via PySpark?
import numpy as np
import pandas as pd
from itertools import chain

def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

incito['Supplier Inv No'] = incito['Supplier Inv No'].astype(str)
# calculate lengths of splits
lens = incito['Supplier Inv No'].str.split(',').map(len)
# create new dataframe, repeating or chaining as appropriate
dfnew = pd.DataFrame({'Supplier Inv No': chainer(incito['Supplier Inv No']),
                      'Forwarder': np.repeat(incito['Forwarder'], lens),
                      'Mode': np.repeat(incito['Mode'], lens),
                      'File No': np.repeat(incito['File No'], lens),
                      'ETD': np.repeat(incito['ETD'], lens),
                      'Flight No': np.repeat(incito['Flight No'], lens),
                      'Shipped Country': np.repeat(incito['Shipped Country'], lens),
                      'Port': np.repeat(incito['Port'], lens),
                      'Delivered_Country': np.repeat(incito['Delivered_Country'], lens),
                      'AirWeight': np.repeat(incito['AirWeight'], lens),
                      'FREIGHT CHARGE': np.repeat(incito['FREIGHT CHARGE'], lens)})
This is what I tried in PySpark, but I am not getting the expected outcome.
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
import pandas as pd
conf = SparkConf().setAppName("appName").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
ddf = spark.createDataFrame(dfnew)
exploded = ddf.withColumn('d', F.explode("Supplier Inv No"))
exploded.show()
Something like this, using repeat?
from pyspark.sql import functions as F
df = (spark
      .sparkContext
      .parallelize([
          ('ABCD',),
          ('EFGH',),
      ])
      .toDF(['col_a'])
      )

(df
 .withColumn('col_b', F.repeat(F.col('col_a'), 2))
 .withColumn('col_c', F.repeat(F.lit('X'), 10))
 .show()
)
# +-----+--------+----------+
# |col_a| col_b| col_c|
# +-----+--------+----------+
# | ABCD|ABCDABCD|XXXXXXXXXX|
# | EFGH|EFGHEFGH|XXXXXXXXXX|
# +-----+--------+----------+
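If the goal is the row duplication the pandas code performs, splitting the comma-separated column and exploding it is the more direct PySpark equivalent. A sketch, using the ddf defined in the question (column name taken from the pandas snippet):
from pyspark.sql import functions as F
# Turn the comma-separated string into an array, then explode it into one row
# per value; all the other columns are repeated automatically.
exploded = ddf.withColumn('Supplier Inv No',
                          F.explode(F.split(F.col('Supplier Inv No'), ',')))
exploded.show()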

search value in column

I want to search if a column contains a value.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
df_init = pd.DataFrame({'id':['1', '2'], 'val':[100, 200]})
spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()
mySchema = StructType([StructField("id", StringType(), True),
                       StructField("val", IntegerType(), True)])
df = spark.createDataFrame(df_init, schema=mySchema)
if df.filter(df.id == "3"):
    print('Yes')
else:
    print('No')
It always prints 'Yes'.
In a pandas dataframe, I would do:
if '3' in df_init['id'].values:
    print('Yes')
else:
    print('No')
but with PySpark I don't know how to handle this. I tried using 'contains' and 'isin', but I still get the same result.
A DataFrame object is always truthy, so "if df.filter(...)" takes the 'Yes' branch no matter how many rows match. Instead, you can use collect_list to get all the values in the 'id' column as a list and then check whether your element is in that list:
from pyspark.sql import functions as F
if '3' in df.select(F.collect_list('id')).first()[0]:
    print("Yes")
else:
    print('No')
Or just check whether the count after the filter operation is >= 1:
if df.filter(df.id == "3").count() >= 1:
    print("Yes")
else:
    print('No')
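A variant of the same filter idea (just a sketch) avoids counting every matching row by asking only for the first match:
# first() returns None when the filtered DataFrame is empty, so this is
# typically cheaper than count() because one matching row is enough.
if df.filter(df.id == "3").first() is not None:
    print("Yes")
else:
    print('No')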

Error when returning an ArrayType of StructType from UDF (and using a single function in multiple UDFs)

(EDIT) changed the field names (from foo, bar, ... to name and city) because the old naming was confusing
I need to use a single function in multiple UDFs and return different Structs depending on the input.
This simplified version of my implementation basically does what I am looking for:
from pyspark.sql.types import IntegerType, StructType, StringType
from pyspark.sql.functions import udf, when, col

df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')

struct_one = StructType().add('name', StringType(), True)
struct_not_one = StructType().add('city', StringType(), True)

def select(id):
    if id == 1:
        return {'name': 'Alice'}
    else:
        return {'city': 'Seattle'}

one_udf = udf(select, struct_one)
not_one_udf = udf(select, struct_not_one)

df = df.withColumn('one', when((col('id') == 1), one_udf(col('id')))) \
       .withColumn('not_one', when((col('id') != 1), not_one_udf(col('id'))))
display(df)
(EDIT) Output:
id one not_one
1 {"name":"Alice"} null
2 null {"city":"Seattle"}
3 null {"city":"Seattle"}
But the same code returning an ArrayType of StructType unfortunately fails:
from pyspark.sql.types import IntegerType, StructType, StringType, ArrayType
from pyspark.sql.functions import udf, when, col

df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')

struct_one = StructType().add('name', StringType(), True)
struct_not_one = ArrayType(StructType().add('city', StringType(), True))

def select(id):
    if id == 1:
        return {'name': 'Alice'}
    else:
        return [{'city': 'Seattle'}, {'city': 'Milan'}]

one_udf = udf(select, struct_one)
not_one_udf = udf(select, struct_not_one)

df = df.withColumn('one', when((col('id') == 1), one_udf(col('id')))) \
       .withColumn('not_one', when((col('id') != 1), not_one_udf(col('id'))))
display(df)
The error message is:
ValueError: Unexpected tuple 'name' with StructType
(EDIT) Desired Output would be:
id one not_one
1 {"name":"Alice"} null
2 null [{"city":"Seattle"},{"city":"Milan"}]
3 null [{"city":"Seattle"},{"city":"Milan"}]
Returning an ArrayType of other types (StringType, IntegerType, ...) works, though.
Also, returning an ArrayType of StructType works when the function is not shared between multiple UDFs:
from pyspark.sql.types import IntegerType, StructType, StringType, ArrayType
from pyspark.sql.functions import udf, when, col

df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')

struct_not_one = ArrayType(StructType().add('city', StringType(), True))

def select(id):
    return [{'city': 'Seattle'}, {'city': 'Milan'}]

not_one_udf = udf(select, struct_not_one)

df = df.withColumn('not_one', when((col('id') != 1), not_one_udf(col('id'))))
display(df)
(EDIT) Output:
id not_one
1 null
2 [{"city":"Seattle"},{"city":"Milan"}]
3 [{"city":"Seattle"},{"city":"Milan"}]
Why is returning an ArrayType of StructType and using multiple UDFs with one single function not working?
Thanks!
"Spark SQL (including SQL and the DataFrame and Dataset API) does not guarantee the order of evaluation of subexpressions...
Therefore, it is dangerous to rely on the side effects or order of evaluation of Boolean expressions, and the order of WHERE and HAVING clauses, since such expressions and clauses can be reordered during query optimization and planning. Specifically, if a UDF relies on short-circuiting semantics in SQL for null checking, there’s no guarantee that the null check will happen before invoking the UDF."
See Evaluation order and null checking
Because of that, not_one_udf can be invoked on rows where id == 1; there select returns {'name': 'Alice'}, which cannot be converted to ArrayType(StructType), hence the ValueError. To keep your udf generic you could push the 'when' filter into the udf itself:
from pyspark.sql.types import IntegerType, StructType, StringType, ArrayType
from pyspark.sql.functions import udf, col, lit

df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')

struct_one = StructType().add('name', StringType(), True)
struct_not_one = ArrayType(StructType().add('city', StringType(), True))

def select(id, test):
    # The guard that used to live in when() now runs inside the udf itself.
    if eval(test.format(id)) is False:
        return None
    if id == 1:
        return {'name': 'Alice'}
    else:
        return [{'city': 'Seattle'}, {'city': 'Milan'}]

one_udf = udf(select, struct_one)
not_one_udf = udf(select, struct_not_one)

df = df.withColumn('one', one_udf(col('id'), lit('{} == 1'))) \
       .withColumn('not_one', not_one_udf(col('id'), lit('{} != 1')))
display(df)
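If you would rather avoid eval, a variant of the same idea (just a sketch) keeps the question's original one-argument select and moves the guard into thin per-UDF lambda wrappers instead:
from pyspark.sql.functions import udf, col
# select, struct_one and struct_not_one are the question's original one-argument
# definitions; each wrapper returns None for rows it does not apply to, so no
# when() is needed.
one_udf = udf(lambda i: select(i) if i == 1 else None, struct_one)
not_one_udf = udf(lambda i: select(i) if i != 1 else None, struct_not_one)
df = df.withColumn('one', one_udf(col('id'))) \
       .withColumn('not_one', not_one_udf(col('id')))
display(df)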

Error when converting from pyspark RDD to DataFrame: Cannot infer schema of type 'unicode' [duplicate]

Could someone help me solve this problem I have with Spark DataFrame?
When I do myFloatRDD.toDF() I get an error:
TypeError: Can not infer schema for type: type 'float'
I don't understand why...
Example:
myFloatRdd = sc.parallelize([1.0,2.0,3.0])
df = myFloatRdd.toDF()
Thanks
SparkSession.createDataFrame, which is used under the hood, requires an RDD / list of Row/tuple/list/dict* or a pandas.DataFrame, unless a schema with a DataType is provided. Try to convert the float to a tuple, like this:
myFloatRdd.map(lambda x: (x, )).toDF()
or even better:
from pyspark.sql import Row
row = Row("val") # Or some other column name
myFloatRdd.map(row).toDF()
To create a DataFrame from a list of scalars you'll have to use SparkSession.createDataFrame directly and provide a schema***:
from pyspark.sql.types import FloatType
df = spark.createDataFrame([1.0, 2.0, 3.0], FloatType())
df.show()
## +-----+
## |value|
## +-----+
## | 1.0|
## | 2.0|
## | 3.0|
## +-----+
but for a simple range it would be better to use SparkSession.range:
from pyspark.sql.functions import col
spark.range(1, 4).select(col("id").cast("double"))
* No longer supported.
** Spark SQL also provides a limited support for schema inference on Python objects exposing __dict__.
*** Supported only in Spark 2.0 or later.
from pyspark.sql.types import IntegerType, Row
mylist = [1, 2, 3, 4, None ]
l = map(lambda x : Row(x), mylist)
# notice the parens after the type name
df=spark.createDataFrame(l,["id"])
df.where(df.id.isNull() == False).show()
Basically, you need to wrap each int in a Row() first; then you can use the schema.
Inferring the Schema Using Reflection
from pyspark.sql import Row
# spark - sparkSession
sc = spark.sparkContext
# Load a text file and convert each line to a Row.
orders = sc.textFile("/practicedata/orders")
#Split on delimiters
parts = orders.map(lambda l: l.split(","))
#Convert to Row
orders_struct = parts.map(lambda p: Row(order_id=int(p[0]), order_date=p[1], customer_id=p[2], order_status=p[3]))
for i in orders_struct.take(5): print(i)
#convert the RDD to DataFrame
orders_df = spark.createDataFrame(orders_struct)
Programmatically Specifying the Schema
from pyspark.sql.types import StructType, StructField, StringType
# spark - sparkSession
sc = spark.sparkContext
# Load a text file and convert each line to a Row.
orders = sc.textFile("/practicedata/orders")
#Split on delimiters
parts = orders.map(lambda l: l.split(","))
#Convert to tuple
orders_struct = parts.map(lambda p: (p[0], p[1], p[2], p[3].strip()))
#convert the RDD to DataFrame
orders_df = spark.createDataFrame(orders_struct)
# The schema is encoded in a string.
schemaString = "order_id order_date customer_id status"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
ordersDf = spark.createDataFrame(orders_struct, schema)
from pyspark.sql import Row
myFloatRdd.map(lambda x: Row(x)).toDF()

Spark group by - Pig conversion

I am trying to achieve something like this in spark. The following code snippet is from Pig Latin. Is there anyway I can do the same thing with Spark?
A = load 'student' AS (name:chararray, age:int, gpa:float);
DESCRIBE A;
A: {name: chararray, age: int, gpa: float}
DUMP A;
(John,18,4.0F)
(Mary,19,3.8F)
(Bill,20,3.9F)
(Joe,18,3.8F)
B = GROUP A BY age;
Result:
(18,{(John,18,4.0F),(Joe,18,3.8F)})
(19,{(Mary,19,3.8F)})
(20,{(Bill,20,3.9F)})
Thanks.
Getting a list of names per age is easy. I'm not sure the DataFrame API gives you the complete rows grouped as a bag in exactly the same shape as Pig does, though collecting a struct of the columns comes close (see the sketch after the Scala code).
// Input data
val df = {
  import org.apache.spark.sql._
  import org.apache.spark.sql.types._
  import scala.collection.JavaConverters._

  val simpleSchema = StructType(
    StructField("name", StringType) ::
    StructField("age", IntegerType) ::
    StructField("gpa", FloatType) :: Nil)

  val data = List(
    Row("John", 18, 4.0f),
    Row("Mary", 19, 3.8f),
    Row("Bill", 20, 3.9f),
    Row("Joe", 18, 3.8f)
  )

  spark.createDataFrame(data.asJava, simpleSchema)
}
df.show()

import org.apache.spark.sql.functions.{col, collect_list}
val df2 = df.groupBy(col("age")).agg(collect_list(col("name")))
df2.show()
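If you do want the complete rows grouped per age, closer to Pig's bags, collecting a struct of the columns is one way to sketch it in PySpark (column names and sample rows taken from the Pig snippet above):
from pyspark.sql import functions as F
students = spark.createDataFrame(
    [("John", 18, 4.0), ("Mary", 19, 3.8), ("Bill", 20, 3.9), ("Joe", 18, 3.8)],
    ["name", "age", "gpa"])
# One row per age, with the full (name, age, gpa) tuples collected into a list,
# similar to Pig's GROUP A BY age.
grouped = students.groupBy("age").agg(
    F.collect_list(F.struct("name", "age", "gpa")).alias("students"))
grouped.show(truncate=False)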
