Data type conversion in spark - python-3.x

I have a column id which originally had type int but was later changed to bigint.
The stored data contains values of both types.
from pyspark.sql.functions import *
from pyspark.sql.types import *
df = spark.read.parquet('hdfs path')
df = df.select("id", "code")
df=df.withColumn("id1", df["id"].cast(LongType()))
res1=df.select("id1", "code")
res1.show(1, False)
It shows me the data frame, but when I try to perform some operations on it, they fail.
example:
res1.groupBy('code').agg(countDistinct("id1")).show(1, False)
I get Column: [id], Expected: int, Found: INT64
I tried mergeSchema did not work either.

from pyspark.sql.functions import *
from pyspark.sql.types import *
df1 = spark.read.parquet('hdfs path')
df2 = df1.select("id", "code")
df3 = df2.withColumn("id1", df2["id"].cast(LongType()))
res1=df3.select("id1", "code")
res1.show(1, False)
res1.groupBy("code").agg(countDistinct("id1")).show(1, False)
This should work. In spark Dataframes are immutable so you should not assign the value of transformation operation to a same df variable, you should use a different variable name. In scala it would give you compile time error but in python its allowed so you don't notice it.
if you want you could also chain all of your transformation and get a single df variable and perform groupby operation on it as below :
df = spark.read.parquet('hdfs path').select("id", "code").withColumn("id1", col("id").cast(LongType())).select("id1", "code")
df.groupBy("code").agg(countDistinct("id1")).show(1, False)

Related

Spark order by second field to perform timeseries function

I have a csv with a timeseries:
timestamp, measure-name, value, type, quality
1503377580,x.x-2.A,0.5281250,Float,GOOD
1503377340,x.x-1.B,0.0000000,Float,GOOD
1503377400,x.x-1.B,0.0000000,Float,GOOD
The measure-name should be my partition key and I would like to calculate a moving average with pyspark, here my code (for instance) to calculate the max
def mysplit(line):
    """Parse one CSV line into a ``(measure_name, value)`` pair.

    The line is split on commas; field 1 is returned unchanged as the key
    and field 2 is converted to ``float``.

    Raises:
        IndexError: if the line has fewer than three comma-separated fields.
        ValueError: if the third field cannot be parsed as a float
            (for example a header row).
    """
    fields = line.split(",")
    return (fields[1], float(fields[2]))
text_file.map(lambda line: mysplit(line)).reduceByKey(lambda a, b: max(a , b)).foreach(print)
However, for the average I would like to respect the timestamp ordering.
How to order by a second column?
You need to use a window function on pyspark dataframes:
First you should transform your rdd to a dataframe:
from pyspark.sql import HiveContext
hc = HiveContext(sc)
# Split every CSV line into its fields, then name the resulting columns.
# The column-name list is the second argument of createDataFrame, not of map().
# NOTE: str.split yields strings, so every column is a string column here;
# cast 'timestamp' / 'value' if numeric types are needed.
df = hc.createDataFrame(
    text_file.map(lambda l: l.split(',')),
    ['timestamp', 'measure-name', 'value', 'type', 'quality'],
)
Or load it directly as a dataframe:
local:
import pandas as pd
df = hc.createDataFrame(pd.read_csv(path_to_csv, sep=",", header=0))
from hdfs:
df = hc.read.format("com.databricks.spark.csv").option("delimiter", ",").load(path_to_csv)
Then use a window function:
from pyspark.sql import Window
import pyspark.sql.functions as psf
w = Window.orderBy('timestamp')
df.withColumn('value_rol_mean', psf.mean('value').over(w))
+----------+------------+--------+-----+-------+-------------------+
| timestamp|measure_name| value| type|quality| value_rol_mean|
+----------+------------+--------+-----+-------+-------------------+
|1503377340| x.x-1.B| 0.0|Float| GOOD| 0.0|
|1503377400| x.x-1.B| 0.0|Float| GOOD| 0.0|
|1503377580| x.x-2.A|0.528125|Float| GOOD|0.17604166666666665|
+----------+------------+--------+-----+-------+-------------------+
In .orderBy you can order by as many columns as you want.

HiveQL to PySpark - issue with aggregated column in SELECT statement

I have the following HQL script which needs to be put into pyspark (Spark 1.6):
insert into table db.temp_avg
select
a,
avg(b) ,
c
from db.temp WHERE flag is not null GROUP BY a, c;
I created a few versions of the Spark code, but I'm struggling with how to get this averaged column into the select.
I also found out that grouped data cannot be written this way:
df3 = df2.groupBy...
df3.write.mode('overwrite').saveAsTable('db.temp_avg')
part of pyspark code:
temp_table = sqlContext.table("db.temp")
df = temp_table.select('a', 'avg(b)', 'c', 'flag').toDF('a', 'avg(b)', 'c', 'flag')
df = df.where(['flag'] != 'null'))
# this ofc does not work along with the avg(b)
df2 = df.groupBy('a', 'c')
df3.write.mode('overwrite').saveAsTable('db.temp_avg')
Thx for your help.
Correct solution:
import pyspark.sql.functions as F
# Read from the source table db.temp; db.temp_avg is the destination table.
df = sqlContext.sql("SELECT * FROM db.temp").alias("temp")
# Drop rows with a NULL flag while the column is still present, then
# average b per (a, c) group. The Column method is isNotNull (not isNotNULL).
# The final select restores the (a, avg(b), c) column order of the HQL query.
df = df.filter(F.col("temp.flag").isNotNull())\
    .select('a', 'b', 'c')\
    .groupby('a', 'c')\
    .agg(F.avg('b').alias("avg_b"))\
    .select('a', 'avg_b', 'c')
import pyspark.sql.functions as F
# Read from the source table db.temp; db.temp_avg is the destination table.
df = sqlContext.sql("select * from db.temp")
# Filter on flag before it is projected away; column names must be quoted
# strings, and the Column method is isNotNull (not isNotNULL).
# The final select restores the (a, avg(b), c) column order of the HQL query.
df = df.filter(F.col("flag").isNotNull())\
    .select('a',
            'b',
            'c')\
    .groupby('a', 'c')\
    .agg(F.avg('b').alias("avg_b"))\
    .select('a', 'avg_b', 'c')
Then you can save the table by
# Persist through the DataFrameWriter API (df.write); calling saveAsTable
# directly on the DataFrame is the deprecated pre-1.4 form.
df.write.saveAsTable("tabe_name")

Calculate time between two dates in pyspark

Hoping this is fairly elementary. I have a Spark dataframe containing a Date column, I want to add a new column with number of days since that date. Google fu is failing me.
Here's what I've tried:
from pyspark.sql.types import *
import datetime
today = datetime.date.today()
schema = StructType([StructField("foo", DateType(), True)])
l = [(datetime.date(2016,12,1),)]
df = sqlContext.createDataFrame(l, schema)
df = df.withColumn('daysBetween',today - df.foo)
df.show()
it fails with error:
u"cannot resolve '(17212 - foo)' due to data type mismatch: '(17212 -
foo)' requires (numeric or calendarinterval) type, not date;"
I've tried fiddling around but gotten nowhere. I can't think that this is too hard. Can anyone help?
OK, figured it out
from pyspark.sql.types import *
import pyspark.sql.functions as funcs
import datetime
today = datetime.date(2017,2,15)
schema = StructType([StructField("foo", DateType(), True)])
l = [(datetime.date(2017,2,14),)]
df = sqlContext.createDataFrame(l, schema)
df = df.withColumn('daysBetween',funcs.datediff(funcs.lit(today), df.foo))
df.collect()
returns [Row(foo=datetime.date(2017, 2, 14), daysBetween=1)]
You can simply do the following:
import pyspark.sql.functions as F
df = df.withColumn('daysSince', F.datediff(F.current_date(), df.foo))

Error when converting from pyspark RDD to DataFrame: Cannot infer schema of type 'unicode' [duplicate]

Could someone help me solve this problem I have with Spark DataFrame?
When I do myFloatRDD.toDF() I get an error:
TypeError: Can not infer schema for type: type 'float'
I don't understand why...
Example:
myFloatRdd = sc.parallelize([1.0,2.0,3.0])
df = myFloatRdd.toDF()
Thanks
SparkSession.createDataFrame, which is used under the hood, requires an RDD / list of Row/tuple/list/dict* or pandas.DataFrame, unless schema with DataType is provided. Try to convert float to tuple like this:
myFloatRdd.map(lambda x: (x, )).toDF()
or even better:
from pyspark.sql import Row
row = Row("val") # Or some other column name
myFloatRdd.map(row).toDF()
To create a DataFrame from a list of scalars you'll have to use SparkSession.createDataFrame directly and provide a schema***:
from pyspark.sql.types import FloatType
df = spark.createDataFrame([1.0, 2.0, 3.0], FloatType())
df.show()
## +-----+
## |value|
## +-----+
## | 1.0|
## | 2.0|
## | 3.0|
## +-----+
but for a simple range it would be better to use SparkSession.range:
from pyspark.sql.functions import col
spark.range(1, 4).select(col("id").cast("double"))
* No longer supported.
** Spark SQL also provides a limited support for schema inference on Python objects exposing __dict__.
*** Supported only in Spark 2.0 or later.
from pyspark.sql.types import IntegerType, Row
mylist = [1, 2, 3, 4, None ]
l = map(lambda x : Row(x), mylist)
# notice the parens after the type name
df=spark.createDataFrame(l,["id"])
df.where(df.id.isNull() == False).show()
Basically, you need to wrap each int in a Row(); then the schema can be applied.
Inferring the Schema Using Reflection
from pyspark.sql import Row
# spark - sparkSession
sc = spark.sparkContext
# Load a text file and convert each line to a Row.
orders = sc.textFile("/practicedata/orders")
#Split on delimiters
parts = orders.map(lambda l: l.split(","))
#Convert to Row
orders_struct = parts.map(lambda p: Row(order_id=int(p[0]), order_date=p[1], customer_id=p[2], order_status=p[3]))
for i in orders_struct.take(5): print(i)
#convert the RDD to DataFrame
orders_df = spark.createDataFrame(orders_struct)
Programmatically Specifying the Schema
from pyspark.sql import Row
# spark - sparkSession
sc = spark.sparkContext
# Load a text file and convert each line to a Row.
orders = sc.textFile("/practicedata/orders")
#Split on delimiters
parts = orders.map(lambda l: l.split(","))
#Convert to tuple
orders_struct = parts.map(lambda p: (p[0], p[1], p[2], p[3].strip()))
#convert the RDD to DataFrame
orders_df = spark.createDataFrame(orders_struct)
from pyspark.sql.types import StringType, StructField, StructType
# The schema is encoded in a string.
schemaString = "order_id order_date customer_id status"
# One nullable string field per column name.
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
# Apply the explicit schema to the RDD of tuples.
ordersDf = spark.createDataFrame(orders_struct, schema)
from pyspark.sql import Row
myFloatRdd.map(lambda x: Row(x)).toDF()

Filtering rows in Spark Dataframe based on multiple values in a list [duplicate]

I want to filter a Pyspark DataFrame with a SQL-like IN clause, as in
sc = SparkContext()
sqlc = SQLContext(sc)
df = sqlc.sql('SELECT * from my_df WHERE field1 IN a')
where a is the tuple (1, 2, 3). I am getting this error:
java.lang.RuntimeException: [1.67] failure: ``('' expected but identifier a found
which is basically saying it was expecting something like '(1, 2, 3)' instead of a.
The problem is I can't manually write the values in a as it's extracted from another job.
How would I filter in this case?
The string you pass to SQLContext is evaluated in the scope of the SQL environment. It doesn't capture the closure. If you want to pass a variable, you'll have to do it explicitly using string formatting:
df = sc.parallelize([(1, "foo"), (2, "x"), (3, "bar")]).toDF(("k", "v"))
df.registerTempTable("df")
sqlContext.sql("SELECT * FROM df WHERE v IN {0}".format(("foo", "bar"))).count()
## 2
Obviously this is not something you would use in a "real" SQL environment due to security considerations but it shouldn't matter here.
In practice DataFrame DSL is a much better choice when you want to create dynamic queries:
from pyspark.sql.functions import col
df.where(col("v").isin({"foo", "bar"})).count()
## 2
It is easy to build and compose and handles all details of HiveQL / Spark SQL for you.
Reiterating what @zero323 has mentioned above: we can do the same thing using a list as well (not only a set), like below:
from pyspark.sql.functions import col
df.where(col("v").isin(["foo", "bar"])).count()
Just a little addition/update:
choice_list = ["foo", "bar", "jack", "joan"]
If you want to filter your dataframe "df", such that you want to keep rows based upon a column "v" taking only the values from choice_list, then
from pyspark.sql.functions import col
df_filtered = df.where( ( col("v").isin (choice_list) ) )
You can also do this for integer columns:
df_filtered = df.filter("field1 in (1,2,3)")
or this for string columns:
df_filtered = df.filter("field1 in ('a','b','c')")
A slightly different approach that worked for me is to filter with a custom filter function.
def filter_func(a):
    """Build a boolean UDF that tests membership in a broadcast collection.

    Args:
        a: broadcast variable; its ``.value`` holds the allowed values.

    Returns:
        The result of ``udf(..., BooleanType())`` wrapping the membership
        test, ready to be applied to a column.
    """
    def filter_func_(value):
        """Return True when ``value`` is one of the broadcast values."""
        # Parameter is named ``value`` so it does not shadow pyspark's col().
        return value in a.value
    return udf(filter_func_, BooleanType())
# Broadcasting lets large variables be shipped to the executors efficiently.
a = sc.broadcast((1, 2, 3))
# Keep only the rows whose field1 is one of the broadcast values.
df = my_df.filter(filter_func(a)(col('field1')))
from pyspark.sql import SparkSession
import pandas as pd
spark=SparkSession.builder.appName('Practise').getOrCreate()
df_pyspark=spark.read.csv('datasets/myData.csv',header=True,inferSchema=True)
# Register the DataFrame that was just loaded (df_pyspark) as temp view "df"
# so it can be queried with SQL.
df_pyspark.createOrReplaceTempView("df")
spark.sql("SELECT * FROM df where Departments in ('IOT','Big Data') order by Departments").show()

Resources