search value in column - python-3.x

I want to search if a column contains a value.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
df_init = pd.DataFrame({'id':['1', '2'], 'val':[100, 200]})
spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()
mySchema = StructType([ StructField("id", StringType(), True),
StructField("val", IntegerType(), True)])
df = spark.createDataFrame(df_init, schema=mySchema)
if df.filter(df.id == "3"):
print('Yes')
else:
print('No')
It always prints 'Yes'.
In a pandas dataframe, I would do:
# pandas equivalent: test membership against the values of the 'id' column.
if '3' in df_init['id'].values:
    print('Yes')
else:
    print('No')
but with pyspark I don't know how to handle this.
I tried using 'contains' , 'isin' but still the same.

You can use collect_list to get all the values in the 'id' column as a list. And then check if your element is in this list:
from pyspark.sql import functions as F
if '3' in df.select(F.collect_list('id')).first()[0]:
print("Yes")
else:
print('No')
OR just check if the count is >=1 after the filter operation:
if df.filter(df.id == "3").count() >= 1:
print("Yes")
else:
print('No')

Related

Access accumulator value after using it in user defined function within df.withColumn in Palantir Foundry

I am trying to use a customized accumulator within Palantir Foundry to aggregate Data within
a user defined function which is applied to each row of a dataframe within a statement df.withColumn(...).
From the resulting dataframe, I see, that the incrementation of the accumulator-value happens as expected. However, the value of the accumulator variable itself in the script does not change during the execution.
I see, that the Python-ID of the accumulator variable in the script differs from the Python-ID of the accumulator within the user defined function. But that might be expected...
How do I access, from within the calling script after the execution, the accumulator value whose incrementation can be observed in the resulting dataframe column? That value is the information I am looking for.
from transforms.api import transform_df, Input, Output
import numpy as np
from pyspark.accumulators import AccumulatorParam
from pyspark.sql.functions import udf, struct
global accum
@transform_df(
    Output("ri.foundry.main.dataset.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"),
)
def compute(ctx):
    """Build a small demo DataFrame and update a NumPy accumulator from a UDF.

    For every row the UDF adds ``np.ones(shape)`` to the accumulator and writes
    the accumulator's string form plus its Python ``id`` into the ``processed``
    column. The first two rows of that frame are shown and the accumulator is
    printed from the calling code.

    Args:
        ctx: transform context; ``ctx.spark_session`` supplies the Spark session.

    Returns:
        The original ``df``; the ``processed`` column exists only on ``new``.
    """
    from pyspark.sql.types import StructType, StringType, IntegerType, StructField

    data2 = [
        ("James", "", "Smith", "36636", "M", 3000),
        ("Michael", "Rose", "", "40288", "M", 4000),
        ("Robert", "", "Williams", "42114", "M", 4000),
        ("Maria", "Anne", "Jones", "39192", "F", 4000),
        ("Jen", "Mary", "Brown", "", "F", -1),
    ]
    schema = StructType([
        StructField("firstname", StringType(), True),
        StructField("middlename", StringType(), True),
        StructField("lastname", StringType(), True),
        StructField("id", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", IntegerType(), True),
    ])
    df = ctx.spark_session.createDataFrame(data=data2, schema=schema)

    ####################################
    class AccumulatorNumpyArray(AccumulatorParam):
        """Accumulator parameter that sums NumPy arrays element-wise."""

        def zero(self, zero: np.ndarray):
            # Must be a true zero of the same shape/dtype: returning the value
            # itself would seed each task with the current driver value.
            return np.zeros_like(zero)

        def addInPlace(self, v1, v2):
            return v1 + v2

    # from pyspark.context import SparkContext
    # sc = SparkContext.getOrCreate()
    sc = ctx.spark_session.sparkContext
    shape = 3
    global accum
    accum = sc.accumulator(
        np.zeros(shape, dtype=np.int64),
        AccumulatorNumpyArray(),
    )

    def func(row):
        """Increment the accumulator and describe it as a string."""
        global accum
        accum += np.ones(shape)
        return str(accum) + '_' + str(id(accum))

    user_defined_function = udf(func, StringType())
    new = df.withColumn(
        "processed",
        user_defined_function(struct([df[col] for col in df.columns])),
    )
    new.show(2)
    print(accum)
    return df
results in
+---------+----------+--------+-----+------+------+--------------------+
|firstname|middlename|lastname| id|gender|salary| processed|
+---------+----------+--------+-----+------+------+--------------------+
| James| | Smith|36636| M| 3000|[1. 1. 1.]_140388...|
| Michael| Rose| |40288| M| 4000|[2. 2. 2.]_140388...|
+---------+----------+--------+-----+------+------+--------------------+
only showing top 2 rows
and
> accum
Accumulator<id=0, value=[0 0 0]>
> id(accum)
140574405092256
If the Foundry-Boiler-Plate is removed, resulting in
import numpy as np
from pyspark.accumulators import AccumulatorParam
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import StructType, StringType, IntegerType, StructField
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
spark = (
SparkSession.builder.appName("Python Spark SQL basic example")
.config("spark.some.config.option", "some-value")
.getOrCreate()
)
# ctx = spark.sparkContext.getOrCreate()
data2 = [
("James", "", "Smith", "36636", "M", 3000),
("Michael", "Rose", "", "40288", "M", 4000),
("Robert", "", "Williams", "42114", "M", 4000),
("Maria", "Anne", "Jones", "39192", "F", 4000),
("Jen", "Mary", "Brown", "", "F", -1),
]
schema = StructType(
[
StructField("firstname", StringType(), True),
StructField("middlename", StringType(), True),
StructField("lastname", StringType(), True),
StructField("id", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True),
]
)
# df = ctx.spark_session.createDataFrame(data=data2, schema=schema)
df = spark.createDataFrame(data=data2, schema=schema)
####################################
class AccumulatorNumpyArray(AccumulatorParam):
    """Accumulator parameter that sums NumPy arrays element-wise."""

    def zero(self, zero: np.ndarray):
        """Return an all-zero array with the shape and dtype of ``zero``.

        Returning ``zero`` itself is only correct while the accumulator's
        current value happens to be all zeros; otherwise every task would
        start from that value and it would be counted more than once.
        """
        return np.zeros_like(zero)

    def addInPlace(self, v1, v2):
        """Merge two partial values by element-wise addition."""
        return v1 + v2
sc = SparkContext.getOrCreate()
shape = 3
global accum
accum = sc.accumulator(
np.zeros(shape, dtype=np.int64),
AccumulatorNumpyArray(),
)
def func(row):
global accum
accum += np.ones(shape)
return str(accum) + "_" + str(id(accum))
user_defined_function = udf(func, StringType())
new = df.withColumn(
"processed", user_defined_function(struct([df[col] for col in df.columns]))
)
new.show(2, False)
print(id(accum))
print(accum)
the output obtained within a regular Python environment with pyspark version 3.3.1 on Ubuntu meets the expectations and is
+---------+----------+--------+-----+------+------+--------------------------+
|firstname|middlename|lastname|id |gender|salary|processed |
+---------+----------+--------+-----+------+------+--------------------------+
|James | |Smith |36636|M |3000 |[1. 1. 1.]_139642682452576|
|Michael |Rose | |40288|M |4000 |[1. 1. 1.]_139642682450224|
+---------+----------+--------+-----+------+------+--------------------------+
only showing top 2 rows
140166944013424
[3. 3. 3.]
The code outside of the transform runs in a different environment than the code within your transform. When you commit, your checks run; they execute the code outside the transform to generate the jobspec, which is technically your executable transform. You can find these within the "details" of your dataset after the checks pass.
The logic within your transform is then detached and runs in isolation each time you hit build. The global accum you define outside the transform is never run during the build, so it doesn't exist when the code inside compute is running.
global accum  # <-- runs in checks


@transform_df(
    Output("ri.foundry.main.dataset.c0d4fc0c-bb1d-4c7b-86ce-a13ec6666490"),
)
def compute(ctx):
    ...  # some logic <-- runs during build
The prints in your second code example happen after the df is processed, because new.show(2, False) asks Spark to compute it. The print in the first example, however, happens before the df is processed, since the computation only happens after your return df.
If you want to print after your df is computed, you can use @transform(...) instead of @transform_df(...) and print after writing the dataframe contents. It should look something like this:
@transform(
    output=Output("ri.foundry.main.dataset.c0d4fc0c-bb1d-4c7b-86ce-a13ec6666490"),
)
def compute(ctx, output):
    df = ...  # ... some logic ...
    # NOTE(review): the method name is believed to be write_dataframe -- confirm
    # against the transforms API before relying on it.
    output.write_dataframe(df)
    # Printing after the write means the DataFrame has already been computed.
    print(accum)

how to print a "dictionary" of StringType() in the form of a table with pyspark

Hi,
The above column is part of a table that I am working with in Databricks. What I wish to do is to turn the "ecommerce" col into a table of its own. In this case, it means that I would have a new table with "detail", "products"....etc as columns. Currently "ecommerce" is a StringType.
I have tried using spark dictionary creation, tabulate and other methods but to no success.
The code that I have currently is
def ecommerce_wtchk_dlt():
df = dlt.read_stream("wtchk_dlt")
ddf = df.select(col("ecommerce"))
header = ddf[0].keys()
rows = [x.values() for x in ddf]
dddf = tabulate.tabulate(rows, header)
return dddf
Whenever I try to forcefully set the type of the ecommerce as MapType I have the error that says that since the original datasource is StringType I can only use the same one as well
I have reproduced the above and was able to achieve your requirement in this case by using from_json, json_tuple and explode.
This is my sample data with the same format as yours.
Code:
from pyspark.sql import functions as F
from pyspark.sql.types import *
df2 = df.select(F.json_tuple(df["ecommerce"],"detail")).toDF("detail") \
.select(F.json_tuple(F.col("detail"),"products")).toDF("products")
print("products : ")
df2.show()
schema = ArrayType(StructType([
StructField("name", StringType()),
StructField("id", StringType()),
StructField("price", StringType()),
StructField("brand", StringType()),
StructField("category", StringType()),
StructField("variant", StringType())
]))
final_df=df2.withColumn("products", F.from_json("products", schema)).select(F.explode("products").alias("products")).select("products.*")
print("Final dataframe : ")
final_df.show()
My Result:

Dynamic dictionary in pyspark

I am trying to build a dictionary dynamically using pyspark, by reading the table structure on the oracle database. Here's a simplified version of my code
predefined dictionary (convert_dict.py)
conversions = {
"COL1": lambda c: f.col(c).cast("string"),
"COL2": lambda c: f.from_unixtime(f.unix_timestamp(c, dateFormat)).cast("date"),
"COL3": lambda c: f.from_unixtime(f.unix_timestamp(c, dateFormat)).cast("date"),
"COL4": lambda c: f.col(c).cast("float")
}
Main program
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType
from convert_dict import conversions
spark = SparkSession.builder.appName("file_testing").getOrCreate()
table_name = "TEST_TABLE"
# Forward slashes avoid backslash escapes: in a normal string literal "\f" is a
# form feed, which silently corrupted the "\foo.txt" part of the original path.
input_file_path = "file:///c:/Desktop/foo.txt"
sql_query = "(select listagg(column_name,',') within group(order by column_id) col from user_tab_columns where " \
"table_name = '" + table_name + "' and column_name not in ('COL10', 'COL11','COL12') order by column_id) table_columns"
struct_schema = StructType([\
StructField("COL1", StringType(), True),\
StructField("COL2", StringType(), True),\
StructField("COL3", StringType(), True),\
StructField("COL4", StringType(), True),\
])
data_df = spark.read.schema(struct_schema).option("sep", ",").option("header", "true").csv(input_file_path)
# Flag every column whose conversion yields NULL although the raw value is
# present, i.e. the value could not be converted to the target type.
validateData = data_df.withColumn(
    "dataTypeValidations",
    f.concat_ws(
        ",",
        *[
            f.when(
                v(k).isNull() & f.col(k).isNotNull(),
                f.lit(k + " not valid"),
            ).otherwise(f.lit("None"))
            for k, v in conversions.items()
        ],
    ),
)

# Apply each conversion in place, one column at a time.
data_temp = validateData
for k, v in conversions.items():
    data_temp = data_temp.withColumn(k, v(k))

# NOTE(review): this shows the validated frame, not the converted data_temp --
# confirm whether data_temp.show() was intended.
validateData.show()
spark.stop()
If I am to change the above code to dynamically generate the dictionary from database
DATEFORMAT = "yyyyMMdd"
dict_sql = """
(select column_name,case when data_type = 'VARCHAR2' then 'string' when data_type in ( 'DATE','TIMESTAMP(6)') then 'date' when data_type = 'NUMBER' and NVL(DATA_SCALE,0) <> 0 then 'float' when data_type = 'NUMBER' and NVL(DATA_SCALE,0) = 0 then 'int'
end d_type from user_tab_columns where table_name = 'TEST_TABLE' and column_name not in ('COL10', 'COL11','COL12')) dict
"""
column_df = spark.read.format("jdbc").option("url",url).option("dbtable", dict_sql)\
.option("user",user).option("password",password).option("driver",driver).load()
conversions = {}
for row in column_df.rdd.collect():
column_name = row.COLUMN_NAME
column_type = row.D_TYPE
if column_type == "date":
conversions.update({column_name: lambda c:f.col(c)})
elif column_type == "float":
conversions.update({column_name: lambda c: f.col(c).cast("float")})
elif column_type == "date":
conversions.update({column_name: lambda c: f.from_unixtime(f.unix_timestamp(c, DATEFORMAT)).cast("date")})
elif column_type == "int":
conversions.update({column_name: lambda c: f.col(c).cast("int")})
else:
conversions.update({column_name: lambda c: f.col(c)})
The conversion of data types doesn't work when the above dynamically generated dictionary is used. For example: if "COL2" contains "20210731", the resulting data from the above code stays the same, i.e. it doesn't get converted to the correct date format, whereas the predefined dictionary works correctly.
Am I missing something here or is there a better way to implement dynamically generated dictionaries in pyspark?
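I had a rookie mistake in my code: the if/elif block contained two separate branches for column_type == "date". The first one (which only returns f.col(c) and was presumably meant to test for "string") always matched first, so the branch that actually performs the date conversion was never reached.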

Error when returning an ArrayType of StructType from UDF (and using a single function in multiple UDFs)

(EDIT) changed field names (from foo, bar,... to name and city) because old naming was confusing
I need to use a single function in multiple UDFs and return different Structs depending on the input.
This simplified version of my implementation basically does what I am looking for:
from pyspark.sql.types import IntegerType, StructType, StringType
from pyspark.sql.functions import when, col
df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')
struct_one = StructType().add('name', StringType(), True)
struct_not_one = StructType().add('city', StringType(), True)
def select(id):
    """Return a ``name`` struct for id 1 and a ``city`` struct for any other id."""
    if id == 1:
        return {'name': 'Alice'}
    else:
        return {'city': 'Seattle'}
one_udf = udf(select, struct_one)
not_one_udf = udf(select, struct_not_one)
df = df.withColumn('one', when((col('id') == 1), one_udf(col('id'))))\
.withColumn('not_one', when((col('id') != 1), not_one_udf(col('id'))))
display(df)
(EDIT) Output:
id one not_one
1 {"name":"Alice"} null
2 null {"city":"Seattle"}
3 null {"city":"Seattle"}
But the same code returning an ArrayType of StructType unfortunately fails:
from pyspark.sql.types import IntegerType, StructType, StringType, ArrayType
from pyspark.sql.functions import when, col
df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')
struct_one = StructType().add('name', StringType(), True)
struct_not_one = ArrayType(StructType().add('city', StringType(), True))
def select(id):
    """Return a ``name`` struct for id 1 and a list of ``city`` structs otherwise."""
    if id == 1:
        return {'name': 'Alice'}
    else:
        return [{'city': 'Seattle'}, {'city': 'Milan'}]
one_udf = udf(select, struct_one)
not_one_udf = udf(select, struct_not_one)
df = df.withColumn('one', when((col('id') == 1), one_udf(col('id'))))\
.withColumn('not_one', when((col('id') != 1), not_one_udf(col('id'))))
display(df)
The error message is:
ValueError: Unexpected tuple 'name' with StructType
(EDIT) Desired Output would be:
id one not_one
1 {"name":"Alice"} null
2 null [{"city":"Seattle"},{"city":"Milan"}]
3 null [{"city":"Seattle"},{"city":"Milan"}]
Returning an ArrayType of other types (StringType, IntegerType, ...), for example, works, though.
Also returning an Array of StructType when not using a single function in multiple UDFs works:
from pyspark.sql.types import IntegerType, StructType, StringType, ArrayType
from pyspark.sql.functions import when, col
df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')
struct_not_one = ArrayType(StructType().add('city', StringType(), True))
def select(id):
    """Return the same list of ``city`` structs regardless of ``id``."""
    return [{'city': 'Seattle'}, {'city': 'Milan'}]
not_one_udf = udf(select, struct_not_one)
df = df.withColumn('not_one', when((col('id') != 1), not_one_udf(col('id'))))
display(df)
(EDIT) Output:
id not_one
1 null
2 [{"city":"Seattle"},{"city":"Milan"}]
3 [{"city":"Seattle"},{"city":"Milan"}]
Why is returning an ArrayType of StructType and using multiple UDFs with one single function not working?
Thanks!
"Spark SQL (including SQL and the DataFrame and Dataset API) does not guarantee the order of evaluation of subexpressions...
Therefore, it is dangerous to rely on the side effects or order of evaluation of Boolean expressions, and the order of WHERE and HAVING clauses, since such expressions and clauses can be reordered during query optimization and planning. Specifically, if a UDF relies on short-circuiting semantics in SQL for null checking, there’s no guarantee that the null check will happen before invoking the UDF."
See Evaluation order and null checking
To keep your udf generic you could push the 'when filter' into your udf:
from pyspark.sql.types import IntegerType, StructType, StringType, ArrayType
from pyspark.sql.functions import when, col, lit
df = spark.createDataFrame([1, 2, 3], IntegerType()).toDF('id')
struct_one = StructType().add('name', StringType(), True)
struct_not_one = ArrayType(StructType().add('city', StringType(), True))
def select(id, test):
    """Return the struct(s) for ``id``, or None when ``test`` rejects it.

    Args:
        id: the row's id value.
        test: a Python expression template with one ``{}`` placeholder for
            ``id`` (e.g. ``'{} == 1'``). It acts as the filter that would
            otherwise live in a surrounding ``when(...)``.

    Returns:
        None if the formatted expression evaluates to ``False``; otherwise
        ``{'name': 'Alice'}`` for id 1 and a list of city dicts for other ids.
    """
    # NOTE(review): eval() executes arbitrary code -- `test` must only ever be
    # a trusted, hard-coded template, never user-supplied data.
    if eval(test.format(id)) is False:
        return None
    if id == 1:
        return {'name': 'Alice'}
    else:
        return [{'city': 'Seattle'}, {'city': 'Milan'}]
one_udf = udf(select, struct_one)
not_one_udf = udf(select, struct_not_one)
df = df.withColumn('one', one_udf(col('id'), lit('{} == 1')))\
.withColumn('not_one', not_one_udf(col('id'), lit('{} != 1')))
display(df)

splitting dictionary column into multiple columns in pyspark

Column Names
Production_uint_id,batch_id,items_produced,items_discarded
Data:
P188 gv962 {'scissor': 141, 'paper': 274, 'rock': 218}
{'scissor': 14,'paper': 135, 'rock': 24}
P258 mr005 {'scissor': 151, 'paper': 143, 'rock': 225}
{'scissor': 24, 'paper': 60, 'rock': 17}
Code:
from pyspark.sql.types import *
sc = spark.sparkContext
production_rdd = sc.textFile("/Production_logs.tsv")
production_parts = production_rdd.map(lambda l: l.split("\t"))
production = production_parts.map(lambda p: (p[0], p[1], p[2], p[3].strip()))
schemaStringProduction = "production_unit_id batch_id items_produced items_discarded"
fieldsProduction = [StructField(field_name, StringType(), True) for field_name in schemaStringProduction.split()]
schemaProduction = StructType(fieldsProduction)
schemaProductionDF = spark.createDataFrame(production, schemaProduction)
I am Trying to explode
exploding = schemaProductionDF.select("production_unit_id", explode("items_produced").alias("item_p", "item_p_count"), "items_discarded")
Getting this error:
pyspark.sql.utils.AnalysisException: u"cannot resolve 'explode(`items_produced`)' due to data type mismatch:
input to function explode should be array or map type, not string;
Please help
Explode is a UDTF (table-generating function), which returns a new Row for each array element.
For explode: Explode in PySpark
For your question please try below code:
from pyspark import SparkContext
from pyspark.sql import Row
sc= SparkContext.getOrCreate()
import pandas as pd
rdd1=sc.textFile("D:\MOCK_DATA\*_dict.txt")
lineRDD=rdd1.map(lambda line: line.split("\t"))
header="Production_uint_id,batch_id,items_produced,items_discarded"
col_name=[x.encode("utf-8") for x in header.split(',')]
production = lineRDD.map(lambda p: (eval(p[0]), eval(p[1]), eval(p[2]), eval(p[3]).strip()))
flatRDD=lineRDD.map(lambda a : ((a[0],a[1],eval(a[2]).values(),eval(a[3]).values())))
DF1=flatRDD.toDF(col_name)
DF1.printSchema()
from pyspark.sql import functions as f
DF2=DF1
lst = 'scissor,paper,rock'
col_lst = 'items_produced,items_discarded'
# For each source column, add one new column per item, named
# "<source column>.<item>" and read from the matching position of the source.
# Presumably the positions follow the key order of the original dicts
# (scissor, paper, rock) -- verify against the input data.
for col_ele in col_lst.split(","):
    for count, i in enumerate(lst.split(',')):
        DF2 = DF2.withColumn(col_ele + '.' + i, DF2[col_ele][count])
DF2.show()

Resources