Adding a Vectors Column to a pyspark DataFrame - apache-spark

How do I add a Vectors.dense column to a pyspark dataframe?
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.linalg import DenseVector
py_df = pd.DataFrame.from_dict({"time": [59., 115., 156., 421.], "event": [1, 1, 1, 0]})
sc = SparkContext(master="local")
sqlCtx = SQLContext(sc)
sdf = sqlCtx.createDataFrame(py_df)
sdf.withColumn("features", DenseVector(1))
Gives an error in file anaconda3/lib/python3.6/site-packages/pyspark/sql/dataframe.py, line 1848:
AssertionError: col should be Column
It doesn't like the DenseVector type as a column. Essentially, I have a pandas dataframe that I'd like to transform into a pyspark dataframe and add a column of type Vectors.dense. Is there another way of doing this?

Constant Vectors cannot be added as a literal. You have to use a udf:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT
one = udf(lambda: DenseVector([1]), VectorUDT())
sdf.withColumn("features", one()).show()
But I am not sure why you need that at all. If you want to transform existing columns into Vectors, use the appropriate pyspark.ml tools, like VectorAssembler (see: Encode and assemble multiple features in PySpark):
from pyspark.ml.feature import VectorAssembler
VectorAssembler(inputCols=["time"], outputCol="features").transform(sdf)
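For a quick sanity check, here is a small sketch of my own (reusing the sdf built in the question); each assembled features value should be a length-1 dense vector built from time:
from pyspark.ml.feature import VectorAssembler
assembled = VectorAssembler(inputCols=["time"], outputCol="features").transform(sdf)
assembled.select("time", "features").show()
# expect rows like (59.0, [59.0]), (115.0, [115.0]), ...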

Related

How to use a scikit pickle model in spark structured streaming? [duplicate]

I'm trying to apply a scikit-learn model, retrieved using a pickle, to every row of a structured streaming dataframe.
I've tried using pandas_udf (version code 1), and it gives me this error:
AttributeError: 'numpy.ndarray' object has no attribute 'isnull'
Code:
inputPath = "/FileStore/df_training/streaming_df_1_nh_nd/"
from pyspark.sql import functions as f
from pyspark.sql.types import *
data_schema = data_spark_ts.schema
import pandas as pd
from pyspark.sql.functions import col, pandas_udf, PandasUDFType # user-defined functions for pandas DataFrames
from pyspark.sql.types import LongType
get_prediction = pandas_udf(lambda x: gb2.predict(x), IntegerType())
streamingInputDF = (
    spark
    .readStream
    .schema(data_schema)              # Set the schema of the JSON data
    .option("maxFilesPerTrigger", 1)  # Treat a sequence of files as a stream by picking one file at a time
    .csv(inputPath)
    .fillna(0)
    .withColumn("prediction", get_prediction(f.struct([col(x) for x in data_spark.columns])))
)
display(streamingInputDF.select("prediction"))
I've also tried using a normal udf instead of the pandas_udf, and it gives me this error:
ValueError: Expected 2D array, got 1D array instead:
[.. ... .. ..]
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
I don't know how to reshape my data.
The model I'm trying to apply is retrieved this way:
#load the pickle
import pickle
gb2 = None
with open('pickle_modello_unico.p', 'rb') as fp:
    gb2 = pickle.load(fp)
And its specification is:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
    learning_rate=0.1, loss='deviance', max_depth=3,
    max_features=None, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=300,
    n_iter_no_change=None, presort='auto', random_state=None,
    subsample=1.0, tol=0.0001, validation_fraction=0.1,
    verbose=0, warm_start=False)
Any help solving this?
I solved the issue by returning a pd.Series from the pandas_udf.
Here is the working code:
inputPath = "/FileStore/df_training/streaming_df_1_nh_nd/"
from pyspark.sql import functions as f
from pyspark.sql.types import *
data_schema = data_spark_ts.schema
import pandas as pd
from pyspark.sql.functions import col, pandas_udf, PandasUDFType # user-defined functions for pandas DataFrames
from pyspark.sql.types import LongType
get_prediction = pandas_udf(lambda x: pd.Series(gb2.predict(x)), StringType())
streamingInputDF = (
    spark
    .readStream
    .schema(data_schema)              # Set the schema of the JSON data
    .option("maxFilesPerTrigger", 1)  # Treat a sequence of files as a stream by picking one file at a time
    .csv(inputPath)
    .withColumn("prediction", get_prediction(f.struct([col(x) for x in data_spark.columns])))
)
display(streamingInputDF.select("prediction"))
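As a side note (my addition, not part of the original answer): display() is a Databricks notebook helper. Outside Databricks you would start the streaming query yourself, for example with a console sink:
query = (streamingInputDF.select("prediction")
    .writeStream
    .format("console")
    .outputMode("append")
    .start())
query.awaitTermination()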

Apply StopWordsRemover and RegexTokenizer to multiple columns in spark 2.4.3

I have the following dataframe, df4:
|Itemno |fits_assembly_id |fits_assembly_name |assembly_name
|0450056 |13039 135502 141114 4147 138865 2021 9164 |OIL PUMP ASSEMBLY A01EA09CA 4999202399920239A06 A02EA09CA A02EA09CB A02EA09CC |OIL PUMP ASSEMBLY 999202399920239A06
and I am using the following code to process/clean the above data frame:
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
from pyspark.sql.functions import expr
# Task-1: Regex Tokenizer
tk = RegexTokenizer(pattern=r'(?:\p{Punct}|\s)+', inputCol='fits_assembly_name', outputCol='temp1')
df5 = tk.transform(df4)
#Task-2: StopWordsRemover
sw = StopWordsRemover(inputCol='temp1', outputCol='temp2')
df6 = sw.transform(df5)
# #Task-3: Remove duplicates
df7 = df6.withColumn('fits_assembly_name', expr('concat_ws(" ", array_distinct(temp2))')) \
         .drop('temp1', 'temp2')
I want to process both columns fits_assembly_name and assembly_name with RegexTokenizer & StopWordsRemover in one go. Could you please share how this can be achieved?
You can loop over the columns and use a pyspark.ml.Pipeline to skip the intermediate dataframes; see below:
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
from pyspark.ml import Pipeline
from pyspark.sql.functions import expr
# df4 is the initial dataframe and new result will overwrite it.
for col in ['fits_assembly_name', 'assembly_name']:
    tk = RegexTokenizer(pattern=r'(?:\p{Punct}|\s)+', inputCol=col, outputCol='temp1')
    sw = StopWordsRemover(inputCol='temp1', outputCol='temp2')
    pipeline = Pipeline(stages=[tk, sw])
    df4 = pipeline.fit(df4).transform(df4) \
        .withColumn(col, expr('concat_ws(" ", array_distinct(temp2))')) \
        .drop('temp1', 'temp2')
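If you prefer a single fit/transform pass instead of the loop, here is a sketch of my own (the _tok and _clean column suffixes are just illustrative names): build all the stages with list comprehensions and run them through one Pipeline.
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import expr
cols = ['fits_assembly_name', 'assembly_name']
tokenizers = [RegexTokenizer(pattern=r'(?:\p{Punct}|\s)+', inputCol=c, outputCol=c + '_tok') for c in cols]
removers = [StopWordsRemover(inputCol=c + '_tok', outputCol=c + '_clean') for c in cols]
df_clean = Pipeline(stages=tokenizers + removers).fit(df4).transform(df4)
for c in cols:
    # de-duplicate tokens, rebuild the string column, then drop the helper columns
    df_clean = df_clean.withColumn(c, expr('concat_ws(" ", array_distinct({}_clean))'.format(c))) \
        .drop(c + '_tok', c + '_clean')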

Appending column name to column value using Spark

I have data in a comma separated file, and I have loaded it into a Spark data frame:
The data looks like:
A B C
1 2 3
4 5 6
7 8 9
I want to transform the above data frame using pyspark so that it becomes:
A B C
A_1 B_2 C_3
A_4 B_5 C_6
--------------
Then convert it to a list of lists using pyspark:
[[ A_1 , B_2 , C_3],[A_4 , B_5 , C_6]]
And then run the FP-Growth algorithm using pyspark on the above data set.
The code that I have tried is below:
from pyspark.sql.functions import col, size
from pyspark.sql.functions import *
import pyspark.sql.functions as func
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import Row
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StringType
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/tables/data.csv")
names=df.schema.names
Then I thought of doing something inside a for loop:
for name in names:
-----
------
After this I will be using fpgrowth:
df = spark.createDataFrame([
    (0, ["A_1", "B_2", "C_3"]),
    (1, ["A_4", "B_5", "C_6"])], ["id", "items"])
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)
There are a number of concepts here, shown in pyspark, for those who normally use Scala. It is somewhat different from Scala, but instructive; I certainly learned something about zipWithIndex in pyspark myself.
The first part gets the data into the desired format; there are probably too many imports, but I'm leaving them as is:
from functools import reduce
from pyspark.sql.functions import lower, col, lit, concat, split
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql import functions as f
source_df = spark.createDataFrame(
    [
        (1, 11, 111),
        (2, 22, 222)
    ],
    ["colA", "colB", "colC"]
)
intermediate_df = reduce(
    lambda df, col_name: df.withColumn(col_name, concat(lit(col_name), lit("_"), col(col_name))),
    source_df.columns,
    source_df
)
allCols = [x for x in intermediate_df.columns]
result_df = intermediate_df.select(f.concat_ws(',', *allCols).alias('CONCAT_COLS'))
result_df = result_df.select(split(col("CONCAT_COLS"), ",\s*").alias("ARRAY_COLS"))
# Add 0, 1, 2, 3, ... with zipWithIndex; we add it at the back, but that does not matter, you can move it around.
# Get the new structure: the existing fields (one in this case, but done flexibly) plus the zipWithIndex value.
schema = StructType(result_df.schema.fields[:] + [StructField("index", LongType(), True)])
# Need this dict approach with pyspark, different to Scala.
rdd = result_df.rdd.zipWithIndex()
rdd1 = rdd.map(
    lambda row: tuple(row[0].asDict()[c] for c in schema.fieldNames()[:-1]) + (row[1],)
)
final_result_df = spark.createDataFrame(rdd1, schema)
final_result_df.show(truncate=False)
returns:
+---------------------------+-----+
|ARRAY_COLS |index|
+---------------------------+-----+
|[colA_1, colB_11, colC_111]|0 |
|[colA_2, colB_22, colC_222]|1 |
+---------------------------+-----+
The second part is the old zipWithIndex approach in pyspark, if you need 0, 1, ... indices. It's painful compared to Scala.
In general this is easier to solve in Scala.
I'm not sure about performance; it's not a foldLeft, which is interesting. I think it is OK, actually.
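To close the loop on the question's goal, a sketch of my own (not part of the original answer): the ARRAY_COLS column produced above can be fed straight into FPGrowth, reusing the support and confidence values from the question.
from pyspark.ml.fpm import FPGrowth
fpGrowth = FPGrowth(itemsCol="ARRAY_COLS", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(result_df)
model.freqItemsets.show(truncate=False)
model.associationRules.show(truncate=False)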

How to convert a rdd of pandas DataFrame to Spark DataFrame

I create an RDD of pandas DataFrames as an intermediate result. I want to convert it to a Spark DataFrame and eventually save it to a parquet file.
I want to know what the efficient way is.
Thanks
def create_df(x):
    return pd.DataFrame(np.random.rand(5, 3)).\
        assign(col=x)

sc.parallelize(range(5)).map(create_df).\
    .TO_DATAFRAME()..write.format("parquet").save("parquet_file")
I have tried pd.concat to reduce the rdd to one big dataframe, but that doesn't seem right.
Speaking of efficiency: since Spark 2.3, Apache Arrow is integrated with Spark. It is supposed to efficiently transfer data between JVM and Python processes, thus improving the performance of the conversion from a pandas dataframe to a Spark dataframe. You can enable it by
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
If your Spark distribution doesn't have Arrow integrated, this should not throw an error; it will just be ignored.
Sample code to run in the pyspark shell could look like this:
import numpy as np
import pandas as pd
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
pdf = pd.DataFrame(np.random.rand(100, 3))
df = spark.createDataFrame(pdf)
df.write.format("parquet").save('data_parquet_file')
Your create_df method returns a pandas dataframe, and from that you can create a Spark dataframe directly; I'm not sure why you need sc.parallelize(range(5)).map(create_df).
So your full code can look like this:
import pandas as pd
import numpy as np
def create_df(x):
    return pd.DataFrame(np.random.rand(5, 3)).assign(col=x)
pdf = create_df(10)
df = spark.createDataFrame(pdf)
df.write.format("parquet").save('data_parquet_file')
Alternatively, if you want to keep the map over an RDD, flatten each pandas DataFrame into rows with flatMap and let Spark build the DataFrame:
import pandas as pd
import numpy as np

def create_df(x):
    df = pd.DataFrame(np.random.rand(5, 3)).assign(col=x)
    return df.values.tolist()

sc.parallelize(range(5)).flatMap(create_df).toDF() \
    .write.format("parquet").save("parquet_file")
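One caveat (my addition): toDF() without arguments infers the generic column names _1, _2, ...; if you want named columns, pass them explicitly, for example:
# "c0", "c1", "c2" and "col" are illustrative names for the three random columns plus the assigned column
sc.parallelize(range(5)).flatMap(create_df) \
    .toDF(["c0", "c1", "c2", "col"]) \
    .write.format("parquet").save("parquet_file_named")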

'RDD' object has no attribute '_jdf' pyspark RDD

I'm new to pyspark. I would like to perform some machine learning on a text file.
from pyspark.sql import Row
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark import SparkConf
sc = SparkContext
spark = SparkSession.builder.appName("ML").getOrCreate()
train_data = spark.read.text("20ng-train-all-terms.txt")
td = train_data.rdd  # transform df to rdd
tr_data= td.map(lambda line: line.split()).map(lambda words: Row(label=words[0],words=words[1:]))
from pyspark.ml.feature import CountVectorizer
vectorizer = CountVectorizer(inputCol ="words", outputCol="bag_of_words")
vectorizer_transformer = vectorizer.fit(td)
and for my last command, I obtain the error:
AttributeError: 'RDD' object has no attribute '_jdf'
Can anyone help me, please? Thank you.
You shouldn't be using an rdd with CountVectorizer. Instead, you should form the array of words in the dataframe itself, as
train_data = spark.read.text("20ng-train-all-terms.txt")
from pyspark.sql import functions as F
td= train_data.select(F.split("value", " ").alias("words")).select(F.col("words")[0].alias("label"), F.col("words"))
from pyspark.ml.feature import CountVectorizer
vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")
vectorizer_transformer = vectorizer.fit(td)
And then it should work, so that you can call the transform function as
vectorizer_transformer.transform(td).show(truncate=False)
Now, if you want to stick to the old style of converting to an rdd, then you have to modify certain lines of code. Following is the complete, modified (working) version of your code:
from pyspark.sql import Row
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark import SparkConf
sc = SparkContext
spark = SparkSession.builder.appName("ML").getOrCreate()
train_data = spark.read.text("20ng-train-all-terms.txt")
td = train_data.rdd  # transform df to rdd
tr_data= td.map(lambda line: line[0].split(" ")).map(lambda words: Row(label=words[0], words=words[1:])).toDF()
from pyspark.ml.feature import CountVectorizer
vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")
vectorizer_transformer = vectorizer.fit(tr_data)
But I would suggest you stick with the dataframe way.
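For completeness, a small sketch of my own: once the vectorizer is fitted on tr_data, transform adds the bag_of_words vector column, which can then feed a pyspark.ml classifier.
result = vectorizer_transformer.transform(tr_data)
result.select("label", "bag_of_words").show(5, truncate=False)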

Resources