Merge multiple columns in a Spark DataFrame [Java] - apache-spark

How can I combine multiple columns (say 3) from a DataFrame into a single column (in a new DataFrame), where each row becomes a Spark DenseVector? Similar to this thread, but in Java and with a few tweaks mentioned below.
I tried using a UDF like this:
private UDF3<Double, Double, Double, Row> toColumn = new UDF3<Double, Double, Double, Row>() {
private static final long serialVersionUID = 1L;
public Row call(Double first, Double second, Double third) throws Exception {
Row row = RowFactory.create(Vectors.dense(first, second, third));
return row;
}
};
And then register the UDF:
sqlContext.udf().register("toColumn", toColumn, dataType);
Where the dataType is:
StructType dataType = DataTypes.createStructType(new StructField[]{
new StructField("bla", new VectorUDT(), false, Metadata.empty()),
});
When I call this UDF on a DataFrame with 3 columns and print out the schema of the new DataFrame, I get this:
root
|-- features: struct (nullable = true)
| |-- bla: vector (nullable = false)
The problem here is that I need the vector to be at the top level, not nested inside a struct.
Something like this:
root
|-- features: vector (nullable = true)
I don't know how to achieve this, since the register function requires the return type of the UDF to be a DataType (which, in turn, doesn't provide a VectorType).

You actually nested the vector type into a struct manually by using this data type:
new StructField("bla", new VectorUDT(), false, Metadata.empty()),
If you remove the outer StructField, you will get what you want. Of course, in this case you need to modify the signature of your function definition a bit; that is, it needs to return a Vector.
Please see below a concrete example of what I mean, in the form of a simple JUnit test.
package sample.spark.test;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF3;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.junit.Test;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class ToVectorTest implements Serializable {
private static final long serialVersionUID = 2L;
private UDF3<Double, Double, Double, Vector> toColumn = new UDF3<Double, Double, Double, Vector>() {
private static final long serialVersionUID = 1L;
public Vector call(Double first, Double second, Double third) throws Exception {
return Vectors.dense(first, second, third);
}
};
@Test
public void testUDF() {
// context
final JavaSparkContext sc = new JavaSparkContext("local", "ToVectorTest");
final SQLContext sqlContext = new SQLContext(sc);
// test input
final DataFrame input = sqlContext.createDataFrame(
sc.parallelize(
Arrays.asList(
RowFactory.create(1.0, 2.0, 3.0),
RowFactory.create(4.0, 5.0, 6.0),
RowFactory.create(7.0, 8.0, 9.0),
RowFactory.create(10.0, 11.0, 12.0)
)),
DataTypes.createStructType(
Arrays.asList(
new StructField("feature1", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("feature2", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("feature3", DataTypes.DoubleType, false, Metadata.empty())
)
)
);
input.registerTempTable("input");
// expected output
final Set<Vector> expectedOutput = new HashSet<>(Arrays.asList(
Vectors.dense(1.0, 2.0, 3.0),
Vectors.dense(4.0, 5.0, 6.0),
Vectors.dense(7.0, 8.0, 9.0),
Vectors.dense(10.0, 11.0, 12.0)
));
// processing
sqlContext.udf().register("toColumn", toColumn, new VectorUDT());
final DataFrame outputDF = sqlContext.sql("SELECT toColumn(feature1, feature2, feature3) AS x FROM input");
final Set<Vector> output = new HashSet<>(outputDF.toJavaRDD().map(r -> r.<Vector>getAs("x")).collect());
// evaluation
assertEquals(expectedOutput.size(), output.size());
for (Vector x : output) {
assertTrue(expectedOutput.contains(x));
}
// show the schema and the content
System.out.println(outputDF.schema());
outputDF.show();
sc.stop();
}
}
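As a side note, if you only need to merge numeric columns into a single vector column, Spark's built-in VectorAssembler transformer does this without a custom UDF. A minimal sketch (assuming Spark 2.x, the spark.ml linalg types, and a hypothetical Dataset<Row> named inputDataset with the feature1/feature2/feature3 columns from the test above):
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// combine the three double columns into a single vector column named "features"
VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"feature1", "feature2", "feature3"})
    .setOutputCol("features");
Dataset<Row> withFeatures = assembler.transform(inputDataset);
withFeatures.printSchema(); // the schema now has a top-level "features" column of type vector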

Related

Spark java dataframe String cannot be converted to struct

I have the below spark schema defined
StructType state = DataTypes.createStructType(
new StructField[] {
DataTypes.createStructField("version", DataTypes.IntegerType, false),
DataTypes.createStructField("value", DataTypes.StringType, false)
});
ArrayType relationship = DataTypes.createArrayType(DataTypes.createStructType(
new StructField[] {
DataTypes.createStructField("cid", DataTypes.StringType, false),
DataTypes.createStructField("state", state, false),
}));
StructType cr = DataTypes.createStructType(
new StructField[] {
DataTypes.createStructField("cmg", relationship, false)
});
StructType schema = DataTypes.createStructType(
new StructField[] {
DataTypes.createStructField("cr", cr, false)
});
If I create the dataframe as
Row r1 = RowFactory.create("{cr:{cmg:[{cid:\"B06XW5BXJZ\",state:{version:19,value:\"approved\"}}]}}");
List<Row> rowList = ImmutableList.of(r1);
Dataset<Row> df = spark.sqlContext().createDataFrame(rowList, schema);
The code gives below error
The value ({cr:{cmg:[{cid:"B06XW5BXJZ",state:{version:19,value:"approved"}}]}}) of the type (java.lang.String) cannot be converted to struct<cmg:array<struct<cid:string,state:struct<version:int,value:string>>>>
What am I missing?
When you execute createDataFrame(rowList, schema) Spark tries to interpret the content of each element in rowList using the provided schema.
However, the values in rowList are strings, and not structured objects, so Spark is unable to apply the schema.
You have various options to load that object into a dataframe in structured form.
Load the data as a JSON string and use Spark to parse it
String jsonRow = "{cr:{cmg:[{cid:\"B06XW5BXJZ\",state:{version:19,value:\"approved\"}}]}}";
Dataset<Row> df = spark.createDataset(List.of(jsonRow), Encoders.STRING())
.select(functions.from_json(functions.col("value"), schema, Map.of("allowUnquotedFieldNames", "true")));
In this case it first creates a Dataset<String> in which each row contains a single String column (value), and then uses the from_json Spark SQL function to parse the JSON using your schema.
Also note the use of the allowUnquotedFieldNames=true option, which is required because the field names in the input string are not quoted.
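As a small follow-up sketch (assuming the same spark, schema, and jsonRow as above), you can alias the parsed struct and expand it so that the top-level column matches the original schema:
Dataset<Row> parsed = spark.createDataset(List.of(jsonRow), Encoders.STRING())
    .select(functions.from_json(functions.col("value"), schema, Map.of("allowUnquotedFieldNames", "true")).alias("parsed"))
    .select("parsed.*"); // expand the struct so "cr" becomes a top-level column
parsed.printSchema();
parsed.show(false);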
Manually create structured rows and load them into a DataFrame
Row structuredRow = RowFactory.create(RowFactory.create(List.of(RowFactory.create("B06XW5BXJZ", RowFactory.create(19, "approved")))));
Dataset<Row> df = spark.createDataFrame(List.of(structuredRow), schema);
This extends your initial attempt to use the RowFactory to manually create the rows. The rows must reflect the structure defined in the schema (or rather, the schema must respect the structure of the rows).
Use a custom Java bean class
Class definitions
public static class State implements Serializable {
private Integer version;
private String value;
// getters, setters, constructors
}
public static class Relationship implements Serializable {
private String cid;
private State state;
// getters, setters, constructors
}
public static class Cr implements Serializable {
private List<Relationship> cmg;
// getters, setters, constructors
}
public static class RowBean implements Serializable {
private Cr cr;
// getters, setters, constructors
}
Use the bean class to create a Dataset
RowBean row = new RowBean(new Cr(List.of(new Relationship("B06XW5BXJZ", new State(19, "approved")))));
Dataset<RowBean> ds = spark.createDataset(List.of(row), Encoders.bean(RowBean.class));
In this case, using a custom Java bean (or a Scala case class), the schema is extracted directly from the class structure by Encoders.bean().
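A small usage sketch, assuming the ds from above; note that the bean encoder derives the schema from the getters, so the field order may differ from the hand-written schema:
// inspect the schema inferred by Encoders.bean() and look at the data
ds.printSchema();
ds.show(false);
// or convert to an untyped Dataset<Row> if downstream code expects one
Dataset<Row> dfFromBean = ds.toDF();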

Returning java.util.Map from spark UDF results in scala.MatchError: {} (of class java.util.HashMap)

I'm new to Apache Spark and I'm learning how to use it in Java. I would like to define and use a user-defined function (UDF), but I get a scala.MatchError when returning a java.util.HashMap.
Here is my code for extracting hashtags from a tweets dataset and adding a new column with a map of each hashtag and its number of occurrences in the respective tweet:
// Open spark session
SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("TwitterAnalyticsExample").getOrCreate();
// Load training data
Dataset<Row> twitterData = sparkSession.read().format("json").load(inputFilePath);
UDF1 extractHashtags = new UDF1<String, Map<String, Integer>>() {
@Override
public Map<String, Integer> call(String tweet) throws Exception {
Map<String, Integer> result = new HashMap<>();
Pattern pattern = Pattern.compile("#\\w*");
Matcher matcher = pattern.matcher(tweet);
while (matcher.find()) {
result.merge(matcher.group(), 1, (v1, v2) -> v1 + v2);
}
return result;
}
};
sparkSession.sqlContext().udf().register("extractHashtags", extractHashtags, DataTypes.StringType);
twitterData.limit(50).select(callUDF("extractHashtags", col("text"))).show(20);
and following imports:
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Any hint as to what I am doing wrong? Is the return type java.util.Map a problem for a UDF? What could I use instead?
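One likely cause, offered here as a hedged suggestion: the UDF is registered with DataTypes.StringType even though it returns a java.util.Map, so Spark's converter cannot match the returned HashMap against the declared string type. Registering the UDF with a matching MapType should line the types up, for example:
// sketch, assuming the extractHashtags UDF from above: declare the actual return type
sparkSession.sqlContext().udf().register("extractHashtags", extractHashtags,
    DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType));
twitterData.limit(50).select(callUDF("extractHashtags", col("text"))).show(20);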

Exception in thread "main" java.lang.IllegalArgumentException: Field "features" does not exist

Exception in thread "main" java.lang.IllegalArgumentException: Field "features" does not exist.
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:264)
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:264)
at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
at scala.collection.AbstractMap.getOrElse(Map.scala:59)
at org.apache.spark.sql.types.StructType.apply(StructType.scala:263)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:40)
at org.apache.spark.ml.clustering.KMeansParams$class.validateAndTransformSchema(KMeans.scala:92)
at org.apache.spark.ml.clustering.KMeans.validateAndTransformSchema(KMeans.scala:253)
at org.apache.spark.ml.clustering.KMeans.transformSchema(KMeans.scala:330)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
at org.apache.spark.ml.clustering.KMeans.fit(KMeans.scala:304)
at sparkExample.spExample.ClusteringDSPOC.main(ClusteringDSPOC.java:45)
My code is
package sparkExample.spExample;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.ml.clustering.KMeans;
import org.apache.spark.ml.clustering.KMeansModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class ClusteringDSPOC {
private static final Pattern SPACE = Pattern.compile(" ");
private static final SparkContext sc = new SparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
private static final String POSTGRESQL_DRIVER = "org.postgresql.Driver";
private static final String POSTGRESQL_USERNAME = "xyz";
private static final String POSTGRESQL_PWD = "xyz";
private static final String POSTGRESQL_CONNECTION_URL = "jdbc:postgresql://192.168.111.130:5432/xyzdb?user=" + POSTGRESQL_USERNAME + "&password=" + POSTGRESQL_PWD;
private static final String POSTGRESQL_TABLE = "(select id, duration from abc where duration is not null ) as abc";
public static void main(String[] args) throws Exception {
//Datasource options
SparkSession spark = SparkSession.builder().appName("JavaKMeansExample").getOrCreate();
Class.forName(POSTGRESQL_DRIVER);
Properties options = new Properties();
Dataset<Row> sdrDS = spark.read().format("libsvm").jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options);
Dataset<Row> durationDS = sdrDS.select("duration");
KMeans kmeans = new KMeans().setK(2).setSeed(1L);
KMeansModel model = kmeans.fit(durationDS);
}
}
I am following this
https://spark.apache.org/docs/latest/ml-clustering.html.
I get this error when the fit method is called. Please help me fix this, or suggest an alternative way to do it. Thanks.
Here I am trying to divide duration into 2 or 3 clusters and then map each cluster to an id. I am able to do the same thing using the Spark MLlib library in this way:
package sparkExample.spExample;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
public class ClusteringPOC1 {
private static final Pattern SPACE = Pattern.compile(" ");
private static final JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
private static final String POSTGRESQL_DRIVER = "org.postgresql.Driver";
private static final String POSTGRESQL_USERNAME = "abc";
private static final String POSTGRESQL_PWD = "abc";
private static final String POSTGRESQL_CONNECTION_URL = "jdbc:postgresql://192.168.111.130:5432/abcdb?user=" + POSTGRESQL_USERNAME + "&password=" + POSTGRESQL_PWD;
private static final SQLContext sqlContext = new SQLContext(sc);
public static void main(String[] args) throws Exception {
//Datasource options
Map<String, String> options = new HashMap<String, String>();
options.put("driver", POSTGRESQL_DRIVER);
options.put("url", POSTGRESQL_CONNECTION_URL);
options.put("dbtable", "(select id, duration from sdr_log where duration is not null ) as sdr_log");
Dataset<Row> sdrDF = sqlContext.load("jdbc", options);
JavaRDD<Row> sdrData = sdrDF.toJavaRDD();
sdrData.cache();
JavaRDD<Vector> durationData = sdrData.map(row -> {
double value = new Double(row.get(2).toString());
return Vectors.dense(value);
});
durationData.cache();
KMeansModel clusters = KMeans.train(durationData.rdd(), numClusters, numIterations);
JavaRDD<Integer> clusterLabel = clusters.predict(durationData);
JavaRDD<Long> id = sdrData.map(row -> new Long(row.get(1).toString()));
JavaPairRDD<Long, Integer> clusterLableData = id.zip(clusterLabel);
clusterLableData.saveAsTextFile("data/mlib/kmeans_output11.txt");
}
}
But I want to do this with spark ml library.
K-means is an unsupervised clustering algorithm that tries to partition a set of points into K sets (clusters) such that the points in each cluster tend to be near each other.
Dataset<Row> durationDS = sdrDS.select("duration");
In your code, you select a single column, 'duration', and set the number of clusters to 2. But how can the model cluster the data when it has no proper feature column to work from?
The essence of unsupervised learning algorithms, K-means in this case, is that you do not need to supply labels or any logic about the dataset. You just pass (fit) the dataset to the model and it partitions the data into clusters.
K-means assigns each point to the nearest of the K cluster centroids, so it needs a feature vector for each row, whereas you are passing a single plain column.
It is better to use Spark's DataFrame API to resolve the error you are facing.
Spark automatically reads the schema from the JDBC table and maps its types back to Spark SQL types.
Import into a DataFrame object:
> DataFrame jdbcDF = sqlContext.read().jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options);
You can now drop the columns you don't want using the DataFrame.drop("columnName") function.
And/or fit your dataset this way:
> KMeansModel model = kmeans.fit(jdbcDF);
Also, it would be great if you could provide the dataset.
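As a hedged sketch of one way past the "Field 'features' does not exist" error (assuming Spark 2.x and the sdrDS dataset from the question): spark.ml's KMeans expects a vector column, named "features" by default (configurable via setFeaturesCol), which VectorAssembler can build from the numeric columns:
import org.apache.spark.ml.feature.VectorAssembler;

// assemble the numeric column(s) into the vector column that spark.ml expects
VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"duration"})
    .setOutputCol("features");
Dataset<Row> featuresDS = assembler.transform(sdrDS);

KMeans kmeans = new KMeans().setK(2).setSeed(1L);
KMeansModel model = kmeans.fit(featuresDS);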

How to create a Spark DataFrame from an Integer RDD

How can I create a DataFrame from a JavaRDD that contains Integers? I have tried something like the code below, but it is not working.
List<Integer> input = Arrays.asList(101, 103, 105);
JavaRDD<Integer> inputRDD = sc.parallelize(input);
DataFrame dataframe = sqlcontext.createDataFrame(inputRDD, Integer.class);
I got ClassCastException saying org.apache.spark.sql.types.IntegerType$ cannot be cast to org.apache.spark.sql.types.StructType
How can I achieve this?
Apparently (although not intuitively), this createDataFrame overload only works for "Bean" types, which means types that do not correspond to any built-in Spark SQL type.
You can see this in the source code: the class you pass is matched to a Spark SQL type in JavaTypeInference.inferDataType, and the result is cast to a StructType (see dataType.asInstanceOf[StructType] in SQLContext.getSchema), but the built-in "primitive" types (like IntegerType) are NOT StructTypes. This looks like a bug or undocumented behavior to me.
WORKAROUNDS:
Wrap your Integers with a "bean" class (that's ugly, I know):
public static class MyBean {
final int value;
MyBean(int value) {
this.value = value;
}
public int getValue() {
return value;
}
}
List<MyBean> input = Arrays.asList(new MyBean(101), new MyBean(103), new MyBean(105));
JavaRDD<MyBean> inputRDD = sc.parallelize(input);
DataFrame dataframe = sqlcontext.createDataFrame(inputRDD, MyBean.class);
dataframe.show(); // this works...
Convert to RDD<Row> yourself:
// convert to Rows:
JavaRDD<Row> rowRdd = inputRDD.map(new Function<Integer, Row>() {
@Override
public Row call(Integer v1) throws Exception {
return RowFactory.create(v1);
}
});
// create schema (this looks nicer in Scala...):
StructType schema = new StructType(new StructField[]{new StructField("number", IntegerType$.MODULE$, false, Metadata.empty())});
DataFrame dataframe = sqlcontext.createDataFrame(rowRdd, schema);
dataframe.show(); // this works...
Now, in Spark 2.2 you can do the following to create a Dataset:
Dataset<Integer> dataSet = sqlContext.createDataset(inputRDD.rdd(), Encoders.INT());
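A small usage sketch, assuming the dataSet from the line above:
// give the single column a name and inspect the result
Dataset<Row> df = dataSet.toDF("number");
df.printSchema(); // a single integer column named "number"
df.show();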

Spark SQL: How to call UDF from DataFrame operation using JAVA

I would like to know how to call a UDF from a domain-specific language (DSL) function in Spark SQL using Java.
I have UDF function (just for example):
UDF2 equals = new UDF2<String, String, Boolean>() {
@Override
public Boolean call(String first, String second) throws Exception {
return first.equals(second);
}
};
I've registered it to sqlContext
sqlContext.udf().register("equals", equals, DataTypes.BooleanType);
When I run following query, my UDF is called and I get a result.
sqlContext.sql("SELECT p0.value FROM values p0 WHERE equals(p0.value, 'someString')");
I would like to transform this query using the domain-specific language functions in Spark SQL, but I am not sure how to do it.
valuesDF.select("value").where(???);
I found that there exists callUDF() function where one of its parameters is Function2 fnctn but not UDF2.
How can I use UDF and functions from DSL?
I found a solution with which I am half-satisfied.
It is possible to call the UDF as a column condition, such as:
valuesDF.filter("equals(columnName, 'someString')").select("columnName");
But I still wonder if it is possible to call UDF directly.
Edit:
Btw, it is possible to call the UDF directly, e.g.:
df.where(callUdf("equals", scala.collection.JavaConversions.asScalaBuffer(
Arrays.asList(col("columnName"), col("otherColumnName"))
).seq())).select("columnName");
An import of org.apache.spark.sql.functions is required.
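As an alternative hedged sketch (assuming Spark 1.5+), org.apache.spark.sql.functions.callUDF(String, Column...) takes a varargs list of columns, which avoids the Scala Seq conversion:
import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;

// call the registered "equals" UDF directly in the DSL, no Scala conversions needed
DataFrame result = df.where(callUDF("equals", col("columnName"), col("otherColumnName")))
    .select("columnName");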
When querying a dataframe, you should just be able to execute the UDF using something like this:
sourceDf.filter(equals(col("columnName"), "someString")).select("columnName")
where col("columnName") is the column you want to compare.
Here is a working code example. It works with Spark 1.5.x and 1.6.x. The trick to calling UDFs from within a pipeline transformer is to use the sqlContext() of the DataFrame to register your UDF.
@Test
public void test() {
// https://issues.apache.org/jira/browse/SPARK-12484
logger.info("BEGIN");
DataFrame df = createData();
final String tableName = "myTable";
sqlContext.registerDataFrameAsTable(df, tableName);
logger.info("print schema");
df.printSchema();
logger.info("original data before we applied UDF");
df.show();
MyUDF udf = new MyUDF();
final String udfName = "myUDF";
sqlContext.udf().register(udfName, udf, DataTypes.StringType);
String fmt = "SELECT *, %s(%s) as transformedByUDF FROM %s";
String stmt = String.format(fmt, udfName, tableName+".labelStr", tableName);
logger.info("AEDWIP stmt:{}", stmt);
DataFrame udfDF = sqlContext.sql(stmt);
Row[] results = udfDF.head(3);
for (Row row : results) {
logger.info("row returned by applying UDF {}", row);
}
logger.info("AEDWIP udfDF schema");
udfDF.printSchema();
logger.info("AEDWIP udfDF data");
udfDF.show();
logger.info("END");
}
DataFrame createData() {
Features f1 = new Features(1, category1);
Features f2 = new Features(2, category2);
ArrayList<Features> data = new ArrayList<Features>(2);
data.add(f1);
data.add(f2);
//JavaRDD<Features> rdd = javaSparkContext.parallelize(Arrays.asList(f1, f2));
JavaRDD<Features> rdd = javaSparkContext.parallelize(data);
DataFrame df = sqlContext.createDataFrame(rdd, Features.class);
return df;
}
class MyUDF implements UDF1<String, String> {
private static final long serialVersionUID = 1L;
@Override
public String call(String s) throws Exception {
logger.info("AEDWIP s:{}", s);
String ret = s.equalsIgnoreCase(category1) ? category1 : category3;
return ret;
}
}
public class Features implements Serializable{
private static final long serialVersionUID = 1L;
int id;
String labelStr;
Features(int id, String l) {
this.id = id;
this.labelStr = l;
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getLabelStr() {
return labelStr;
}
public void setLabelStr(String labelStr) {
this.labelStr = labelStr;
}
}
This is the output:
+---+--------+
| id|labelStr|
+---+--------+
| 1| noise|
| 2| ack|
+---+--------+
root
|-- id: integer (nullable = false)
|-- labelStr: string (nullable = true)
|-- transformedByUDF: string (nullable = true)
+---+--------+----------------+
| id|labelStr|transformedByUDF|
+---+--------+----------------+
| 1| noise| noise|
| 2| ack| signal|
+---+--------+----------------+
