Exception in thread "main" java.lang.IllegalArgumentException: Field "features" does not exist.
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:264)
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:264)
at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
at scala.collection.AbstractMap.getOrElse(Map.scala:59)
at org.apache.spark.sql.types.StructType.apply(StructType.scala:263)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:40)
at org.apache.spark.ml.clustering.KMeansParams$class.validateAndTransformSchema(KMeans.scala:92)
at org.apache.spark.ml.clustering.KMeans.validateAndTransformSchema(KMeans.scala:253)
at org.apache.spark.ml.clustering.KMeans.transformSchema(KMeans.scala:330)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
at org.apache.spark.ml.clustering.KMeans.fit(KMeans.scala:304)
at sparkExample.spExample.ClusteringDSPOC.main(ClusteringDSPOC.java:45)
My code is
package sparkExample.spExample;

import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.ml.clustering.KMeans;
import org.apache.spark.ml.clustering.KMeansModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ClusteringDSPOC {
    private static final Pattern SPACE = Pattern.compile(" ");
    private static final SparkContext sc = new SparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
    private static final String POSTGRESQL_DRIVER = "org.postgresql.Driver";
    private static final String POSTGRESQL_USERNAME = "xyz";
    private static final String POSTGRESQL_PWD = "xyz";
    private static final String POSTGRESQL_CONNECTION_URL = "jdbc:postgresql://192.168.111.130:5432/xyzdb?user=" + POSTGRESQL_USERNAME + "&password=" + POSTGRESQL_PWD;
    private static final String POSTGRESQL_TABLE = "(select id, duration from abc where duration is not null ) as abc";

    public static void main(String[] args) throws Exception {
        // Datasource options
        SparkSession spark = SparkSession.builder().appName("JavaKMeansExample").getOrCreate();
        Class.forName(POSTGRESQL_DRIVER);
        Properties options = new Properties();

        Dataset<Row> sdrDS = spark.read().format("libsvm").jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options);
        Dataset<Row> durationDS = sdrDS.select("duration");

        KMeans kmeans = new KMeans().setK(2).setSeed(1L);
        KMeansModel model = kmeans.fit(durationDS);
    }
}
I am following this:
https://spark.apache.org/docs/latest/ml-clustering.html
I am getting this error when the fit method is called. Please help me fix this, or suggest some alternate way to do it. Thanks.
Here I am trying to divide duration into 2 to 3 clusters and then map each cluster to its id. I am able to do the same thing with the Spark MLlib library this way:
package sparkExample.spExample;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class ClusteringPOC1 {
    private static final Pattern SPACE = Pattern.compile(" ");
    private static final JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
    private static final String POSTGRESQL_DRIVER = "org.postgresql.Driver";
    private static final String POSTGRESQL_USERNAME = "abc";
    private static final String POSTGRESQL_PWD = "abc";
    private static final String POSTGRESQL_CONNECTION_URL = "jdbc:postgresql://192.168.111.130:5432/abcdb?user=" + POSTGRESQL_USERNAME + "&password=" + POSTGRESQL_PWD;
    private static final SQLContext sqlContext = new SQLContext(sc);

    public static void main(String[] args) throws Exception {
        // Datasource options
        Map<String, String> options = new HashMap<String, String>();
        options.put("driver", POSTGRESQL_DRIVER);
        options.put("url", POSTGRESQL_CONNECTION_URL);
        options.put("dbtable", "(select id, duration from sdr_log where duration is not null ) as sdr_log");

        Dataset<Row> sdrDF = sqlContext.load("jdbc", options);
        JavaRDD<Row> sdrData = sdrDF.toJavaRDD();
        sdrData.cache();

        // The query projects (id, duration), so duration is column index 1
        JavaRDD<Vector> durationData = sdrData.map(row -> {
            double value = Double.parseDouble(row.get(1).toString());
            return Vectors.dense(value);
        });
        durationData.cache();

        int numClusters = 2;     // assumed values; numClusters and numIterations were not defined in the original snippet
        int numIterations = 20;
        KMeansModel clusters = KMeans.train(durationData.rdd(), numClusters, numIterations);
        JavaRDD<Integer> clusterLabel = clusters.predict(durationData);

        // id is column index 0 in the (id, duration) projection
        JavaRDD<Long> id = sdrData.map(row -> Long.parseLong(row.get(0).toString()));
        JavaPairRDD<Long, Integer> clusterLableData = id.zip(clusterLabel);
        clusterLableData.saveAsTextFile("data/mlib/kmeans_output11.txt");
    }
}
But I want to do this with the Spark ML library.
K-means is an unsupervised clustering algorithm that tries to partition a set of points into K sets (clusters) such that the points in each cluster tend to be near each other.
Dataset<Row> durationDS = sdrDS.select("duration");
In your code, you are selecting a single column, 'duration', and setting the number of clusters to 2. But how can the data be grouped into clusters when there is no basis for doing so?
The essence of unsupervised learning algorithms, K-means in this case, is that you do not need to specify parameters describing the logic of the dataset. You just need to pass (fit) the dataset to the model and it groups the data into clusters.
In the K-means algorithm, the model assigns each point to the nearest of the K cluster centroids. It needs feature data in the form the model expects, whereas you are passing a single raw column.
It is better to use Spark's DataFrame API to resolve the error you are facing.
Spark automatically reads the schema from the database table (PostgreSQL in your case) and maps its types back to Spark SQL types.
Import the data into a DataFrame object:
> Dataset<Row> jdbcDF = spark.read().jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options);
You can now drop any columns you don't want using the DataFrame's drop("columnName") method.
And/or fit your dataset this way:
> KMeansModel model = kmeans.fit(jdbcDF);
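The root cause of the "Field "features" does not exist" error is that spark.ml's KMeans expects a Vector column named "features" (or whatever setFeaturesCol points to), and a plain JDBC DataFrame only has the raw id and duration columns. A minimal sketch of one way to build that column with VectorAssembler, assuming the jdbcDF read from JDBC as above, would be:

import org.apache.spark.ml.clustering.KMeans;
import org.apache.spark.ml.clustering.KMeansModel;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Sketch only: assumes jdbcDF contains the numeric "duration" column plus "id".
VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[] { "duration" })
        .setOutputCol("features");           // spark.ml KMeans looks for "features" by default
Dataset<Row> featureDF = assembler.transform(jdbcDF);

KMeans kmeans = new KMeans().setK(2).setSeed(1L);
KMeansModel model = kmeans.fit(featureDF);   // no more "Field "features" does not exist"

// model.transform keeps the input columns, so "id" can be mapped to its cluster label
Dataset<Row> predictions = model.transform(featureDF).select("id", "prediction");
predictions.show();

VectorAssembler is just one way to produce the vector column; the key point is that kmeans.fit needs a Vector-typed features column before it can run.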
Also, it would be great if you could provide the dataset.
Related
I'm new to Apache Spark and I'm learning how to use it in Java. I would like to define and use a user-defined function (UDF), and I get a scala.MatchError when returning a java.util.HashMap.
Here is my code for extracting hashtags from a tweets dataset and adding a new column with a map of each hashtag and its number of occurrences in the respective tweet:
// Open spark session
SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("TwitterAnalyticsExample").getOrCreate();

// Load training data
Dataset<Row> twitterData = sparkSession.read().format("json").load(inputFilePath);

UDF1 extractHashtags = new UDF1<String, Map<String, Integer>>() {
    @Override
    public Map<String, Integer> call(String tweet) throws Exception {
        Map<String, Integer> result = new HashMap<>();
        Pattern pattern = Pattern.compile("#\\w*");
        Matcher matcher = pattern.matcher(tweet);
        while (matcher.find()) {
            result.merge(matcher.group(), 1, (v1, v2) -> v1 + v2);
        }
        return result;
    }
};

sparkSession.sqlContext().udf().register("extractHashtags", extractHashtags, DataTypes.StringType);
twitterData.limit(50).select(callUDF("extractHashtags", col("text"))).show(20);
and the following imports:
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;
Any hint on what I am doing wrong? Is the return type java.util.Map a problem for a UDF? What could I use instead?
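For what it's worth, a common cause of scala.MatchError with UDFs is that the return type declared at registration does not match what the function actually returns; here the UDF is registered with DataTypes.StringType but returns a Map. A hedged sketch (reusing the sparkSession and extractHashtags from the code above) that declares a matching map type:

// Sketch: declare the UDF's return type as map<string,int> so it matches Map<String, Integer>
sparkSession.sqlContext().udf().register(
        "extractHashtags",
        extractHashtags,
        DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType));

twitterData.limit(50).select(callUDF("extractHashtags", col("text"))).show(20);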
I am a beginner with Spark. I wrote the code below and ran it on multiple nodes.
I have one master node and four worker nodes. I ran my code multiple times and, to my surprise, sometimes only some of the worker nodes did any work and sometimes all of them worked, depending on how the master assigned the data.
I didn't set up any detailed configuration, so this behavior looks weird to me.
I want all my worker nodes to process at the same time to get better and faster results. How can I achieve this?
I have attached my code and commands. It is very straightforward, so I skipped a detailed explanation. Thanks.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Created by dst on 2/1/17.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        String inputFile = args[0];
        String outputFile = args[1];

        SparkConf conf = new SparkConf().setAppName("Data Transformation")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile(inputFile);
        JavaRDD<String> newLine = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String s) throws Exception {
                List<String> ret = new ArrayList<String>();
                List<String> ls = Arrays.asList(s.split("\t"));
                String values = ls.get(ls.size() - 1);
                List<String> value = Arrays.asList(values.split("\\|"));
                for (int i = 0; i < value.size(); ++i) {
                    String ns = ls.get(0) + "\t" + ls.get(1) + "\t" + ls.get(2) + "\t" + ls.get(3) + "\t" + ls.get(4) + "\t" + ls.get(5);
                    ns = ns + "\t" + value.get(i);
                    ret.add(ns);
                }
                return ret.iterator();
            }
        });
        newLine.saveAsTextFile(outputFile);
    }
}
The spark-submit command:
spark-submit \
--class Test \
--master spark://spark.dso.xxxx \
--executor-memory 10G \
/home/jumbo/user/sclee/dt/jars/dt_01_notcache-1.0-SNAPSHOT.jar \
/user/sclee/data/ /user/sclee/output
Referring to the documentation, try setting spark.deploy.spreadOut = false and check whether the behavior remains the same after this setting.
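For reference, spark.deploy.spreadOut is a property of the standalone master (its default is true, which tells the master to spread applications out across nodes), so it is typically set before the master is started, e.g. via SPARK_MASTER_OPTS in conf/spark-env.sh on the master node; the exact file location is an assumption about your installation:
> SPARK_MASTER_OPTS="-Dspark.deploy.spreadOut=false"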
I was trying to run a sample Spark Streaming program, but I get this error:
16/06/02 15:25:42 ERROR streaming.StreamingContext: Error starting the context, marking it as stopped
java.lang.IllegalArgumentException: requirement failed: No output operations registered, so nothing to execute
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.streaming.DStreamGraph.validate(DStreamGraph.scala:161)
at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:542)
at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:601)
at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:600)
at org.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:624)
at com.streams.spark_consumer.SparkConsumer.main(SparkConsumer.java:56)
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: No output operations registered, so nothing to execute
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.streaming.DStreamGraph.validate(DStreamGraph.scala:161)
at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:542)
at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:601)
at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:600)
at org.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:624)
at com.streams.spark_consumer.SparkConsumer.main(SparkConsumer.java:56)
My code is given below. I know there are a few unused imports; I was doing something else and getting the same error, so I modified that code to run the sample program given on the Spark Streaming website:
package com.streams.spark_consumer;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import scala.Tuple2;
import kafka.serializer.StringDecoder;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.Durations;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkConsumer {
    private static final Pattern SPACE = Pattern.compile(" ");

    public static void main(String[] args) throws Exception {
        System.out.println("Yes, it is running"); // just to know if this part of the code is executed

        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
        System.out.println("Yes, I told you it is running - 1"); // just to know if this part of the code is executed

        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);
        JavaDStream<String> words = lines.flatMap(
            new FlatMapFunction<String, String>() {
                public Iterable<String> call(String x) {
                    return Arrays.asList(x.split(" "));
                }
            });

        JavaPairDStream<String, Integer> pairs = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<String, Integer>(s, 1);
                }
            });

        JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

        jssc.start();
        jssc.awaitTermination();
    }
}
Can anybody help me out with this?
I am using a local master. Even so, I have tried starting and stopping a master (and slaves); I didn't know why that might help, but just in case, I have already tried it.
According to the Spark documentation:
Since the output operations actually allow the transformed data to be consumed by external systems, they trigger the actual execution of all the DStream transformations (similar to actions for RDDs).
So use any of the output operations after your transformations, for example (see the sketch after this list):
print()
foreachRDD(func)
saveAsObjectFiles(prefix, [suffix])
saveAsTextFiles(prefix, [suffix])
saveAsHadoopFiles(prefix, [suffix])
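In the code above, that means calling an output operation such as print() on wordCounts before starting the context; a minimal sketch:

// Register an output operation so the DStream graph has something to execute
wordCounts.print();

jssc.start();
jssc.awaitTermination();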
Is @UDT (http://docs.datastax.com/en/developer/java-driver/2.1/java-driver/reference/mappingUdts.html) supported by Spring Data Cassandra 1.3.2.RELEASE? If not, how can I add a workaround for this?
Thanks
See the details here:
https://jira.spring.io/browse/DATACASS-172
I faced the same issue, and it sounds like it does not.
Debugging shows me that Spring Data Cassandra checks only for the @Table, @Persistent or @PrimaryKeyClass annotations and raises an exception otherwise:
> Invocation of init method failed; nested exception is org.springframework.data.cassandra.mapping.VerifierMappingExceptions: Cassandra entities must have the @Table, @Persistent or @PrimaryKeyClass Annotation
But I found the solution.
I figured out an approach that allows me to manage both entities that include UDTs and ones that don't. In my application I use the Spring Data Cassandra project together with the DataStax core driver directly. The repositories whose objects don't contain UDTs use the Spring Data Cassandra approach, and the objects that include UDTs use custom repositories.
The custom repositories use the DataStax mapper and work correctly with UDTs
(they are located in a separate package; see the notes below on why that's needed):
package com.fyb.cassandra.custom.repositories.impl;

import java.util.List;
import java.util.UUID;

import javax.annotation.PostConstruct;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.cassandra.config.CassandraSessionFactoryBean;

import com.datastax.driver.core.ResultSet;
import com.datastax.driver.mapping.Mapper;
import com.datastax.driver.mapping.MappingManager;
import com.datastax.driver.mapping.Result;
import com.google.common.collect.Lists;
import com.fyb.cassandra.custom.repositories.AccountDeviceRepository;
import com.fyb.cassandra.dto.AccountDevice;

public class AccountDeviceRepositoryImpl implements AccountDeviceRepository {
    @Autowired
    public CassandraSessionFactoryBean session;

    private Mapper<AccountDevice> mapper;

    @PostConstruct
    void initialize() {
        mapper = new MappingManager(session.getObject()).mapper(AccountDevice.class);
    }

    @Override
    public List<AccountDevice> findAll() {
        return fetchByQuery("SELECT * FROM account_devices");
    }

    @Override
    public void save(AccountDevice accountDevice) {
        mapper.save(accountDevice);
    }

    @Override
    public void deleteByConditions(UUID accountId, UUID systemId, UUID deviceId) {
        final String query = "DELETE FROM account_devices where account_id =" + accountId + " AND system_id=" + systemId
                + " AND device_id=" + deviceId;
        session.getObject().execute(query);
    }

    @Override
    public List<AccountDevice> findByAccountId(UUID accountId) {
        final String query = "SELECT * FROM account_devices where account_id=" + accountId;
        return fetchByQuery(query);
    }

    /*
     * Take any valid CQL query and try to map the result set to a list of the appropriate <T> type.
     */
    private List<AccountDevice> fetchByQuery(String query) {
        ResultSet results = session.getObject().execute(query);
        Result<AccountDevice> accountsDevices = mapper.map(results);
        List<AccountDevice> result = Lists.newArrayList();
        for (AccountDevice accountsDevice : accountsDevices) {
            result.add(accountsDevice);
        }
        return result;
    }
}
And the Spring Data repositories responsible for managing entities that don't include UDT objects look like this:
package com.fyb.cassandra.repositories;

import org.springframework.data.cassandra.repository.CassandraRepository;
import com.fyb.cassandra.dto.AccountUser;
import org.springframework.data.cassandra.repository.Query;
import org.springframework.stereotype.Repository;

import java.util.List;
import java.util.UUID;

@Repository
public interface AccountUserRepository extends CassandraRepository<AccountUser> {
    @Query("SELECT * FROM account_users WHERE account_id=?0")
    List<AccountUser> findByAccountId(UUID accountId);
}
I've tested this solution and it works 100%.
In addition, I've attached my POJO objects.
A POJO that uses only DataStax annotations:
package com.fyb.cassandra.dto;

import java.util.List;
import java.util.Map;
import java.util.UUID;

import com.datastax.driver.mapping.annotations.ClusteringColumn;
import com.datastax.driver.mapping.annotations.Column;
import com.datastax.driver.mapping.annotations.Frozen;
import com.datastax.driver.mapping.annotations.FrozenValue;
import com.datastax.driver.mapping.annotations.PartitionKey;
import com.datastax.driver.mapping.annotations.Table;

@Table(name = "account_systems")
public class AccountSystem {
    @PartitionKey
    @Column(name = "account_id")
    private java.util.UUID accountId;

    @ClusteringColumn
    @Column(name = "system_id")
    private java.util.UUID systemId;

    @Frozen
    private Location location;

    @FrozenValue
    @Column(name = "user_token")
    private List<UserToken> userToken;

    @Column(name = "product_type_id")
    private int productTypeId;

    @Column(name = "serial_number")
    private String serialNumber;
}
A POJO without UDTs, using only the Spring Data Cassandra framework:
package com.fyb.cassandra.dto;

import java.util.Date;
import java.util.UUID;

import org.springframework.cassandra.core.PrimaryKeyType;
import org.springframework.data.cassandra.mapping.Column;
import org.springframework.data.cassandra.mapping.PrimaryKeyColumn;
import org.springframework.data.cassandra.mapping.Table;

@Table(value = "accounts")
public class Account {
    @PrimaryKeyColumn(name = "account_id", ordinal = 0, type = PrimaryKeyType.PARTITIONED)
    private java.util.UUID accountId;

    @Column(value = "account_name")
    private String accountName;

    @Column(value = "currency")
    private String currency;
}
Note that the two kinds of entities above use different annotations:
@PrimaryKeyColumn(name = "account_id", ordinal = 0, type = PrimaryKeyType.PARTITIONED) and @PartitionKey
@ClusteringColumn and @PrimaryKeyColumn(name = "area_parent_id", ordinal = 2, type = PrimaryKeyType.CLUSTERED)
At first glance it's awkward, but it allows you to work with objects that include UDTs and ones that don't.
One important note: the two kinds of repositories (those that use UDTs and those that don't) should reside in different packages, because the Spring config looks for the base packages containing the repositories:
@Configuration
@EnableCassandraRepositories(basePackages = { "com.fyb.cassandra.repositories" })
public class CassandraConfig {
    ..........
}
User-defined types are now supported by Spring Data Cassandra. The latest release, 1.5.0.RELEASE, uses the Cassandra DataStax driver 3.1.3, and hence it works now. Follow the steps below to make it work.
How to use the UserDefinedType (UDT) feature with Spring Data Cassandra:
We need to use the latest jar of Spring Data Cassandra (1.5.0.RELEASE):
group: 'org.springframework.data', name: 'spring-data-cassandra', version: '1.5.0.RELEASE'
Make sure it uses the following versions of the jars:
datastax.cassandra.driver.version=3.1.3
spring.data.cassandra.version=1.5.0.RELEASE
spring.data.commons.version=1.13.0.RELEASE
spring.cql.version=1.5.0.RELEASE
Create the user-defined type in Cassandra. The type name should be the same as defined in the POJO class.
Address data type:
CREATE TYPE address_type (
id text,
address_type text,
first_name text,
phone text
);
Create a column family with one of the columns as a UDT in Cassandra:
Employee table:
CREATE TABLE employee(
   employee_id uuid,
   employee_name text,
   address frozen<address_type>,
   primary key (employee_id, employee_name)
);
In the domain class, define the field with the @CassandraType annotation, and the DataType should be UDT:
@Table("employee")
public class Employee {
    // other fields ...
    @CassandraType(type = DataType.Name.UDT, userTypeName = "address_type")
    private Address address;
}
Create a domain class for the user-defined type. We need to make sure that the column names in the user-defined type schema are the same as the field names in the domain class.
@UserDefinedType("address_type")
public class Address {
    @CassandraType(type = DataType.Name.TEXT)
    private String id;

    @CassandraType(type = DataType.Name.TEXT)
    private String address_type;
}
In the Cassandra config, change this:
@Bean
public CassandraMappingContext mappingContext() throws Exception {
    BasicCassandraMappingContext mappingContext = new BasicCassandraMappingContext();
    mappingContext.setUserTypeResolver(new SimpleUserTypeResolver(cluster().getObject(), cassandraKeyspace));
    return mappingContext;
}
The user-defined type should have the same name everywhere, e.g.:
@UserDefinedType("address_type")
@CassandraType(type = DataType.Name.UDT, userTypeName = "address_type")
CREATE TYPE address_type
Can we convert a String token into a CoreLabel instance?
So far, I am using:
CoreLabelTokenFactory c = new CoreLabelTokenFactory();
CoreLabel tokens = c.makeToken("going",0,"going".length());
The string gets converted; however, with this approach the CoreLabel does not work for finding lemmas and POS tags.
Here is some sample code to demonstrate going from a raw String to an Annotation object:
import java.io.*;
import java.util.*;

import edu.stanford.nlp.io.*;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.ling.CoreAnnotations.*;
import edu.stanford.nlp.util.*;

public class TokenizeExample {
    public static void main(String[] args) throws IOException {
        String text = "Here is a sentence. Here is another sentence.";
        Annotation document = new Annotation(text);

        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        pipeline.annotate(document);

        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                System.out.println("---");
                System.out.println(cl);
                System.out.println(cl.get(CoreAnnotations.PartOfSpeechAnnotation.class));
                System.out.println(cl.get(CoreAnnotations.LemmaAnnotation.class));
            }
        }
    }
}
Make sure you get Stanford CoreNLP 3.5.2 from here: http://nlp.stanford.edu/software/corenlp.shtml