I am getting "Exception in thread "main" java.lang.ClassCastException: org.apache.spark.ml.attribute.UnresolvedAttribute$ cannot be cast to org.apache.spark.ml.attribute.NominalAttribute".
Source code
package com.spark.lograthmicregression;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.GBTClassificationModel;
import org.apache.spark.ml.classification.GBTClassifier;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.IndexToString;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.StringIndexerModel;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.catalyst.expressions.AttributeReference;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import com.google.common.collect.ImmutableMap;
import scala.collection.mutable.Seq;
public class ClickThroughRateAnalytics {
private static SimpleDateFormat sdf = new SimpleDateFormat("yyMMddHH");
public static void main(String[] args) {
final SparkConf sparkConf = new SparkConf().setAppName("Click Analysis").setMaster("local");
try (JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf)) {
SQLContext sqlContext = new SQLContext(javaSparkContext);
DataFrame dataFrame = sqlContext.read().format("com.databricks.spark.csv").option("inferSchema", "true").option("header", "true")
.load("/splits/sub-suaa");
// This will keep data in memory
dataFrame.cache();
// This will describe the column
// dataFrame.describe("hour").show();
System.out.println("Rows before removing missing data : " + dataFrame.count());
// This will describe column details
// dataFrame.describe("click", "hour", "site_domain").show();
// This will calculate variance between columns +ve one increases
// second increases and -ve means one increases other decreases
// double cov = dataFrame.stat().cov("click", "hour");
// System.out.println("cov : " + cov);
// It provides quantitative measurements of the statistical
// dependence between two random variables
// double corr = dataFrame.stat().corr("click", "hour");
// System.out.println("corr : " + corr);
// Cross Tabulation provides a table of the frequency distribution
// for a set of variables
// dataFrame.stat().crosstab("site_id", "site_domain").show();
// For frequent items
// System.out.println("Frequest Items : " +
// dataFrame.stat().freqItems(new String[] { "site_id",
// "site_domain" }, 0.3).collectAsList());
// TODO we can also set maximum occurring item to categorical
// values.
// This will replace null values with average for numeric columns
dataFrame = modifiyDatFrame(dataFrame);
// Removing rows which have some missing values
dataFrame = dataFrame.na().replace(dataFrame.columns(), ImmutableMap.of("", "NA"));
dataFrame.na().fill(0.0);
dataFrame = dataFrame.na().drop();
System.out.println("Rows after removing missing data : " + dataFrame.count());
// TODO Binning and bucketing
// normalizer will take the column created by the VectorAssembler,
// normalize it and produce a new column
// Normalizer normalizer = new
// Normalizer().setInputCol("features_index").setOutputCol("features");
dataFrame = dataFrame.drop("app_category_index").drop("app_domain_index").drop("hour_index").drop("C20_index")
.drop("device_connection_type_index").drop("C1_index").drop("id").drop("device_ip_index").drop("banner_pos_index");
DataFrame[] splits = dataFrame.randomSplit(new double[] { 0.7, 0.3 });
DataFrame trainingData = splits[0];
DataFrame testData = splits[1];
StringIndexerModel labelIndexer = new StringIndexer().setInputCol("click").setOutputCol("indexedclick").fit(dataFrame);
// Here we will be sending all columns which will participate in
// prediction
VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(findPredictionColumns("click", dataFrame))
.setOutputCol("features_index");
GBTClassifier gbt = new GBTClassifier().setLabelCol("indexedclick").setFeaturesCol("features_index").setMaxIter(10).setMaxBins(69000);
IndexToString labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel");
Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { labelIndexer, vectorAssembler, gbt, labelConverter });
trainingData.show(1);
PipelineModel model = pipeline.fit(trainingData);
DataFrame predictions = model.transform(testData);
predictions.select("predictedLabel", "label").show(5);
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel")
.setPredictionCol("prediction").setMetricName("precision");
double accuracy = evaluator.evaluate(predictions);
System.out.println("Test Error = " + (1.0 - accuracy));
GBTClassificationModel gbtModel = (GBTClassificationModel) (model.stages()[2]);
System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString());
}
}
private static String[] findPredictionColumns(String outputCol, DataFrame dataFrame) {
String columns[] = dataFrame.columns();
String inputColumns[] = new String[columns.length - 1];
int count = 0;
for (String column : dataFrame.columns()) {
if (!column.equalsIgnoreCase(outputCol)) {
inputColumns[count++] = column;
}
}
return inputColumns;
}
/**
* This will replace empty values with mean.
*
* #param columnName
* #param dataFrame
* #return
*/
private static DataFrame modifiyDatFrame(DataFrame dataFrame) {
Set<String> numericColumns = new HashSet<String>();
if (dataFrame.numericColumns() != null && dataFrame.numericColumns().length() > 0) {
scala.collection.Iterator<Expression> iterator = ((Seq<Expression>) dataFrame.numericColumns()).toIterator();
while (iterator.hasNext()) {
Expression expression = iterator.next();
Double avgAge = dataFrame.na().drop().groupBy(((AttributeReference) expression).name()).avg(((AttributeReference) expression).name())
.first().getDouble(1);
dataFrame = dataFrame.na().fill(avgAge, new String[] { ((AttributeReference) expression).name() });
numericColumns.add(((AttributeReference) expression).name());
DataType dataType = ((AttributeReference) expression).dataType();
if (!"double".equalsIgnoreCase(dataType.simpleString())) {
dataFrame = dataFrame.withColumn("temp", dataFrame.col(((AttributeReference) expression).name()).cast(DataTypes.DoubleType))
.drop(((AttributeReference) expression).name()).withColumnRenamed("temp", ((AttributeReference) expression).name());
}
}
}
// Fit method of StringIndexer converts the column to StringType(if
// it is not of StringType) and then counts the occurrence of each
// word. It then sorts these words in descending order of their
// frequency and assigns an index to each word. StringIndexer.fit()
// method returns a StringIndexerModel which is a Transformer
StringIndexer stringIndexer = new StringIndexer();
String allCoumns[] = dataFrame.columns();
for (String column : allCoumns) {
if (!numericColumns.contains(column)) {
dataFrame = stringIndexer.setInputCol(column).setOutputCol(column + "_index").fit(dataFrame).transform(dataFrame);
dataFrame = dataFrame.drop(column);
}
}
dataFrame.printSchema();
return dataFrame;
}
#SuppressWarnings("unused")
private static void copyFile(DataFrame dataFrame) {
dataFrame
.select("id", "click", "hour", "C1", "banner_pos", "site_id", "site_domain", "site_category", "app_id", "app_domain", "app_category",
"device_id", "device_ip", "device_model", "device_type", "device_conn_type", "C14", "C15", "C16", "C17", "C18", "C19", "C20",
"C21")
.write().format("com.databricks.spark.csv").option("header", "true").option("codec", "org.apache.hadoop.io.compress.GzipCodec")
.save("/splits/sub-splitaa-optmized");
}
#SuppressWarnings("unused")
private static Integer parse(String sDate, int field) {
try {
if (sDate != null && !sDate.toString().equalsIgnoreCase("hour")) {
Date date = sdf.parse(sDate.toString());
Calendar cal = Calendar.getInstance();
cal.setTime(date);
return cal.get(field);
}
} catch (ParseException e) {
e.printStackTrace();
}
return 0;
}
}
I am using spark java. Sample file will be :
id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
1000009418151094273,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,ddd2926e,44956a24,1,2,15706,320,50,1722,0,35,-1,79
10000169349117863715,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,96809ac8,711ee120,1,0,15704,320,50,1722,0,35,100084,79
10000371904215119486,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,b3cf8def,8a4875bd,1,0,15704,320,50,1722,0,35,100084,79
10000640724480838376,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,e8275b8f,6332421a,1,0,15706,320,50,1722,0,35,100084,79
10000679056417042096,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,a99f214a,9644d0bf,779d90c2,1,0,18993,320,50,2161,0,35,-1,157
10000720757801103869,0,14102100,1005,0,d6137915,bb1ef334,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,05241af0,8a4875bd,1,0,16920,320,50,1899,0,431,100077,117
10000724729988544911,0,14102100,1005,0,8fda644b,25d4cfcd,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,b264c159,be6db1d7,1,0,20362,320,50,2333,0,39,-1,157
I am late in replying but I was also facing the same error while using gbt for a dataset in csv file.
I added .setLabels(labelIndexer.labels()) in labelConverter and this solved the problem.
IndexToString labelConverter = new IndexToString()
.setInputCol("prediction")
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels())
I'm using the Stanford Named Entity Recognizer http://nlp.stanford.edu/software/CRF-NER.shtml and it's working fine. This is
List<List<CoreLabel>> out = classifier.classify(text);
for (List<CoreLabel> sentence : out) {
for (CoreLabel word : sentence) {
if (!StringUtils.equals(word.get(AnswerAnnotation.class), "O")) {
namedEntities.add(word.word().trim());
}
}
}
However the problem I'm finding is identifying names and surnames. If the recognizer encounters "Joe Smith", it is returning "Joe" and "Smith" separately. I'd really like it to return "Joe Smith" as one term.
Could this be achieved through the recognizer maybe through a configuration? I didn't find anything in the javadoc till now.
Thanks!
This is because your inner for loop is iterating over individual tokens (words) and adding them separately. You need to change things to add whole names at once.
One way is to replace the inner for loop with a regular for loop with a while loop inside it which takes adjacent non-O things of the same class and adds them as a single entity.*
Another way would be to use the CRFClassifier method call:
List<Triple<String,Integer,Integer>> classifyToCharacterOffsets(String sentences)
which will give you whole entities, which you can extract the String form of by using substring on the original input.
*The models that we distribute use a simple raw IO label scheme, where things are labeled PERSON or LOCATION, and the appropriate thing to do is simply to coalesce adjacent tokens with the same label. Many NER systems use more complex labels such as IOB labels, where codes like B-PERS indicates where a person entity starts. The CRFClassifier class and feature factories support such labels, but they're not used in the models we currently distribute (as of 2012).
The counterpart of the classifyToCharacterOffsets method is that (AFAIK) you can't access the label of the entities.
As proposed by Christopher, here is an example of a loop which assembles "adjacent non-O things". This example also counts the number of occurrences.
public HashMap<String, HashMap<String, Integer>> extractEntities(String text){
HashMap<String, HashMap<String, Integer>> entities =
new HashMap<String, HashMap<String, Integer>>();
for (List<CoreLabel> lcl : classifier.classify(text)) {
Iterator<CoreLabel> iterator = lcl.iterator();
if (!iterator.hasNext())
continue;
CoreLabel cl = iterator.next();
while (iterator.hasNext()) {
String answer =
cl.getString(CoreAnnotations.AnswerAnnotation.class);
if (answer.equals("O")) {
cl = iterator.next();
continue;
}
if (!entities.containsKey(answer))
entities.put(answer, new HashMap<String, Integer>());
String value = cl.getString(CoreAnnotations.ValueAnnotation.class);
while (iterator.hasNext()) {
cl = iterator.next();
if (answer.equals(
cl.getString(CoreAnnotations.AnswerAnnotation.class)))
value = value + " " +
cl.getString(CoreAnnotations.ValueAnnotation.class);
else {
if (!entities.get(answer).containsKey(value))
entities.get(answer).put(value, 0);
entities.get(answer).put(value,
entities.get(answer).get(value) + 1);
break;
}
}
if (!iterator.hasNext())
break;
}
}
return entities;
}
I had the same problem, so I looked it up, too. The method proposed by Christopher Manning is efficient, but the delicate point is to know how to decide which kind of separator is appropriate. One could say only a space should be allowed, e.g. "John Zorn" >> one entity. However, I may find the form "J.Zorn", so I should also allow certain punctuation marks. But what about "Jack, James and Joe" ? I might get 2 entities instead of 3 ("Jack James" and "Joe").
By digging a bit in the Stanford NER classes, I actually found a proper implementation of this idea. They use it to export entities under the form of single String objects. For instance, in the method PlainTextDocumentReaderAndWriter.printAnswersTokenizedInlineXML, we have:
private void printAnswersInlineXML(List<IN> doc, PrintWriter out) {
final String background = flags.backgroundSymbol;
String prevTag = background;
for (Iterator<IN> wordIter = doc.iterator(); wordIter.hasNext();) {
IN wi = wordIter.next();
String tag = StringUtils.getNotNullString(wi.get(AnswerAnnotation.class));
String before = StringUtils.getNotNullString(wi.get(BeforeAnnotation.class));
String current = StringUtils.getNotNullString(wi.get(CoreAnnotations.OriginalTextAnnotation.class));
if (!tag.equals(prevTag)) {
if (!prevTag.equals(background) && !tag.equals(background)) {
out.print("</");
out.print(prevTag);
out.print('>');
out.print(before);
out.print('<');
out.print(tag);
out.print('>');
} else if (!prevTag.equals(background)) {
out.print("</");
out.print(prevTag);
out.print('>');
out.print(before);
} else if (!tag.equals(background)) {
out.print(before);
out.print('<');
out.print(tag);
out.print('>');
}
} else {
out.print(before);
}
out.print(current);
String afterWS = StringUtils.getNotNullString(wi.get(AfterAnnotation.class));
if (!tag.equals(background) && !wordIter.hasNext()) {
out.print("</");
out.print(tag);
out.print('>');
prevTag = background;
} else {
prevTag = tag;
}
out.print(afterWS);
}
}
They iterate over each word, checking if it has the same class (answer) than the previous, as explained before. For this, they take advantage of the fact expressions considered as not being entities are flagged using the so-called backgroundSymbol (class "O"). They also use the property BeforeAnnotation, which represents the string separating the current word from the previous one. This last point allows solving the problem I initially raised, regarding the choice of an appropriate separator.
Code for the above:
<List> result = classifier.classifyToCharacterOffsets(text);
for (Triple<String, Integer, Integer> triple : result)
{
System.out.println(triple.first + " : " + text.substring(triple.second, triple.third));
}
List<List<CoreLabel>> out = classifier.classify(text);
for (List<CoreLabel> sentence : out) {
String s = "";
String prevLabel = null;
for (CoreLabel word : sentence) {
if(prevLabel == null || prevLabel.equals(word.get(CoreAnnotations.AnswerAnnotation.class)) ) {
s = s + " " + word;
prevLabel = word.get(CoreAnnotations.AnswerAnnotation.class);
}
else {
if(!prevLabel.equals("O"))
System.out.println(s.trim() + '/' + prevLabel + ' ');
s = " " + word;
prevLabel = word.get(CoreAnnotations.AnswerAnnotation.class);
}
}
if(!prevLabel.equals("O"))
System.out.println(s + '/' + prevLabel + ' ');
}
I just wrote a small logic and it's working fine. what I did is group words with same label if they are adjacent.
Make use of the classifiers already provided to you. I believe this is what you are looking for:
private static String combineNERSequence(String text) {
String serializedClassifier = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
AbstractSequenceClassifier<CoreLabel> classifier = null;
try {
classifier = CRFClassifier
.getClassifier(serializedClassifier);
} catch (ClassCastException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(classifier.classifyWithInlineXML(text));
// FOR TSV FORMAT //
//System.out.print(classifier.classifyToString(text, "tsv", false));
return classifier.classifyWithInlineXML(text);
}
Here is my full code, I use Stanford core NLP and write algorithm to concatenate Multi Term names.
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
* Created by Chanuka on 8/28/14 AD.
*/
public class FindNameEntityTypeExecutor {
private static Logger logger = Logger.getLogger(FindNameEntityTypeExecutor.class);
private StanfordCoreNLP pipeline;
public FindNameEntityTypeExecutor() {
logger.info("Initializing Annotator pipeline ...");
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
pipeline = new StanfordCoreNLP(props);
logger.info("Annotator pipeline initialized");
}
List<String> findNameEntityType(String text, String entity) {
logger.info("Finding entity type matches in the " + text + " for entity type, " + entity);
// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
// run all Annotators on this text
pipeline.annotate(document);
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
List<String> matches = new ArrayList<String>();
for (CoreMap sentence : sentences) {
int previousCount = 0;
int count = 0;
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
String word = token.get(CoreAnnotations.TextAnnotation.class);
int previousWordIndex;
if (entity.equals(token.get(CoreAnnotations.NamedEntityTagAnnotation.class))) {
count++;
if (previousCount != 0 && (previousCount + 1) == count) {
previousWordIndex = matches.size() - 1;
String previousWord = matches.get(previousWordIndex);
matches.remove(previousWordIndex);
previousWord = previousWord.concat(" " + word);
matches.add(previousWordIndex, previousWord);
} else {
matches.add(word);
}
previousCount = count;
}
else
{
count=0;
previousCount=0;
}
}
}
return matches;
}
}
Another approach to deal with multi words entities.
This code combines multiple tokens together if they have the same annotation and go in a row.
Restriction:
If the same token has two different annotations, the last one will be saved.
private Document getEntities(String fullText) {
Document entitiesList = new Document();
NERClassifierCombiner nerCombClassifier = loadNERClassifiers();
if (nerCombClassifier != null) {
List<List<CoreLabel>> results = nerCombClassifier.classify(fullText);
for (List<CoreLabel> coreLabels : results) {
String prevLabel = null;
String prevToken = null;
for (CoreLabel coreLabel : coreLabels) {
String word = coreLabel.word();
String annotation = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
if (!"O".equals(annotation)) {
if (prevLabel == null) {
prevLabel = annotation;
prevToken = word;
} else {
if (prevLabel.equals(annotation)) {
prevToken += " " + word;
} else {
prevLabel = annotation;
prevToken = word;
}
}
} else {
if (prevLabel != null) {
entitiesList.put(prevToken, prevLabel);
prevLabel = null;
}
}
}
}
}
return entitiesList;
}
Imports:
Document: org.bson.Document;
NERClassifierCombiner: edu.stanford.nlp.ie.NERClassifierCombiner;