I have several explode functions in a single computation, and it fails with an out-of-memory (OOM) error. Heap dump:
I found that all the explode functions are code-generated into one class, so the intermediate results are buffered in memory.
spark web ui
How can I avoid this problem? Should the codegen be split into separate stages?
/**
 * Builds the iterator for whole-stage codegen stage 1.
 *
 * @param references objects the generated code looks up by index (literals, metrics)
 * @return a new {@code GeneratedIteratorForCodegenStage1} holding {@code references}
 */
public Object generate(Object[] references) {
    final GeneratedIteratorForCodegenStage1 stageIterator =
            new GeneratedIteratorForCodegenStage1(references);
    return stageIterator;
}
/*wsc_codegenStageId*/
// Generated iterator for codegen stage 1. The scan and three nested "explode(split(...))"
// expansions are fused into this single class: processNext() -> generate_doConsume_0 ->
// generate_doConsume_1 -> generate_doConsume_2 -> append().
// NOTE(review): append() is inherited from BufferedRowIterator and presumably buffers the row
// until the caller drains it -- this matches the heap-dump observation in the question; confirm.
final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator {
// Literals and SQL metrics, addressed by index from the generated code.
private Object[] references;
private scala.collection.Iterator[] inputs;
// The single input of this stage (inputs[0]).
private scala.collection.Iterator scan_input_0;
// Null flags written by the coalesce logic in generate_doConsume_0 / generate_doConsume_1.
private boolean generate_generate_isNull_3_0;
private boolean generate_generate_isNull_14_0;
// Row writers created in init(); only index 9 (the 4-column writer) is used by the code visible here.
private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] scan_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[10];
public GeneratedIteratorForCodegenStage1(Object[] references) {
this.references = references;
}
// Records the partition index and input iterator, and allocates the ten row writers.
public void init(int index, scala.collection.Iterator[] inputs) {
partitionIndex = index;
this.inputs = inputs;
scan_input_0 = inputs[0];
scan_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
scan_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
scan_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 64);
scan_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 64);
scan_mutableStateArray_0[4] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 64);
scan_mutableStateArray_0[5] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(3, 96);
scan_mutableStateArray_0[6] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(3, 96);
scan_mutableStateArray_0[7] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(3, 96);
scan_mutableStateArray_0[8] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(4, 128);
scan_mutableStateArray_0[9] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(4, 128);
}
// Innermost expansion: splits generate_expr_1_1 on the literal in references[7] and, for every
// resulting element, writes a 4-column row and appends a copy of it to the output.
private void generate_doConsume_2(UTF8String generate_expr_0_2, boolean generate_exprIsNull_0_2, UTF8String generate_expr_1_1, UTF8String generate_expr_2_0) throws java.io.IOException {
ArrayData generate_value_26 = null;
generate_value_26 = new org.apache.spark.sql.catalyst.util.GenericArrayData(generate_expr_1_1.split(((UTF8String) references[7] /* literal */), -1));
// "false ? 0 : ..." is a constant-folded null check: the array is never null here.
int generate_numElements_2 = false ? 0 : generate_value_26.numElements();
for (int generate_index_2 = 0; generate_index_2 < generate_numElements_2; generate_index_2++) {
((org.apache.spark.sql.execution.metric.SQLMetric) references[8] /* numOutputRows */).add(1);
// common sub-expressions
UTF8String generate_col_2 = generate_value_26.getUTF8String(generate_index_2);
scan_mutableStateArray_0[9].reset();
scan_mutableStateArray_0[9].zeroOutNullBytes();
if (generate_exprIsNull_0_2) {
scan_mutableStateArray_0[9].setNullAt(0);
} else {
scan_mutableStateArray_0[9].write(0, generate_expr_0_2);
}
scan_mutableStateArray_0[9].write(1, generate_expr_1_1);
scan_mutableStateArray_0[9].write(2, generate_expr_2_0);
scan_mutableStateArray_0[9].write(3, generate_col_2);
// The writer is reused for every row, so the row is copied before being appended.
// One copy is appended per element of every nested split, with no stop check in between.
append((scan_mutableStateArray_0[9].getRow()).copy());
}
}
// Middle expansion: coalesces generate_expr_0_1 with the literal in references[4], splits the
// result on the literal in references[5] and feeds each element to generate_doConsume_2.
private void generate_doConsume_1(UTF8String generate_expr_0_1, boolean generate_exprIsNull_0_1, UTF8String generate_expr_1_0) throws java.io.IOException {
generate_generate_isNull_14_0 = true;
UTF8String generate_value_14 = null;
// do { ... continue; } while (false): "continue" jumps to the (false) loop condition,
// so it leaves the block after the first non-null candidate is chosen.
do {
if (!generate_exprIsNull_0_1) {
generate_generate_isNull_14_0 = false;
generate_value_14 = generate_expr_0_1;
continue;
}
if (!false) {
generate_generate_isNull_14_0 = false;
generate_value_14 = ((UTF8String) references[4] /* literal */);
continue;
}
} while (false);
ArrayData generate_value_13 = null;
generate_value_13 = new org.apache.spark.sql.catalyst.util.GenericArrayData(generate_value_14.split(((UTF8String) references[5] /* literal */), -1));
int generate_numElements_1 = false ? 0 : generate_value_13.numElements();
for (int generate_index_1 = 0; generate_index_1 < generate_numElements_1; generate_index_1++) {
((org.apache.spark.sql.execution.metric.SQLMetric) references[6] /* numOutputRows */).add(1);
// common sub-expressions
UTF8String generate_col_1 = generate_value_13.getUTF8String(generate_index_1);
generate_doConsume_2(generate_expr_0_1, generate_exprIsNull_0_1, generate_expr_1_0, generate_col_1);
}
}
// Pulls rows from the scan and runs the full nested expansion for each one.
protected void processNext() throws java.io.IOException {
while (scan_input_0.hasNext()) {
InternalRow scan_row_0 = (InternalRow) scan_input_0.next();
((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
boolean scan_isNull_0 = scan_row_0.isNullAt(0);
UTF8String scan_value_0 = scan_isNull_0 ?
null : (scan_row_0.getUTF8String(0));
generate_doConsume_0(scan_value_0, scan_isNull_0);
// shouldStop() is only consulted here, i.e. after every row produced by the three nested
// loops for this input row has already been appended.
if (shouldStop()) return;
}
}
// Outermost expansion: coalesces the scanned column with the literal in references[1], splits
// the result on the literal in references[2] and feeds each element to generate_doConsume_1.
private void generate_doConsume_0(UTF8String generate_expr_0_0, boolean generate_exprIsNull_0_0) throws java.io.IOException {
generate_generate_isNull_3_0 = true;
UTF8String generate_value_3 = null;
// Same coalesce pattern as in generate_doConsume_1.
do {
if (!generate_exprIsNull_0_0) {
generate_generate_isNull_3_0 = false;
generate_value_3 = generate_expr_0_0;
continue;
}
if (!false) {
generate_generate_isNull_3_0 = false;
generate_value_3 = ((UTF8String) references[1] /* literal */);
continue;
}
} while (false);
ArrayData generate_value_2 = null;
generate_value_2 = new org.apache.spark.sql.catalyst.util.GenericArrayData(generate_value_3.split(((UTF8String) references[2] /* literal */), -1));
int generate_numElements_0 = false ? 0 : generate_value_2.numElements();
for (int generate_index_0 = 0; generate_index_0 < generate_numElements_0; generate_index_0++) {
((org.apache.spark.sql.execution.metric.SQLMetric) references[3] /* numOutputRows */).add(1);
// common sub-expressions
UTF8String generate_col_0 = generate_value_2.getUTF8String(generate_index_0);
generate_doConsume_1(generate_expr_0_0, generate_exprIsNull_0_0, generate_col_0);
}
}
}
explain output
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [__fcol_4#345 AS table#482, __fcol_5#346 AS table-aaa#483, __fcol_7#335 AS table-bbb#484, __fcol_12#423 AS table-1#485]
+- Project [__fcol_4#345, __fcol_5#346, __fcol_7#335, UDF(SPLITPART($0$, $1$, 1), , [$1$,$0$], [string_type,string_type], [], array(__fcol_11#417, __fcol_4#345), [], []) AS __fcol_12#423]
+- Project [__fcol_1#330 AS __fcol_4#345, __fcol_3#323 AS __fcol_5#346, __fcol_7#335, UDF(",", , [], [], [], [], [], []) AS __fcol_11#417]
+- Generate explode(split(coalesce(__fcol_1#330, ), :, -1)), [__fcol_1#330, __fcol_3#323], false, [__fcol_7#335]
+- Project [__fcol_0#320 AS __fcol_1#330, __fcol_3#323]
+- Generate explode(split(coalesce(__fcol_0#320, ), :, -1)), [__fcol_0#320], false, [__fcol_3#323]
+- Exchange SinglePartition, ENSURE_REQUIREMENTS, [id=#210]
+- Project [table#252 AS __fcol_0#320]
+- Scan [table#252] PushedFilters: [], ReadSchema: struct<table:string>
I tried to split the codegen into separate stages to avoid this OOM,
but I don't know how to do that. Is there a better way to solve this?
Related
I am new to apache spark and am trying to run a custom nearest neighbor algorithm on an RDD that has been partitioned into 2 parts using a custom partitioner. The JavaPairRDD contains the graph details and the random object created on the graph.
According to my logic, I am building subgraphs for each partition, and I am running a custom algorithm on each subgraph. It seems to be working "although not properly". I am not sure if this is the correct way to apply action in each partition. I am adding my code and the results as well. Comments and suggestions are highly appreciated.
// Pair layout: key = partition index; value = Map<source vertex, Map<destination vertex,
// Tuple2<edge length, list of road objects on that edge>>>.
// The pairs are distributed with CustomPartitioner, constructed with CustomPartitionSize.
JavaPairRDD<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>> adjVertForSubgraphsRDD = jscontext
.parallelizePairs(adjacentVerticesForSubgraphs)
.partitionBy(new CustomPartitioner(CustomPartitionSize));
// Apply foreachPartition to the pair RDD: build one subgraph per partition key (0 or 1) from the
// records seen in this partition, then run the naive nearest-neighbour search on each subgraph.
adjVertForSubgraphsRDD.foreachPartition(
        new VoidFunction<Iterator<Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>>>>() {
            private static final long serialVersionUID = 1L;

            /**
             * Consumes every record of the partition, routing its adjacency map into the subgraph
             * selected by the record key, and prints the nearest-neighbour pairs of both subgraphs.
             *
             * @param tupleRow iterator over the (partition key, adjacency map) records
             */
            @Override
            public void call(
                    Iterator<Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>>> tupleRow)
                    throws Exception {
                CoreGraph subgraph0 = new CoreGraph();
                CoreGraph subgraph1 = new CoreGraph();

                while (tupleRow.hasNext()) {
                    // Advance the iterator exactly once per pass. Calling next() separately for the
                    // key and for the value pairs the key of one record with the map of another,
                    // skips records and can run past the end of the iterator.
                    Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>> record =
                            tupleRow.next();
                    int partitionKey = Integer.parseInt(String.valueOf(record._1()));
                    if (partitionKey == 0) {
                        addEdges(subgraph0, record._2());
                    } else if (partitionKey == 1) {
                        addEdges(subgraph1, record._2());
                    }
                    // Records with any other key are ignored, as before.
                }

                // Straightforward nearest neighbor algorithm from each true to false.
                ANNNaive ann = new ANNNaive();
                System.err.println("-------------------------------");
                Map<Integer, Integer> nearestNeighorPairsSubg0 = ann.compute(subgraph0, true);
                System.out.println("for subgraph0");
                System.out.println(nearestNeighorPairsSubg0);
                System.err.println("-------------------------------");
                System.err.println("-------------------------------");
                Map<Integer, Integer> nearestNeighorPairsSubg1 = ann.compute(subgraph1, true);
                System.out.println("for subgraph1");
                System.out.println(nearestNeighorPairsSubg1);
                System.err.println("-------------------------------");
            }

            /** Adds every edge of the adjacency map, and the road objects on it, to the graph. */
            private void addEdges(CoreGraph graph,
                    Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>> adjacency) {
                for (Map.Entry<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>> source
                        : adjacency.entrySet()) {
                    for (Map.Entry<Object, Tuple2<Double, ArrayList<RoadObject>>> destination
                            : source.getValue().entrySet()) {
                        int sourceVertex = Integer.parseInt(String.valueOf(source.getKey()));
                        int destVertex = Integer.parseInt(String.valueOf(destination.getKey()));
                        double edgeLength = destination.getValue()._1();
                        graph.addEdge(sourceVertex, destVertex, edgeLength);

                        ArrayList<RoadObject> roadObjects = destination.getValue()._2();
                        if (roadObjects == null) {
                            continue; // edge without objects
                        }
                        for (RoadObject original : roadObjects) {
                            int currentEdgeId = graph.getEdgeId(sourceVertex, destVertex);
                            // Copy the object so the graph does not share instances with the RDD data.
                            RoadObject copy = new RoadObject();
                            copy.setObjId(original.getObjectId());
                            copy.setType(original.getType());
                            copy.setDistanceFromStartNode(original.getDistanceFromStartNode());
                            graph.addObjectOnEdge(currentEdgeId, copy);
                        }
                    }
                }
            }
        });
/**
 * Prints the titles of up to five items recommended for {@code to}.
 *
 * <p>The five first users in the score map of {@code to} are used as neighbours. Every item a
 * neighbour rated and {@code to} has not rated gets the neighbour's rating multiplied by the
 * neighbour's score added to its total; items are then printed in the order defined by
 * {@code ValueComparator}.
 *
 * @param map ratings per user (user -> item -> rating)
 * @param to  the user to recommend items for
 */
static void getRecommendations(Map<User, HashMap<Item, Double>> map, User to) {
    scores = sortMapByScore(scores, to);
    TreeMap<User, Double> neighbourScores = (TreeMap<User, Double>) scores.get(to);
    Map<Item, Double> ownRatings = map.get(to);
    Map<Item, Double> rec = new HashMap<Item, Double>();

    int used = 0;
    // Iterate over entries and take the score from the entry itself. TreeMap.get() locates keys
    // through the map's comparator; if that comparator orders by value and never reports two keys
    // as equal (presumably what sortMapByScore builds -- verify), get() returns null even for a key
    // taken from keySet(), and unboxing that null is the reported NullPointerException.
    for (Map.Entry<User, Double> neighbour : neighbourScores.entrySet()) {
        if (used >= 5) {
            break;
        }
        used++;
        Double similarity = neighbour.getValue();
        Map<Item, Double> neighbourRatings = map.get(neighbour.getKey());
        if (similarity == null || neighbourRatings == null) {
            continue; // nothing to contribute from this neighbour
        }
        for (Map.Entry<Item, Double> rating : neighbourRatings.entrySet()) {
            Item item = rating.getKey();
            if (ownRatings != null && ownRatings.containsKey(item)) {
                continue; // already rated by the target user
            }
            double weighted = rating.getValue() * similarity;
            Double soFar = rec.get(item);
            rec.put(item, soFar == null ? weighted : soFar + weighted);
        }
    }

    TreeMap<Item, Double> res = new TreeMap<Item, Double>(new ValueComparator(rec));
    res.putAll(rec);
    int printed = 0;
    for (Item item : res.keySet()) {
        System.out.println(item.getmTitle());
        printed++;
        if (printed == 5) {
            break;
        }
    }
}
I am using nested HashMap and TreeMap instances in this example, but I am running into the problem below.
In the code above in the line
rec.put(e1, map.get(u).get(e1) * scores.get(u).get(to));
I am getting a NullPointerException, even though I am using the same HashMap's keyset to get the Values.
I have followed https://gist.github.com/xrstf/b48a970098a8e76943b9 to integrate Nutch and Elasticsearch. Everything is working fine and the data is stored in the HBase 'webpage' table, but I am not able to fetch the data in Elasticsearch. I want to know how to get this data into Elasticsearch.
below is my code
package com.process;
/*
import package will be here
*/
public class HbaseToElastic extends Configured implements
org.apache.hadoop.util.Tool {
static class Mapper extends TableMapper<Text, IndexWritable> {
// Job settings; each one is copied from the job Configuration in setup().
public static String CLUSTER;
public static String SEARCH_HOST;
public static String SEARCH_PORT;
public static String SEARCH_INDEX_NAME;
public static String SEARCHtYPE;
public static int BULKSIZE;
public static String TABLENAME;
public static String FAMILY;
// Per-category keyword lists, filled in setup() from comma-separated configuration values
// and used by setCorrectCategory() to count matches in the description.
private static List<String> SPORTS_KEYWORDS;
private static List<String> BUSINESS_KEYWORDS;
private static List<String> GOSSIP_KEYWORDS;
private static List<String> CRIME_KEYWORDS;
// Replaced in setup() by the map parsed from the "statemap" JSON configuration value.
private static Map<String, Map<String, String>> STATE_MAP = new HashMap<String, Map<String, String>>();
private static Map<String, String> CITY_MAP = new HashMap<String, String>();
// Shared instance through which setLang() calls detect().
private static Mapper mapper = new Mapper();
// Loads the language-detection profiles from "./profiles" once, when the class is initialized.
// "done1"/"done2" bracket a successful load; "done3" is printed when loading fails.
static {
try {
System.out.println("done1");
DetectorFactory.loadProfile("./profiles");
System.out.println("done2");
} catch (final LangDetectException e) {
System.out.println("done3");
e.printStackTrace();
}
}
// HBase connection state, created in setup() and released in cleanup().
Configuration hbaseConf = null;
HTable table = null;
// Puts that mark rejected rows with ErrorFlag=1; collected in map() and written in cleanup().
List<Put> hbasePutErrorList = new ArrayList<Put>();
/**
* Clean up the hbase table object
*/
#Override
protected void cleanup(final Context context) throws IOException,
InterruptedException {
super.cleanup(context);
table.put(hbasePutErrorList);
table.close();
hbasePutErrorList.clear();
}
/**
* Initialize various variables
*/
#Override
protected void setup(
final org.apache.hadoop.mapreduce.Mapper<ImmutableBytesWritable, Result, Text, IndexWritable>.Context context)
throws IOException, InterruptedException {
final Configuration conf = context.getConfiguration();
CLUSTER = conf.get("cluster");
SEARCH_HOST = conf.get("search_host");
SEARCH_PORT = conf.get("search_port");
SEARCH_INDEX_NAME = conf.get("search_index_name");
SEARCHtYPE = conf.get("search_type");
BULKSIZE = conf.getInt("search_bulk_size", 500);
TABLENAME = conf.get("table_name");
FAMILY = conf.get("family");
hbaseConf = HBaseConfiguration.create();
hbaseConf.set("hbase.zookeeper.quorum",
conf.get("hbase.zookeeper.quorum"));
hbaseConf.set("hbase.zookeeper.property.clientPort",
conf.get("hbase.zookeeper.property.clientPort"));
hbaseConf.set("hbase.rpc.timeout", conf.get("hbase.rpc.timeout"));
hbaseConf.set("hbase.regionserver.lease.period",
conf.get("hbase.regionserver.lease.period"));
hbaseConf.set("hbase.master", conf.get("hbase.master"));
table = new HTable(hbaseConf, conf.get("table_name"));
SPORTS_KEYWORDS = new ArrayList<String>();
BUSINESS_KEYWORDS = new ArrayList<String>();
GOSSIP_KEYWORDS = new ArrayList<String>();
CRIME_KEYWORDS = new ArrayList<String>();
String keywrods = conf.get("sportskeywords");
String[] keyarr = keywrods.split(",");
for (final String key : keyarr) {
SPORTS_KEYWORDS.add(key.trim());
}
keywrods = conf.get("businesskeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
BUSINESS_KEYWORDS.add(key.trim());
}
keywrods = conf.get("gossipkeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
GOSSIP_KEYWORDS.add(key.trim());
}
keywrods = conf.get("crimekeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
CRIME_KEYWORDS.add(key.trim());
}
final String stateMap = conf.get("statemap");
final Gson g = new Gson();
STATE_MAP = g.fromJson(stateMap, Map.class);
}
/**
* map function
*/
#Override
public void map(final ImmutableBytesWritable row, final Result result,
final Context context) throws IOException, InterruptedException {
try {
final byte b = 0;
int deleteFlag = 0;
final String keyString = Bytes.toString(row.get());
final Map<String, Object> mapobject = new HashMap<String, Object>();
for (final KeyValue kv : result.raw()) {
final String key = (new String(kv.getQualifier()));
final String value = (new String(kv.getValue()));
mapobject.put(key, value);
}
final Gson g = new Gson();
if (checkValidType(mapobject)) {
refineMetaTags(mapobject);
if (refineDescription(mapobject)) {
assignCity(mapobject);
if (checkTitleImage(mapobject)) {
if (setLang(mapobject)) {
setCorrectCategory(mapobject);
correctDuplicateTitle(mapobject);
final String json = g.toJson(mapobject);
context.write(new Text(keyString),
new IndexWritable(json, b));
deleteFlag = 1;
}
}
}
}
if (deleteFlag == 0) {
final Put put = new Put(Bytes.toBytes(keyString));
put.add(Bytes.toBytes("cf"), Bytes.toBytes("ErrorFlag"),
Bytes.toBytes("1"));
hbasePutErrorList.add(put);
}
} catch (final Exception e) {
e.printStackTrace();
}
}
/**
 * Removes a repeated opening from the title. When the first three words of the title occur
 * again later in the title, the title is cut to start at that repetition.
 *
 * @param mapobject the document fields; "title" must be present and is updated in place
 */
private void correctDuplicateTitle(final Map<String, Object> mapobject) {
    final String title = mapobject.get("title").toString();
    final String[] parts = title.split(" ", 4);
    if (parts.length < 4) {
        return; // fewer than four words: nothing can be repeated
    }
    final String leadingWords = parts[0] + " " + parts[1] + " " + parts[2];
    final String remainder = parts[3];
    final int repeatAt = remainder.indexOf(leadingWords);
    if (repeatAt >= 0) {
        // Single write: the earlier version stored a first candidate that was
        // immediately overwritten by this value.
        mapobject.put("title", remainder.substring(repeatAt));
    }
}
/**
 * Corrects the "tags" category of a document using its URL and description.
 *
 * <ul>
 * <li>sports / cricket: if the URL has no sports or cricket marker, the category is kept only
 * when at least two sports keywords occur in the description (otherwise "national"); it becomes
 * "cricket" when the description also contains a cricket term.</li>
 * <li>business / gossip / crime: a sports, cricket or (for gossip and crime) business marker in
 * the URL re-tags the document accordingly; otherwise, if the URL has no marker of the current
 * category and fewer than two category keywords occur in the description, the category becomes
 * "national".</li>
 * <li>any other category is left unchanged.</li>
 * </ul>
 *
 * @param mapobject the document fields; "tags" is updated in place, "description" must be
 *                  present whenever the description has to be inspected
 */
private void setCorrectCategory(final Map<String, Object> mapobject) {
    // Lower-case once. Locale.ROOT keeps the ASCII marker checks independent of the JVM default
    // locale (e.g. Turkish maps 'I' to a dotless i, which would break "busines").
    final String url = (mapobject.get("url") + "").toLowerCase(java.util.Locale.ROOT);
    final String cat = mapobject.get("tags") + "";

    if ("sports".equalsIgnoreCase(cat) || "cricket".equalsIgnoreCase(cat)) {
        if (!containsAnyOf(url, "sport", "खेल", "cric", "क्रिकेट")) {
            final String desc = mapobject.get("description").toString();
            if (countKeywordHits(desc, SPORTS_KEYWORDS) > 1) {
                // The URL cannot contain "cric" in this branch, so only the description decides.
                if (containsAnyOf(desc, "क्रिकेट", "टॉस", "वनडे", "बल्लेबाज")) {
                    mapobject.put("tags", "cricket");
                }
            } else {
                mapobject.put("tags", "national");
            }
        }
        return;
    }

    final boolean business = "business".equalsIgnoreCase(cat);
    final String[] ownUrlMarkers;
    final List<String> ownKeywords;
    if (business) {
        ownUrlMarkers = new String[] {"busines", "व्यापार", "economy", "finance",
                "बिजनेस", "market", "karobar", "कारोबार"};
        ownKeywords = BUSINESS_KEYWORDS;
    } else if ("gossip".equalsIgnoreCase(cat)) {
        ownUrlMarkers = new String[] {"masala", "gossip", "gupshup", "garam"};
        ownKeywords = GOSSIP_KEYWORDS;
    } else if ("crime".equalsIgnoreCase(cat)) {
        ownUrlMarkers = new String[] {"crime", "terrorist", "abuse", "forgery", "assault",
                "violence", "rape", "teasing", "molestation", "scandal", "murder"};
        ownKeywords = CRIME_KEYWORDS;
    } else {
        return; // other categories (including "local...") are not corrected
    }

    if (containsAnyOf(url, "sport", "खेल")) {
        mapobject.put("tags", "sports");
    } else if (containsAnyOf(url, "cric", "क्रिकेट")) {
        mapobject.put("tags", "cricket");
    } else if (!business && url.contains("busines")) {
        mapobject.put("tags", "business");
    } else if (!containsAnyOf(url, ownUrlMarkers)) {
        final String desc = mapobject.get("description").toString();
        if (countKeywordHits(desc, ownKeywords) < 2) {
            mapobject.put("tags", "national");
        }
    }
}

/** Returns true when {@code text} contains at least one of {@code needles}. */
private static boolean containsAnyOf(final String text, final String... needles) {
    for (final String needle : needles) {
        if (text.contains(needle)) {
            return true;
        }
    }
    return false;
}

/** Counts how many entries of {@code keywords} occur in {@code text}. */
private static int countKeywordHits(final String text, final List<String> keywords) {
    int hits = 0;
    for (final String keyword : keywords) {
        if (text.contains(keyword)) {
            hits++;
        }
    }
    return hits;
}
/**
 * Tells whether the document has an acceptable content type.
 *
 * @param mapobject the document fields
 * @return {@code true} when a "type" field exists and its text contains neither "image" nor
 *         "rss"; {@code false} otherwise
 */
private boolean checkValidType(final Map<String, Object> mapobject) {
    if (!mapobject.containsKey("type")) {
        return false;
    }
    final String contentType = mapobject.get("type").toString();
    return !contentType.contains("image") && !contentType.contains("rss");
}
/**
 * Makes sure the document has a usable description.
 *
 * <p>The existing "description" is accepted when it is longer than 75 characters and free of
 * the markup/boilerplate fragments {@code ;}}, {@code <cite>}, {@code href=} and
 * "All rights reserved". Otherwise "metatag.description" is copied into "description" when it
 * is longer than 75 characters and contains neither {@code ;}} nor {@code <cite>}.
 *
 * @param mapobject the document fields; "description" may be replaced
 * @return {@code true} when a usable description is in place, {@code false} otherwise
 */
private boolean refineDescription(final Map<String, Object> mapobject) {
    if (mapobject.containsKey("description")) {
        final String body = mapobject.get("description").toString();
        final boolean usable = body.length() > 75
                && !body.contains(";}")
                && !body.contains("<cite>")
                && !body.contains("href=")
                && !body.contains("All rights reserved");
        if (usable) {
            return true;
        }
    }

    if (!mapobject.containsKey("metatag.description")) {
        return false;
    }
    final Object metaDescription = mapobject.get("metatag.description");
    final String metaText = metaDescription.toString();
    if (metaText.length() <= 75 || metaText.contains(";}")
            || metaText.contains("<cite>")) {
        return false;
    }
    mapobject.put("description", metaDescription);
    return true;
}
/**
 * Rebuilds the "metatag.keywords" field and records in "TagFlag" (1 or 0) whether any keyword
 * was produced. Three sources are tried in order, each only if the previous one yielded
 * nothing: the existing "metatag.keywords", the words of "title", and the tokens of the last
 * path segment of "url". Words starting with a Devanagari character or a digit are excluded.
 * The "url" field must be present when the first two sources yield nothing.
 *
 * @param mapobject the document fields; "metatag.keywords" and "TagFlag" are written
 */
private void refineMetaTags(final Map<String, Object> mapobject) {
String metaTag = "";
int tagFlag = 0;
// Source 1: the page's own keywords, separated by "|" or ",".
if (mapobject.containsKey("metatag.keywords")) {
final String metaTags[] = mapobject.get("metatag.keywords")
.toString().replaceAll("\\|", ",").split(",");
String domain = null;
StringBuilder temp = null;
for (final String metaTag2 : metaTags) {
// Skip keywords that merely repeat the site name: the second-to-last or the first
// dot-separated label of the host.
if (mapobject.containsKey("host")) {
domain = mapobject.get("host") + "";
if (domain.split("\\.").length > 1
&& (metaTag2
.contains(domain.split("\\.")[domain
.split("\\.").length - 2]) || metaTag2
.contains(domain.split("\\.")[0])))
{
continue;
}
}
String[] arr = metaTag2.split(" ");
arr = removeUnicodeWords(arr);
// Keep phrases of one to four surviving words.
if (arr.length > 0 && arr.length < 5) {
temp = new StringBuilder();
for (final String str : arr) {
temp.append(str);
temp.append(" ");
}
// Stop growing the list once it would reach 70 characters.
if (metaTag.length() + temp.length() < 70) {
metaTag = metaTag + "," + temp.toString();
}
}
}
// Drop the leading separator added by the concatenation above.
if (metaTag.startsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(1, metaTag.length());
}
}
// Source 2: words of the title.
if (metaTag.length() < 1 && mapobject.containsKey("title")) {
/**
 * Extracting tags from the title tag if the length of the
 * keyword is greater than 4
 */
final String title = (String) mapobject.get("title");
final String splitTitle[] = title.split(" ");
int count = 0;
for (int i = 0; i < splitTitle.length; i++) {
if (splitTitle[i].length() > 4
&& !splitTitle[i].matches("^[\\u0900-\\u097F].*")) {
metaTag = metaTag + splitTitle[i] + ",";
count++;
// At most five title words are used.
if (count == 5) {
break;
}
}
}
// The title result is kept only when it produced more than three words.
if (metaTag.split(",").length > 3) {
if (metaTag.endsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(0, metaTag.length() - 1);
}
} else {
metaTag = "";
}
}
// Source 3: tokens of the last URL path segment.
if (metaTag.length() < 1) {
/**
 * Extracting the tags from the url if the length of the keyword
 * is greater than 4
 */
final String splitUrl[] = mapobject.get("url").toString()
.split("/");
// NOTE(review): splitUrl is empty for a url made only of "/" characters, which would
// make this index -1 -- confirm such urls cannot reach this point.
final String lastSplitValue = splitUrl[splitUrl.length - 1];
final String tagList[] = generateTokens(lastSplitValue);
// generateTokens returns null when it finds too few tokens.
if (tagList != null) {
int count = 0;
for (int i = 0; i < tagList.length; i++) {
if (tagList[i].length() > 4
&& !tagList[i].matches("^[\\u0900-\\u097F].*")) {
metaTag = metaTag + tagList[i] + ",";
count++;
if (count == 5) {
break;
}
}
}
}
if (metaTag.endsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(0, metaTag.length() - 1);
}
}
// Strip punctuation from the final list: [ " ; : ' = &
if (metaTag.length() > 0) {
metaTag = metaTag.replaceAll("\\[", "");
metaTag = metaTag.replaceAll("\"", "");
metaTag = metaTag.replaceAll(";", "");
metaTag = metaTag.replaceAll(":", "");
metaTag = metaTag.replaceAll("\u0027", "");
metaTag = metaTag.replaceAll("\u003d", "");
metaTag = metaTag.replaceAll("\u0026", "");
tagFlag = 1;
}
mapobject.put("TagFlag", tagFlag);
mapobject.put("metatag.keywords", metaTag);
}
/**
 * Filters a list of words, keeping only trimmed words that are longer than three characters
 * and start with neither a Devanagari character nor a digit.
 *
 * @param arr the candidate words; entries may be {@code null}
 * @return the surviving words, trimmed, in their original order
 */
private String[] removeUnicodeWords(final String[] arr) {
    final List<String> kept = new ArrayList<String>();
    for (int i = 0; i < arr.length; i++) {
        final String word = arr[i];
        if (word == null || word.trim().length() <= 3) {
            continue;
        }
        if (word.matches("^[\\u0900-\\u097F].*") || word.matches("^[0-9].*")) {
            continue;
        }
        kept.add(word.trim());
    }
    return kept.toArray(new String[kept.size()]);
}
/**
 * Tokenizes the last path segment of a URL with the Lucene Hindi analyzer and returns the
 * tokens that are longer than four characters and do not start with a digit.
 *
 * @param lastSplitValue the URL segment; dots and "%20" are treated as word separators
 * @return the tokens when more than three were found, otherwise {@code null}
 */
private String[] generateTokens(String lastSplitValue) {
    final List<String> list = new ArrayList<String>();
    // String.replace matches literally, so the separator is "." itself. The previous
    // pattern "\\." looked for a backslash followed by a dot and never matched.
    lastSplitValue = lastSplitValue.replace(".", " ").replace("%20", " ");
    try {
        final Analyzer analyzer = new HindiAnalyzer(Version.LUCENE_45);
        try {
            final TokenStream ts = analyzer.tokenStream("field",
                    new StringReader(lastSplitValue));
            try {
                ts.reset();
                final CharTermAttribute cta = ts
                        .getAttribute(CharTermAttribute.class);
                while (ts.incrementToken()) {
                    final String token = cta.toString();
                    if (token.length() > 4 && !token.matches("^[0-9].*")) {
                        list.add(token);
                    }
                }
                ts.end();
            } finally {
                // Release the stream and the analyzer even when tokenizing fails.
                ts.close();
            }
        } finally {
            analyzer.close();
        }
    } catch (final Exception e) {
        // Best effort: on failure the tokens collected so far are used.
        e.printStackTrace();
    }
    if (list.size() > 3) {
        return list.toArray(new String[list.size()]);
    } else {
        return null;
    }
}
/**
 * Detects the language of the document and stores it in "lang" when it is one of the
 * accepted languages ("hi", anything starting with "en", or "lt").
 *
 * <p>Both the title and the description must be detectable; only the language of the title
 * is stored.
 *
 * @param mapobject the document fields; "title" must be present, "lang" may be written
 * @return {@code true} when an accepted language was detected and stored, {@code false} when
 *         detection failed or the language is not accepted
 */
private boolean setLang(final Map<String, Object> mapobject) {
    final String title = mapobject.get("title").toString();
    // Read the description from its own field (it used to be read from "title" by mistake).
    // Fall back to the title when no description is present.
    final Object rawDescription = mapobject.get("description");
    final String description = rawDescription == null ? title
            : rawDescription.toString();
    final String language;
    try {
        language = detect(title);
        detect(description); // only checked for detectability
    } catch (final LangDetectException e) {
        System.out.println("\n title with error is - " + title);
        System.out.println("\n description with error is - "
                + description);
        e.printStackTrace();
        return false;
    }
    final String code = language.trim();
    if (code.equalsIgnoreCase("hi") || code.startsWith("en")
            || code.equalsIgnoreCase("lt")) {
        mapobject.put("lang", language);
        return true;
    }
    return false;
}
/**
 * Detects the language of a text with a freshly created detector.
 *
 * @param text the text to analyse
 * @return the language reported by the detector
 * @throws LangDetectException if the detector cannot be created or detection fails
 */
private String detect(final String text) throws LangDetectException {
    final Detector languageDetector = DetectorFactory.create();
    languageDetector.append(text);
    final String detectedLanguage = languageDetector.detect();
    return detectedLanguage;
}
/**
 * Picks the best title for the document and decides whether the document should be kept.
 * Candidate titles (and an image, via assignTitleImage) are taken from the JSON stored in
 * "anchor"; a candidate starting with a Devanagari character is preferred among the first
 * two. The document is kept when its final title is non-empty, has between 3 and 19
 * space-separated words and contains no "<".
 *
 * @param mapobject
 *            the document fields; "title" may be replaced
 * @return {@code true} when the final title is acceptable, {@code false} otherwise
 */
private boolean checkTitleImage(final Map<String, Object> mapobject) {
// Candidate titles, ordered by SetSort (a comparator defined elsewhere in the project).
final TreeSet<String> set = new TreeSet<String>(new SetSort());
final Gson gson = new Gson();
JsonArray array = null;
JsonObject object2 = null;
if (mapobject.containsKey("anchor")
&& mapobject.get("anchor") != null) {
final String arr = (String) mapobject.get("anchor");
try {
// Expected shape: a JSON array whose elements are strings holding a JSON object.
array = gson.fromJson(arr, JsonArray.class);
for (final JsonElement jsonElement : array) {
try {
object2 = gson.fromJson(jsonElement.getAsString(),
JsonObject.class);
} catch (final Exception e) {
// The element is not a JSON object: treat its text as a bare title.
// object2 is still null here because it is reset after every element.
if (object2 == null) {
object2 = new JsonObject();
object2.addProperty("title",
jsonElement.getAsString());
object2.addProperty("href", "");
object2.addProperty("alt", "");
}
}
if (object2 != null) {
assignTitleImage(mapobject, set, object2);
}
object2 = null;
}
} catch (final ClassCastException e) {
// Fallback: the anchor value is parsed as a single JSON object instead of an array.
object2 = gson.fromJson(arr, JsonObject.class);
assignTitleImage(mapobject, set, object2);
} catch (final Exception e) {
e.printStackTrace();
}
if (!set.isEmpty()) {
// Look at the first two candidates only.
int loop = 0;
final List<String> tempList = new LinkedList<String>();
for (final String string : set) {
final String title = string;
tempList.add(title.trim());
loop++;
if (loop == 2) {
break;
}
}
if (!tempList.isEmpty()) {
// Prefer the first candidate if it starts with a Devanagari character, else the
// second if it does, else fall back to the first.
if (tempList.get(0).matches("^[\\u0900-\\u097F].*")) {
mapobject.put("title", tempList.get(0));
} else if (tempList.size() > 1
&& !(tempList.get(0)
.matches("^[\\u0900-\\u097F].*"))
&& tempList.get(1).matches(
"^[\\u0900-\\u097F].*")) {
mapobject.put("title", tempList.get(1));
} else {
mapobject.put("title", tempList.get(0));
}
}
}
}
// Final acceptance check on whatever title is now in place.
if (mapobject.containsKey("title")
&& mapobject.get("title").toString().length() > 0
&& mapobject.get("title").toString().split(" ").length > 2
&& mapobject.get("title").toString().split(" ").length < 20
&& !mapobject.get("title").toString().contains("<")) {
// Anchor candidates were already refined in assignTitleImage; refine the original
// title only when no candidate was found.
if (set.isEmpty()) {
mapobject.put("title",
getTitleRefined(mapobject.get("title") + ""));
}
return true;
}
return false;
}
/**
 * Harvests an image and a candidate title from one anchor object.
 *
 * <p>If the document has neither "ImgH1" nor "ImgH2" yet and the anchor's "href" points to a
 * .jpg, .jpeg or .gif file, the image is registered through {@code putImages}. If the anchor's
 * "title" is non-empty, has between 3 and 19 space-separated words and contains no "<", its
 * refined form is added to {@code set}.
 *
 * @param mapobject the document fields; "tags" must be present when an image is registered
 * @param set       collects the candidate titles
 * @param object2   the anchor object with optional "href" and "title" members
 */
private void assignTitleImage(final Map<String, Object> mapobject,
        final TreeSet<String> set, final JsonObject object2) {
    final boolean imageMissing = !mapobject.containsKey("ImgH1")
            && !mapobject.containsKey("ImgH2");
    if (imageMissing && object2.get("href") != null) {
        final String href = object2.get("href").getAsString();
        final String hrefLower = href.toLowerCase();
        final boolean looksLikeImage = hrefLower.contains(".jpg")
                || hrefLower.contains(".jpeg")
                || hrefLower.contains(".gif");
        if (href.length() > 0 && looksLikeImage) {
            final String category = mapobject.get("tags").toString().trim()
                    .toLowerCase();
            putImages(mapobject, href.trim(), category);
        }
    }

    if (object2.get("title") == null) {
        return;
    }
    final String anchorTitle = object2.get("title").getAsString();
    final int wordCount = anchorTitle.split(" ").length;
    if (anchorTitle.length() > 0 && wordCount > 2 && wordCount < 20
            && !anchorTitle.contains("<")) {
        set.add(getTitleRefined(anchorTitle).trim());
    }
}
/**
 * Refines a title by dropping noise characters and a site-name style prefix
 * or suffix.
 *
 * The characters {@code '}, {@code &} and {@code =} are removed. If the
 * result contains "-", the text before the first "-" is returned unless the
 * text after it starts with a character in U+0900..U+097F. Otherwise, if it
 * contains ":", the text after the first ":" is returned unless the text
 * before it starts with such a character. In every other case the cleaned
 * title is returned as is (untrimmed).
 *
 * @param title
 *            raw title text
 * @return refined title
 */
private String getTitleRefined(String title) {
    title = title.replace("'", "").replace("&", "").replace("=", "");
    final String trimmed = title.trim();
    if (title.contains("-")) {
        final String[] parts = trimmed.split("-");
        if (parts.length > 1
                && !parts[1].trim().matches("^[\\u0900-\\u097F].*")) {
            return parts[0].trim();
        }
    } else if (title.contains(":")) {
        final String[] parts = trimmed.split(":");
        // The length is checked first: a title made only of ':' splits into
        // an empty array, and indexing it used to throw.
        if (parts.length > 1
                && !parts[0].trim().matches("^[\\u0900-\\u097F].*")) {
            return parts[1].trim();
        }
    }
    return title;
}
/**
 * Stores the image source and the derived image paths on the record.
 *
 * "ImgSrc" is set to the HTML-unescaped URL followed by "##RAFTAAR##" and
 * the trimmed original URL. "ImgH1" and "ImgH2" are set to
 * "h1/" or "h2/" + category + "/" + first character of the file name + "/" +
 * file name, where the file name is the last "/"-separated part of the URL.
 * Any exception is caught and printed, in which case "ImgSrc" may already be
 * set while "ImgH1"/"ImgH2" are not.
 *
 * @param map2
 *            record that receives the "ImgSrc", "ImgH1" and "ImgH2" entries
 * @param imageUrl
 *            URL of the image
 * @param category
 *            category used as the second path segment
 */
private void putImages(final Map<String, Object> map2,
final String imageUrl, final String category) {
try {
map2.put("ImgSrc", StringEscapeUtils.unescapeHtml(imageUrl)
.trim());
// Re-read the value just stored; an empty URL ends the method here.
if (map2.containsKey("ImgSrc") && map2.get("ImgSrc") != null
&& map2.get("ImgSrc").toString().length() > 0) {
map2.put(
"ImgSrc",
StringEscapeUtils.unescapeHtml(map2.get("ImgSrc")
.toString())
+ "##RAFTAAR##"
+ imageUrl.trim());
} else {
return;
}
String imgNamearr[] = null;
try {
imgNamearr = imageUrl.split("/");
} catch (final Exception e) {
e.printStackTrace();
}
String imgName = null;
try {
// Last path segment of the URL.
imgName = imgNamearr[imgNamearr.length - 1];
} catch (final Exception e) {
e.printStackTrace();
}
// The bucket directory uses the first character of the name as it appears
// in the URL, before spaces are replaced below.
final String imagePath = "/"
+ String.valueOf(imgName.charAt(0));
imgName = imgName.replaceAll(" ", "_").replaceAll("%20", "_");
// NOTE(review): split() takes a regex, so ".jpg" matches any character
// followed by "jpg"; names without ".jpg" (e.g. "a.gif") also get ".jpg"
// appended - confirm this is intended.
if (imgName.split(".jpg").length > 0) {
imgName = imgName.split(".jpg")[0];
imgName = imgName + ".jpg";
}
map2.put("ImgH1", "h1/" + category + imagePath + "/" + imgName);
map2.put("ImgH2", "h2/" + category + imagePath + "/" + imgName);
} catch (final Exception e) {
e.printStackTrace();
}
}
/**
 * Indexes one record into Elasticsearch.
 *
 * A transport client is created for the cluster named by CLUSTER at
 * SEARCH_HOST:SEARCH_PORT, the record is indexed under SEARCH_INDEX_NAME /
 * SEARCHtYPE with the given id, and the client is closed again.
 *
 * @param mapobject
 *            document source to index
 * @param key
 *            document id; generally the unique URL
 */
public static void insertToElastic(final Map<String, Object> mapobject,
        final String key) {
    final Settings settings = ImmutableSettings.settingsBuilder()
            .put("cluster.name", CLUSTER).build();
    final Client client = new TransportClient(settings)
            .addTransportAddress(new InetSocketTransportAddress(
                    SEARCH_HOST, Integer.parseInt(SEARCH_PORT)));
    try {
        client.prepareIndex(SEARCH_INDEX_NAME, SEARCHtYPE, key)
                .setSource(mapobject).execute().actionGet();
    } finally {
        // Close even when indexing throws, so the client is not leaked.
        client.close();
    }
}
/**
 * Completes a state-only category with a city found in the description.
 *
 * Only categories ending in "/" are touched. When such a category has
 * exactly two "/"-separated parts, the second part is looked up in STATE_MAP
 * and the result is stored in CITY_MAP. The key of the first entry whose
 * value occurs in the record's "description" is appended to the category and
 * written back to "tags". When no city is found, "tags" is set to
 * "national".
 *
 * @param mapobject
 *            record being processed; reads "tags" and "description" and may
 *            rewrite "tags"
 */
private static void assignCity(final Map<String, Object> mapobject) {
    String category = mapobject.get("tags").toString();
    if (!category.endsWith("/")) {
        return;
    }
    boolean cityFound = false;
    final String catArr[] = category.split("/");
    if (catArr.length == 2) {
        final String state = catArr[1];
        CITY_MAP = STATE_MAP.get(state);
        final Object descriptionValue = mapobject.get("description");
        // An unknown state or a missing description means no city can match;
        // both used to end in a NullPointerException.
        if (CITY_MAP != null && descriptionValue != null) {
            final String description = descriptionValue.toString();
            for (final Entry<String, String> e : CITY_MAP.entrySet()) {
                if (description.contains(e.getValue())) {
                    category = category + e.getKey();
                    mapobject.put("tags", category);
                    cityFound = true;
                    break;
                }
            }
        }
    }
    if (!cityFound) {
        mapobject.put("tags", "national");
    }
}
}
/**
 * Writes a single cell to an HBase table.
 *
 * I/O failures are printed and swallowed, as before; the table is always
 * closed once it has been opened.
 *
 * @param tableName
 *            name of the table to open
 * @param rowKey
 *            row to update
 * @param family
 *            column family
 * @param qualifier
 *            column qualifier
 * @param value
 *            cell value
 * @param conf
 *            configuration used to open the table
 */
public static void updateIntoHbase(final String tableName,
        final String rowKey, final String family, final String qualifier,
        final String value, final Configuration conf) {
    HTable table = null;
    try {
        table = new HTable(conf, tableName);
        final Put put = new Put(Bytes.toBytes(rowKey));
        put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier),
                Bytes.toBytes(value));
        table.put(put);
    } catch (final IOException e) {
        // Covers a failed open as well, which used to fall through to a
        // NullPointerException on table.put.
        e.printStackTrace();
    } finally {
        if (table != null) {
            try {
                table.close();
            } catch (final IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Loads a two-level lookup table from a tab-separated file.
 *
 * Each line is split on tabs into at most three fields; lines with fewer
 * than three fields are ignored. The first field selects the outer entry,
 * the second is the inner key and the third the inner value. I/O errors are
 * printed and whatever was read so far is returned.
 *
 * @param fileName
 *            path of the tab-separated file
 * @return map of first field to (second field to third field); never null
 */
private static Map<String, Map<String, String>> returnMap(
        final String fileName) {
    final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
    BufferedReader br = null;
    try {
        br = new BufferedReader(new FileReader(fileName));
        String line;
        while ((line = br.readLine()) != null) {
            final String arr[] = line.split("\t", 3);
            if (arr.length != 3) {
                continue;
            }
            // Single lookup; the inner map is created on first use only.
            Map<String, String> inner = map.get(arr[0]);
            if (inner == null) {
                inner = new HashMap<String, String>();
                map.put(arr[0], inner);
            }
            inner.put(arr[1], arr[2]);
        }
    } catch (final FileNotFoundException e) {
        e.printStackTrace();
    } catch (final IOException e) {
        e.printStackTrace();
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }
    }
    return map;
}
/**
 * Command-line entry point: runs the job through ToolRunner and exits with
 * the status it returns.
 *
 * @param args
 *            command-line arguments forwarded to the tool
 * @throws Exception
 *             if the tool fails
 */
public static void main(final String[] args) throws Exception {
    final int exitCode = ToolRunner.run(new Configuration(),
            new HbaseToElastic(), args);
    System.exit(exitCode);
}
}
I have a working Safari extension and I am able to install it manually by dragging it onto the Safari web browser. I want to know how I can install it programmatically.
I have already done this for Firefox, Chrome and IE.
In Firefox, just copy your .xpi file to this folder ("C:\Users\admin\AppData\Roaming\Mozilla\Firefox\Profiles\xxx.default\extensions") on Windows 7 and your extension will be installed.
In Chrome, you have to write these registry keys:
Windows Registry Editor Version 5.00
[HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Google\Chrome\Extensions\dlilbimladfdhkfbbcbjjnbleakbogef]
"version"="3.6"
"path"="C:\\extension.crx"
But in Safari, when I copy my .safariextz file to the folder "C:\Users\admin\AppData\Local\Apple Computer\Safari\Extensions", the extension does not get installed.
Can anybody guide me on how to do this?
In the folder:
~\Users\<username>\AppData\Local\Apple Computer\Safari\Extensions
there is a file named Extensions.plist; you will also need to add an entry for your extension to this file.
Extensions.plist in the "~\Users\<username>\AppData\Local\Apple Computer\Safari\Extensions" folder is a binary file. To read it and add an entry, we can use this class:
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace PlistCS
{
public static class Plist
{
// Shared parser/writer state; every read and write resets it first.
// Offsets of objects inside objectTable.
private static List<int> offsetTable = new List<int>();
// Raw bytes of the plist being read or composed.
private static List<byte> objectTable = new List<byte>();
// Object reference counter (read from the trailer, or counted down while writing).
private static int refCount;
// Width in bytes of one object reference.
private static int objRefSize;
// Width in bytes of one offset-table entry.
private static int offsetByteSize;
// Position of the offset table within the file.
private static long offsetTableOffset;
#region Public Functions
// Opens the file at 'path' read-only and parses it with readPlist(Stream).
public static object readPlist(string path)
{
using (FileStream f = new FileStream(path, FileMode.Open, FileAccess.Read))
{
return readPlist(f);
}
}
// Parses plist content given as a string by encoding it as UTF-8 bytes first.
public static object readPlistSource(string source)
{
return readPlist(System.Text.Encoding.UTF8.GetBytes(source));
}
// Parses plist content held in memory by wrapping it in a MemoryStream.
public static object readPlist(byte[] data)
{
return readPlist(new MemoryStream(data));
}
// Classifies a stream by its first 8 bytes: Binary when they match the
// magic constant below, Xml otherwise. Advances the stream position; the
// caller is responsible for seeking back.
public static plistType getPlistType(Stream stream)
{
    byte[] magicHeader = new byte[8];
    stream.Read(magicHeader, 0, 8);
    // The constant is the ASCII text "bplist00" read as a little-endian Int64.
    bool isBinary = BitConverter.ToInt64(magicHeader, 0) == 3472403351741427810;
    return isBinary ? plistType.Binary : plistType.Xml;
}
// Reads a plist from a stream. With plistType.Auto the type is detected from
// the header and the stream is rewound to the start.
// NOTE(review): every type, including Xml, is decoded by the binary parser;
// this class contains no XML reader, so an XML plist will not parse.
// Disposing the BinaryReader also closes the supplied stream.
public static object readPlist(Stream stream, plistType type = plistType.Auto)
{
    if (type == plistType.Auto)
    {
        type = getPlistType(stream);
        stream.Seek(0, SeekOrigin.Begin);
    }
    using (BinaryReader reader = new BinaryReader(stream))
    {
        int length = (int)reader.BaseStream.Length;
        return readBinary(reader.ReadBytes(length));
    }
}
// Serializes 'value' as a binary plist and writes it to 'path', replacing any
// existing file (FileMode.Create).
public static void writeBinary(object value, string path)
{
using (BinaryWriter writer = new BinaryWriter(new FileStream(path, FileMode.Create)))
{
writer.Write(writeBinary(value));
}
}
// Serializes 'value' as a binary plist and writes it to 'stream'.
// Disposing the BinaryWriter also closes the supplied stream.
public static void writeBinary(object value, Stream stream)
{
using (BinaryWriter writer = new BinaryWriter(stream))
{
writer.Write(writeBinary(value));
}
}
// Serializes 'value' to binary plist bytes: magic header, object data,
// offset table and a 32-byte trailer. Uses and resets the shared static
// state, so concurrent calls would interfere with each other.
public static byte[] writeBinary(object value)
{
offsetTable.Clear();
objectTable.Clear();
refCount = 0;
objRefSize = 0;
offsetByteSize = 0;
offsetTableOffset = 0;
//Do not count the root node, subtract by 1
int totalRefs = countObject(value) - 1;
refCount = totalRefs;
// Smallest number of bytes able to hold the highest reference.
objRefSize = RegulateNullBytes(BitConverter.GetBytes(refCount)).Length;
// The compose/write helpers insert at the FRONT of objectTable, so the
// magic string is written last and ends up first.
composeBinary(value);
writeBinaryString("bplist00", false);
offsetTableOffset = (long)objectTable.Count;
// Entry for the root object (it starts right after the 8-byte magic).
offsetTable.Add(objectTable.Count - 8);
offsetByteSize = RegulateNullBytes(BitConverter.GetBytes(offsetTable[offsetTable.Count - 1])).Length;
List<byte> offsetBytes = new List<byte>();
offsetTable.Reverse();
for (int i = 0; i < offsetTable.Count; i++)
{
// Entries were recorded as distances from the end of the data; convert
// them to positions from the start.
offsetTable[i] = objectTable.Count - offsetTable[i];
byte[] buffer = RegulateNullBytes(BitConverter.GetBytes(offsetTable[i]), offsetByteSize);
Array.Reverse(buffer);
offsetBytes.AddRange(buffer);
}
objectTable.AddRange(offsetBytes);
// Trailer layout mirrors what parseTrailer reads: 6 zero bytes, offset
// entry size, reference size, object count, a zero Int64, table offset.
objectTable.AddRange(new byte[6]);
objectTable.Add(Convert.ToByte(offsetByteSize));
objectTable.Add(Convert.ToByte(objRefSize));
var a = BitConverter.GetBytes((long)totalRefs + 1);
Array.Reverse(a);
objectTable.AddRange(a);
objectTable.AddRange(BitConverter.GetBytes((long)0));
a = BitConverter.GetBytes(offsetTableOffset);
Array.Reverse(a);
objectTable.AddRange(a);
return objectTable.ToArray();
}
#endregion
#region Private Functions
// Parses binary plist bytes: resets the shared state, reads the 32-byte
// trailer, splits the data into object bytes and offset table, then decodes
// the object with reference 0.
private static object readBinary(byte[] data)
{
    offsetTable.Clear();
    objectTable.Clear();
    refCount = 0;
    objRefSize = 0;
    offsetByteSize = 0;
    offsetTableOffset = 0;

    List<byte> allBytes = new List<byte>(data);
    parseTrailer(allBytes.GetRange(allBytes.Count - 32, 32));

    int tableStart = (int)offsetTableOffset;
    objectTable = allBytes.GetRange(0, tableStart);
    parseOffsetTable(allBytes.GetRange(tableStart, allBytes.Count - tableStart - 32));
    return parseBinary(0);
}
// Counts the plist objects needed to store 'value': one for the value
// itself, plus one per dictionary key, plus the counts of all nested values.
// Only Dictionary<string, object> and List<object> are treated as containers
// (matched by exact runtime type name).
private static int countObject(object value)
{
    string typeName = value.GetType().ToString();
    if (typeName == "System.Collections.Generic.Dictionary`2[System.String,System.Object]")
    {
        Dictionary<string, object> dict = (Dictionary<string, object>)value;
        int total = dict.Keys.Count + 1;
        foreach (object child in dict.Values)
        {
            total += countObject(child);
        }
        return total;
    }
    if (typeName == "System.Collections.Generic.List`1[System.Object]")
    {
        int total = 1;
        foreach (object child in (List<object>)value)
        {
            total += countObject(child);
        }
        return total;
    }
    return 1;
}
// Writes a dictionary object. All values and then all keys are composed
// (each in reverse order, because the helpers insert at the front of
// objectTable), their references are recorded, and finally the dictionary
// header followed by the reference list is inserted at the front.
// Returns the header-plus-references bytes.
private static byte[] writeBinaryDictionary(Dictionary<string, object> dictionary)
{
    List<byte> buffer = new List<byte>();
    List<byte> header = new List<byte>();
    List<int> refs = new List<int>();

    // Snapshot keys and values once instead of re-copying them on every
    // loop iteration.
    object[] values = new object[dictionary.Count];
    dictionary.Values.CopyTo(values, 0);
    string[] keys = new string[dictionary.Count];
    dictionary.Keys.CopyTo(keys, 0);

    for (int i = values.Length - 1; i >= 0; i--)
    {
        composeBinary(values[i]);
        offsetTable.Add(objectTable.Count);
        refs.Add(refCount);
        refCount--;
    }
    for (int i = keys.Length - 1; i >= 0; i--)
    {
        composeBinary(keys[i]);
        offsetTable.Add(objectTable.Count);
        refs.Add(refCount);
        refCount--;
    }

    // Counts below 15 fit in the marker's low nibble; larger counts use
    // 0xF followed by an encoded integer.
    if (dictionary.Count < 15)
    {
        header.Add(Convert.ToByte(0xD0 | Convert.ToByte(dictionary.Count)));
    }
    else
    {
        header.Add(0xD0 | 0xf);
        header.AddRange(writeBinaryInteger(dictionary.Count, false));
    }

    // Inserting each reference at the front reverses the list again, which
    // yields key references followed by value references.
    foreach (int val in refs)
    {
        byte[] refBuffer = RegulateNullBytes(BitConverter.GetBytes(val), objRefSize);
        Array.Reverse(refBuffer);
        buffer.InsertRange(0, refBuffer);
    }
    buffer.InsertRange(0, header);
    objectTable.InsertRange(0, buffer);
    return buffer.ToArray();
}
// Writes an array object: composes the elements in reverse order (the
// helpers insert at the front of objectTable), records their references,
// then inserts the array header and reference list at the front.
// Returns the header-plus-references bytes.
private static byte[] composeBinaryArray(List<object> objects)
{
List<byte> buffer = new List<byte>();
List<byte> header = new List<byte>();
List<int> refs = new List<int>();
for (int i = objects.Count - 1; i >= 0; i--)
{
composeBinary(objects[i]);
offsetTable.Add(objectTable.Count);
refs.Add(refCount);
refCount--;
}
// Counts below 15 fit in the marker's low nibble; larger counts use 0xF
// followed by an encoded integer.
if (objects.Count < 15)
{
header.Add(Convert.ToByte(0xA0 | Convert.ToByte(objects.Count)));
}
else
{
header.Add(0xA0 | 0xf);
header.AddRange(writeBinaryInteger(objects.Count, false));
}
// Front insertion restores the original element order.
foreach (int val in refs)
{
byte[] refBuffer = RegulateNullBytes(BitConverter.GetBytes(val), objRefSize);
Array.Reverse(refBuffer);
buffer.InsertRange(0, refBuffer);
}
buffer.InsertRange(0, header);
objectTable.InsertRange(0, buffer);
return buffer.ToArray();
}
// Dispatches on the exact runtime type name of 'obj' to the matching writer
// and returns the bytes that writer produced. Types without a writer produce
// an empty array and write nothing.
private static byte[] composeBinary(object obj)
{
    switch (obj.GetType().ToString())
    {
        case "System.Collections.Generic.Dictionary`2[System.String,System.Object]":
            return writeBinaryDictionary((Dictionary<string, object>)obj);
        case "System.Collections.Generic.List`1[System.Object]":
            return composeBinaryArray((List<object>)obj);
        case "System.Byte[]":
            return writeBinaryByteArray((byte[])obj);
        case "System.Double":
            return writeBinaryDouble((double)obj);
        case "System.Int32":
            return writeBinaryInteger((int)obj, true);
        case "System.String":
            return writeBinaryString((string)obj, true);
        case "System.DateTime":
            return writeBinaryDate((DateTime)obj);
        case "System.Boolean":
            return writeBinaryBool((bool)obj);
        default:
            return new byte[0];
    }
}
// Writes a date object: marker byte 0x33 followed by the 8 bytes of the
// timestamp returned by PlistDateConverter.ConvertToAppleTimeStamp, in
// reversed byte order. Inserts at the front of objectTable.
public static byte[] writeBinaryDate(DateTime obj)
{
List<byte> buffer = new List<byte>(RegulateNullBytes(BitConverter.GetBytes(PlistDateConverter.ConvertToAppleTimeStamp(obj)), 8));
buffer.Reverse();
buffer.Insert(0, 0x33);
objectTable.InsertRange(0, buffer);
return buffer.ToArray();
}
// Writes a boolean object as a single byte: 9 for true, 8 for false
// (the values parseBinary maps back). Inserts at the front of objectTable.
public static byte[] writeBinaryBool(bool obj)
{
List<byte> buffer = new List<byte>(new byte[1] { (bool)obj ? (byte)9 : (byte)8 });
objectTable.InsertRange(0, buffer);
return buffer.ToArray();
}
// Encodes an integer object: marker 0x1n followed by 2^n bytes, most
// significant byte first. When 'write' is true the bytes are also inserted at
// the front of objectTable; otherwise they are only returned (used for the
// length prefix of other objects).
private static byte[] writeBinaryInteger(int value, bool write)
{
    // Widen to 64 bits, then drop unused high-order zero bytes.
    List<byte> buffer = new List<byte>(RegulateNullBytes(BitConverter.GetBytes((long)value)));

    // Pad to a power-of-two length (1, 2, 4 or 8). The previous
    // floating-point test 2^(log2 n) != n could not detect a 3-byte value
    // reliably, which produced a size marker that disagreed with the payload.
    while ((buffer.Count & (buffer.Count - 1)) != 0)
        buffer.Add(0);

    // n = log2(byte count), computed with integers.
    int exponent = 0;
    for (int size = buffer.Count; size > 1; size >>= 1)
        exponent++;

    int header = 0x10 | exponent;
    buffer.Reverse();
    buffer.Insert(0, Convert.ToByte(header));
    if (write)
        objectTable.InsertRange(0, buffer);
    return buffer.ToArray();
}
// Encodes a real object: marker 0x2n followed by 2^n bytes in reversed byte
// order, and inserts it at the front of objectTable. Trailing zero bytes of
// the BitConverter output are dropped down to a minimum of 4 bytes.
private static byte[] writeBinaryDouble(double value)
{
    List<byte> buffer = new List<byte>(RegulateNullBytes(BitConverter.GetBytes(value), 4));

    // Pad back to a power-of-two length (4 or 8). The previous
    // floating-point test 2^(log2 n) != n could leave 5 to 7 bytes behind a
    // marker announcing 4.
    while ((buffer.Count & (buffer.Count - 1)) != 0)
        buffer.Add(0);

    // n = log2(byte count), computed with integers.
    int exponent = 0;
    for (int size = buffer.Count; size > 1; size >>= 1)
        exponent++;

    int header = 0x20 | exponent;
    buffer.Reverse();
    buffer.Insert(0, Convert.ToByte(header));
    objectTable.InsertRange(0, buffer);
    return buffer.ToArray();
}
// Writes a data object: marker 0x4n (n = length when below 15, otherwise 0xF
// followed by an encoded integer length) and then the raw bytes. Inserts at
// the front of objectTable and returns the written bytes.
private static byte[] writeBinaryByteArray(byte[] value)
{
List<byte> buffer = new List<byte>(value);
List<byte> header = new List<byte>();
if (value.Length < 15)
{
header.Add(Convert.ToByte(0x40 | Convert.ToByte(value.Length)));
}
else
{
header.Add(0x40 | 0xf);
header.AddRange(writeBinaryInteger(buffer.Count, false));
}
buffer.InsertRange(0, header);
objectTable.InsertRange(0, buffer);
return buffer.ToArray();
}
// Writes a string and inserts it at the front of objectTable.
// Pure ASCII text is stored one byte per character under marker 0x5n; any
// other text is stored as big-endian UTF-16 under marker 0x6n, which is the
// form parseBinaryUnicodeString reads. Previously non-ASCII characters were
// written under the ASCII marker, or threw for code units above 0xFF.
// When 'head' is false no marker is written (used for the "bplist00" magic).
// The length field counts characters (UTF-16 code units), not bytes.
private static byte[] writeBinaryString(string value, bool head)
{
    bool isAscii = true;
    foreach (char chr in value)
    {
        if (chr > 0x7F)
        {
            isAscii = false;
            break;
        }
    }

    List<byte> buffer = new List<byte>(
        isAscii ? Encoding.ASCII.GetBytes(value) : Encoding.BigEndianUnicode.GetBytes(value));
    List<byte> header = new List<byte>();
    if (head)
    {
        int marker = isAscii ? 0x50 : 0x60;
        if (value.Length < 15)
        {
            header.Add(Convert.ToByte(marker | value.Length));
        }
        else
        {
            header.Add(Convert.ToByte(marker | 0xf));
            header.AddRange(writeBinaryInteger(value.Length, false));
        }
    }
    buffer.InsertRange(0, header);
    objectTable.InsertRange(0, buffer);
    return buffer.ToArray();
}
// Convenience overload: trims zero bytes while keeping at least one byte.
private static byte[] RegulateNullBytes(byte[] value)
{
return RegulateNullBytes(value, 1);
}
// Returns a copy of 'value' resized to its significant part: zero bytes are
// dropped from the end (highest index) while more than 'minBytes' remain,
// and zero bytes are appended at the end when fewer than 'minBytes' are
// present. The input array is left untouched (it used to be reversed in
// place as a side effect).
private static byte[] RegulateNullBytes(byte[] value, int minBytes)
{
    int length = value.Length;
    while (length > minBytes && value[length - 1] == 0)
        length--;

    byte[] result = new byte[Math.Max(length, minBytes)];
    Array.Copy(value, result, length);
    return result;
}
// Reads the 32-byte trailer into the shared state: byte 6 is the offset
// entry size, byte 7 the object reference size, bytes 12-15 the object count
// and bytes 24-31 the offset-table position (multi-byte values are stored
// most significant byte first, hence the reversals).
private static void parseTrailer(List<byte> trailer)
{
offsetByteSize = BitConverter.ToInt32(RegulateNullBytes(trailer.GetRange(6, 1).ToArray(), 4), 0);
objRefSize = BitConverter.ToInt32(RegulateNullBytes(trailer.GetRange(7, 1).ToArray(), 4), 0);
byte[] refCountBytes = trailer.GetRange(12, 4).ToArray();
Array.Reverse(refCountBytes);
refCount = BitConverter.ToInt32(refCountBytes, 0);
byte[] offsetTableOffsetBytes = trailer.GetRange(24, 8).ToArray();
Array.Reverse(offsetTableOffsetBytes);
offsetTableOffset = BitConverter.ToInt64(offsetTableOffsetBytes, 0);
}
// Splits the offset-table bytes into entries of offsetByteSize bytes each and
// appends every entry to offsetTable as an Int32.
// NOTE(review): an offsetByteSize of 0 (corrupt trailer) would make this loop
// never advance - confirm inputs are trusted.
private static void parseOffsetTable(List<byte> offsetTableBytes)
{
for (int i = 0; i < offsetTableBytes.Count; i += offsetByteSize)
{
byte[] buffer = offsetTableBytes.GetRange(i, offsetByteSize).ToArray();
Array.Reverse(buffer);
offsetTable.Add(BitConverter.ToInt32(RegulateNullBytes(buffer, 4), 0));
}
}
// Decodes the dictionary object with reference 'objRef'. The header gives the
// entry count; it is followed by that many key references and then that many
// value references, each objRefSize bytes wide.
private static object parseBinaryDictionary(int objRef)
{
    Dictionary<string, object> buffer = new Dictionary<string, object>();
    List<int> refs = new List<int>();

    // getCount already reports where the reference list starts, including
    // the exact width of an extended count. The former re-computation guessed
    // that width from the count's minimal byte length and could disagree with
    // the bytes actually present.
    int refStartPosition;
    int refCount = getCount(offsetTable[objRef], out refStartPosition);

    int refEndPosition = refStartPosition + refCount * 2 * objRefSize;
    for (int i = refStartPosition; i < refEndPosition; i += objRefSize)
    {
        byte[] refBuffer = objectTable.GetRange(i, objRefSize).ToArray();
        Array.Reverse(refBuffer);
        refs.Add(BitConverter.ToInt32(RegulateNullBytes(refBuffer, 4), 0));
    }
    for (int i = 0; i < refCount; i++)
    {
        buffer.Add((string)parseBinary(refs[i]), parseBinary(refs[i + refCount]));
    }
    return buffer;
}
// Decodes the array object with reference 'objRef'. The header gives the
// element count; it is followed by that many element references, each
// objRefSize bytes wide.
private static object parseBinaryArray(int objRef)
{
    List<object> buffer = new List<object>();
    List<int> refs = new List<int>();

    // getCount already reports where the reference list starts, including
    // the exact width of an extended count. The former re-computation guessed
    // that width from the count's minimal byte length and could disagree with
    // the bytes actually present.
    int refStartPosition;
    int refCount = getCount(offsetTable[objRef], out refStartPosition);

    int refEndPosition = refStartPosition + refCount * objRefSize;
    for (int i = refStartPosition; i < refEndPosition; i += objRefSize)
    {
        byte[] refBuffer = objectTable.GetRange(i, objRefSize).ToArray();
        Array.Reverse(refBuffer);
        refs.Add(BitConverter.ToInt32(RegulateNullBytes(refBuffer, 4), 0));
    }
    for (int i = 0; i < refCount; i++)
    {
        buffer.Add(parseBinary(refs[i]));
    }
    return buffer;
}
// Reads the element count of the object whose marker byte is at
// 'bytePosition'. A low nibble below 15 is the count itself; 15 means the
// count follows as an encoded integer. 'newBytePosition' receives the
// position of the first byte after the count.
private static int getCount(int bytePosition, out int newBytePosition)
{
    int lowNibble = objectTable[bytePosition] & 0xf;
    if (lowNibble == 15)
    {
        return (int)parseBinaryInt(bytePosition + 1, out newBytePosition);
    }
    newBytePosition = bytePosition + 1;
    return lowNibble;
}
// Decodes the object with reference 'objRef' by dispatching on the high
// nibble of its marker byte. Throws for marker types without a decoder.
private static object parseBinary(int objRef)
{
    int position = offsetTable[objRef];
    byte header = objectTable[position];
    switch (header & 0xF0)
    {
        case 0:
            // 0 is null, 9 is true, anything else in this group is false.
            if (header == 0)
            {
                return null;
            }
            return header == 9;
        case 0x10:
            return parseBinaryInt(position);
        case 0x20:
            return parseBinaryReal(position);
        case 0x30:
            return parseBinaryDate(position);
        case 0x40:
            return parseBinaryByteArray(position);
        case 0x50: // ASCII string
            return parseBinaryAsciiString(position);
        case 0x60: // Unicode string
            return parseBinaryUnicodeString(position);
        case 0xD0:
            return parseBinaryDictionary(objRef);
        case 0xA0:
            return parseBinaryArray(objRef);
    }
    throw new Exception("This type is not supported");
}
// Decodes a date object: the 8 bytes after the marker are byte-reversed,
// read as a double and converted with
// PlistDateConverter.ConvertFromAppleTimeStamp.
public static object parseBinaryDate(int headerPosition)
{
    byte[] raw = objectTable.GetRange(headerPosition + 1, 8).ToArray();
    Array.Reverse(raw);
    return PlistDateConverter.ConvertFromAppleTimeStamp(BitConverter.ToDouble(raw, 0));
}
// Decodes the integer object at 'headerPosition', discarding the position of
// the byte that follows it.
private static object parseBinaryInt(int headerPosition)
{
int output;
return parseBinaryInt(headerPosition, out output);
}
// Decodes the integer object at 'headerPosition' and returns it as a boxed
// Int32. The marker's low nibble n announces 2^n payload bytes;
// 'newHeaderPosition' receives the position just past the payload.
private static object parseBinaryInt(int headerPosition, out int newHeaderPosition)
{
    int sizeExponent = objectTable[headerPosition] & 0xf;
    int byteCount = 1 << sizeExponent;
    byte[] payload = objectTable.GetRange(headerPosition + 1, byteCount).ToArray();
    Array.Reverse(payload);
    // One marker byte plus the payload.
    newHeaderPosition = headerPosition + 1 + byteCount;
    return BitConverter.ToInt32(RegulateNullBytes(payload, 4), 0);
}
// Decodes the real object at 'headerPosition' and returns it as a boxed
// double. The marker's low nibble n announces 2^n payload bytes.
private static object parseBinaryReal(int headerPosition)
{
    byte header = objectTable[headerPosition];
    int byteCount = (int)Math.Pow(2, header & 0xf);
    byte[] buffer = objectTable.GetRange(headerPosition + 1, byteCount).ToArray();
    Array.Reverse(buffer);
    // A 4-byte payload is a single-precision value. Zero-extending it to 8
    // bytes and reading a double, as was done before, yields a wrong number
    // for anything other than 0.
    if (byteCount == 4)
    {
        return (double)BitConverter.ToSingle(buffer, 0);
    }
    return BitConverter.ToDouble(RegulateNullBytes(buffer, 8), 0);
}
// Decodes the ASCII string object at 'headerPosition': the count from the
// header is the number of bytes, each decoded as one ASCII character.
// Returns string.Empty for a zero-length string.
private static object parseBinaryAsciiString(int headerPosition)
{
int charStartPosition;
int charCount = getCount(headerPosition, out charStartPosition);
var buffer = objectTable.GetRange(charStartPosition, charCount);
return buffer.Count > 0 ? Encoding.ASCII.GetString(buffer.ToArray()) : string.Empty;
}
// Decodes the Unicode string object at 'headerPosition'. The count from the
// header is the number of 2-byte characters, stored high byte first.
private static object parseBinaryUnicodeString(int headerPosition)
{
    int charStartPosition;
    int charCount = getCount(headerPosition, out charStartPosition);
    byte[] buffer = objectTable.GetRange(charStartPosition, charCount * 2).ToArray();
    // Decode the stored big-endian bytes directly. The former code swapped
    // byte pairs only on little-endian hosts and then decoded with
    // Encoding.Unicode, which is little-endian on every platform, so
    // big-endian hosts got garbled text.
    return Encoding.BigEndianUnicode.GetString(buffer);
}
// Decodes the data object at 'headerPosition' and returns its payload as a
// byte[]; the count from the header is the payload length in bytes.
private static object parseBinaryByteArray(int headerPosition)
{
int byteStartPosition;
int byteCount = getCount(headerPosition, out byteStartPosition);
return objectTable.GetRange(byteStartPosition, byteCount).ToArray();
}
#endregion
}
// Format of a plist. Auto asks readPlist to detect the format from the
// stream header via getPlistType, which reports either Binary or Xml.
public enum plistType
{
Auto, Binary, Xml
}
// Conversions between Unix time, plist timestamps (seconds counted from
// 2001-01-01 00:00:00) and DateTime.
public static class PlistDateConverter
{
    // Seconds between the Unix epoch and the plist epoch.
    public static long timeDifference = 978307200;

    // Start of the plist time scale.
    private static readonly DateTime appleEpoch = new DateTime(2001, 1, 1, 0, 0, 0, 0);

    // Unix seconds -> plist seconds.
    public static long GetAppleTime(long unixTime)
    {
        return unixTime - timeDifference;
    }

    // Plist seconds -> Unix seconds.
    public static long GetUnixTime(long appleTime)
    {
        return timeDifference + appleTime;
    }

    // Plist seconds -> DateTime.
    public static DateTime ConvertFromAppleTimeStamp(double timestamp)
    {
        return appleEpoch.AddSeconds(timestamp);
    }

    // DateTime -> whole plist seconds (fractions are rounded down).
    public static double ConvertToAppleTimeStamp(DateTime date)
    {
        TimeSpan sinceEpoch = date - appleEpoch;
        return Math.Floor(sinceEpoch.TotalSeconds);
    }
}
}
Then use this method in the commit action of the Installer.cs class to add an entry for the extension to Extensions.plist:
// Registers the extension with Safari: makes sure Safari's Extensions folder
// and Extensions.plist exist (seeding the plist from the copy shipped with
// the installer), appends an entry for the extension to the plist's
// "Installed Extensions" list, and copies the .safariextz archive into the
// folder, replacing earlier copies.
public void InstallSafariExt()
{
    // NOTE(review): SpecialFolder.ApplicationData resolves to the roaming
    // profile, while the folder named in the accompanying text is under
    // AppData\Local (SpecialFolder.LocalApplicationData) - confirm which
    // one Safari actually reads.
    string appData = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData);
    string programFiles = Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles);

    string extDir = appData + "\\Apple Computer\\Safari\\Extensions";
    string safariExtPlist = extDir + "\\Extensions.plist";
    // The leading backslash must be escaped; "\Y" is not a valid escape
    // sequence and did not compile.
    string safariSetupPlist = programFiles + "\\YourComp\\YourSoft\\Extensions.plist";
    string safariExtFile = programFiles + "\\YourComp\\YourSoft\\YourExtName.safariextz";
    string safariInstallFile = extDir + "\\YourExtName.safariextz";

    // CreateDirectory does nothing when the folder already exists, so the
    // two previously duplicated branches collapse into one path.
    Directory.CreateDirectory(extDir);
    if (!File.Exists(safariExtPlist))
    {
        File.Copy(safariSetupPlist, safariExtPlist);
    }

    object obj = Plist.readPlist(safariExtPlist);
    Dictionary<string, object> dict = (Dictionary<string, object>)obj;

    Dictionary<string, object> newExt = new Dictionary<string, object>();
    newExt.Add("Hidden Bars", new List<object>());
    newExt.Add("Added Non-Default Toolbar Items", new List<object>());
    newExt.Add("Enabled", true);
    newExt.Add("Archive File Name", "YourExtName.safariextz");
    newExt.Add("Removed Default Toolbar Items", new List<object>());
    newExt.Add("Bundle Directory Name", "YourExtName.safariextension");

    List<object> listExt = (List<object>)dict["Installed Extensions"];
    listExt.Add(newExt);
    Plist.writeBinary(obj, safariExtPlist);

    // Remove earlier copies of the archive so the copy below cannot fail
    // because the destination already exists.
    string[] safExtFiles = Directory.GetFiles(extDir + "\\", "YourExtName*.safariextz");
    for (int i = 0; i < safExtFiles.Length; i++)
    {
        if (File.Exists(safExtFiles[i]))
            File.Delete(safExtFiles[i]);
    }
    File.Copy(safariExtFile, safariInstallFile);
}
This is my dictionary format:
word Frequency
Gone 60
Goes 10
Go 30
So far the system returns words starting with, e.g., 'g' as the list go 30, goes 10, gone 60
(i.e. alphabetically). I want to increase the accuracy of the system so that the search results are ordered by frequency, with high-frequency words appearing first. Kindly help.
Here is the Text midlet class, which reads the dictionary line by line.
public class Text extends MIDlet {
// Fields
// Characters assigned to each keypad key: the entry at index i belongs to
// key i + 1 (keys 1 to 9), as labelled on each entry below.
private static final String[] DEFAULT_KEY_CODES = {
// 1
".,?!'\"1-()#/:_",
// 2
"ABC2",
// 3
"DEF3",
// 4
"GHI4",
// 5
"JKL5",
// 6
"MNO6",
// 7
"PQRS7",
// 8
"TUV8",
// 9
"WXYZ9",
};
// Creates the command handler and the vector that holds the dictionary lines.
// NOTE(review): this is written as a constructor but is named ComposeText
// while the enclosing class is declared as Text; one of the two names has to
// change for this to compile.
public ComposeText() {
cmdHandler = new CommandHandler();
lineVector = new Vector();
}
// MIDlet start: initialises the display, applies the theme, creates the
// commands (which also loads the dictionary), builds the main form and
// shows it.
public void startApp() {
Display.init(this);
setTheme();
initCmd();
initMainGui();
mainFrm.show();
}
// Nothing to do when the MIDlet is paused.
public void pauseApp() {
}
// Nothing to release when the MIDlet is destroyed.
public void destroyApp(boolean unconditional) {
}
// Creates all commands and the prediction text field, then loads the
// dictionary. A failure to read the dictionary is printed and otherwise
// ignored, so the application starts with whatever was read.
public void initCmd() {
exitCmd = new Command("Exit");
selectCmd = new Command("Ok");
cancelCmd = new Command("Cancel");
predCmd = new Command("Prediction");
sendCmd = new Command("Send");
tfPredArea = new TextField();
// Load the dictionary.
try {
readFile();
} catch (IOException ex) {
ex.printStackTrace();
}
}
// Builds the main "Compose Text" form: a numeric recipient field, a message
// field and the Exit / Prediction / Send commands.
public void initMainGui() {
    mainFrm = new Form("Compose Text");
    // Components are placed by explicit coordinates. The BorderLayout that
    // used to be set just before this call was replaced immediately and
    // never took effect.
    mainFrm.setLayout(new CoordinateLayout(150, 150));
    mainFrm.addCommand(exitCmd);
    mainFrm.addCommand(predCmd);
    mainFrm.addCommand(sendCmd);
    // NOTE(review): sendCmd is added but has no handling below.
    mainFrm.addCommandListener(new ActionListener() {
        public void actionPerformed(ActionEvent ae) {
            if (ae.getSource() == predCmd) {
                initPredGui();
            } else if (ae.getSource() == exitCmd) {
                destroyApp(true);
                notifyDestroyed();
            }
        }
    });

    // To : 07xxxxxxxxxx
    Dimension d1 = new Dimension(130, 20);
    lbTo = new Label("To:");
    lbTo.setX(10);
    lbTo.setY(10);
    tfTo = new TextField();
    tfTo.setReplaceMenu(false);
    tfTo.setConstraint(TextField.NUMERIC);
    tfTo.setInputModeOrder(new String[]{"123"});
    tfTo.setMaxSize(13);
    tfTo.setX(40);
    tfTo.setY(10);
    tfTo.setPreferredSize(d1);

    // Message : Compose Text
    Dimension d2 = new Dimension(135, 135);
    lbSms = new Label("Message:");
    lbSms.setX(5);
    lbSms.setY(40);
    tfSms = new TextField();
    tfSms.setReplaceMenu(false);
    tfSms.setX(40);
    tfSms.setY(40);
    tfSms.setPreferredSize(d2);

    // Add everything to the form.
    mainFrm.addComponent(lbTo);
    mainFrm.addComponent(lbSms);
    mainFrm.addComponent(tfTo);
    mainFrm.addComponent(tfSms);
}
// Builds and shows the prediction form: a text field plus a suggestion list
// that is narrowed to the dictionary lines starting with the word currently
// being typed.
public void initPredGui() {
    predForm = new Form("Prediction on");
    predForm.setLayout(new CoordinateLayout(150, 150));
    predForm.addCommand(cancelCmd);
    predForm.addCommand(selectCmd);

    // Text field of the prediction form.
    final Dimension d5 = new Dimension(200, 200);
    tfPredArea = new TextField();
    tfPredArea.setReplaceMenu(false);
    tfPredArea.setX(10);
    tfPredArea.setY(10);
    tfPredArea.setPreferredSize(d5);
    predForm.addComponent(tfPredArea);

    final ListModel underlyingModel = new DefaultListModel(lineVector);
    // Alternative source kept for reference (it was a comment that had
    // wrapped onto a second line and broke compilation):
    // new DefaultListModel(tree.getAllPrefixMatches(avail));

    // This list model narrows down the underlying model.
    final SortListModel proxyModel = new SortListModel(underlyingModel);
    final List suggestion = new List(proxyModel);

    tfPredArea.addDataChangeListener(new DataChangedListener() {
        public void dataChanged(int type, int index) {
            final String input = tfPredArea.getText();
            if (input.length() == 0) {
                return;
            }
            // The current word starts after the last space. k is recomputed
            // on every change: a value left over from longer text used to
            // make substring() fail once the text got shorter.
            k = input.lastIndexOf(' ');
            proxyModel.filter(input.substring(k + 1));
        }
    });

    Dimension d3 = new Dimension(110, 120);
    suggestion.setX(80);
    suggestion.setY(80);
    suggestion.setPreferredSize(d3);
    predForm.addComponent(suggestion);

    suggestion.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent ae) {
            final String string = suggestion.getSelectedItem().toString();
            final String current = tfPredArea.getText();
            // Test for empty text before looking at the first character;
            // the reverse order threw on an empty field.
            if (current.length() == 0 || current.charAt(0) == 0) {
                tfPredArea.setText(string);
            } else {
                // NOTE(review): the suggestion is appended after the typed
                // prefix rather than replacing it - confirm this is wanted.
                tfPredArea.setText(current + string);
            }
        }
    });

    predForm.addCommandListener(new ActionListener() {
        public void actionPerformed(ActionEvent ae) {
            if (ae.getSource() == addCmd) {
                newDictionaryFrm.show();
            } else {
                mainFrm.show();
            }
        }
    });
    predForm.show();
}
// Applies the first theme found in /theme.res to all forms. When the
// resource cannot be read a message is printed and no theme is applied.
public void setTheme() {
try {
Resources r = Resources.open("/theme.res");
UIManager.getInstance().setThemeProps(r.getTheme(
r.getThemeResourceNames()[0]));
} catch (java.io.IOException e) {
System.err.println("Couldn't load the theme");
}
}
// Inner class that reacts to the Cancel and Ok commands of the prediction
// form.
public class CommandHandler implements ActionListener {
    public void actionPerformed(ActionEvent ae) {
        final Object source = ae.getSource();
        if (source == cancelCmd) {
            // Cancel from the prediction form: go back to the main form
            // when editing.
            if (edit) {
                mainFrm.show();
                // clearFields();
            }
        } else if (source == selectCmd) {
            // This branch used to sit inside the cancelCmd branch, where
            // the source could never be selectCmd.
            tfPredList.addDataChangeListener(model);
            predForm.show();
        }
    }
}
// Reads the bundled dictionary resource line by line and appends every line
// to lineVector. Also creates a fresh Trie in 'tree'.
// Throws IOException when the resource is missing or cannot be read.
public void readFile() throws IOException {
    tree = new Trie();
    final java.io.InputStream in = getClass().getResourceAsStream(
            "/Maa Corpus.txt-01-ngrams-Alpha.txt");
    if (in == null) {
        // Report a missing resource as an I/O problem instead of failing
        // with a NullPointerException.
        throw new IOException("Dictionary resource not found");
    }
    final InputStreamReader reader = new InputStreamReader(in);
    try {
        String line = null;
        // Read a single line from the file; null represents the EOF.
        while ((line = readLine(reader)) != null) {
            // Append to a vector to be used as a list.
            lineVector.addElement(line);
        }
    } finally {
        // The reader was never closed before.
        reader.close();
    }
}
// Reads one line from the reader. Returns the text up to, but not including,
// the next '\n' (or the end of input); '\r' characters are dropped. Returns
// null only when the reader is already at the end of input.
public String readLine(InputStreamReader reader) throws IOException {
    int ch = reader.read();
    if (ch == -1) {
        // Nothing left to read.
        return null;
    }
    final StringBuffer line = new StringBuffer("");
    for (; ch != -1 && ch != '\n'; ch = reader.read()) {
        if (ch == '\r') {
            // Part of a "\r\n" line ending; not content.
            continue;
        }
        line.append((char) ch);
    }
    return line.toString();
}
}
}
The SortListModel class has a filter method that receives the prefix from the text field's DataChangedListener.
/**
 * List model proxy that shows only those items of an underlying
 * {@link ListModel} whose text starts with a given prefix.
 *
 * Until {@link #filter(String)} is called, the proxy is transparent and
 * every index maps to itself. Afterwards, indices used by callers of this
 * class are positions in the filtered view and are translated to
 * positions in the underlying model.
 *
 * NOTE(review): the filter is a snapshot of underlying indices. It is not
 * recomputed when the underlying model gains or loses items, so call
 * {@link #filter(String)} again after such changes.
 */
class SortListModel implements ListModel, DataChangedListener {
    /** Model that actually holds the items. */
    private ListModel underlying;
    /**
     * Underlying indices (boxed as Integer, in ascending order) of the
     * items that passed the last filter; null while no filter is active.
     */
    private Vector filter;
    /** DataChangedListener instances registered on this proxy. */
    private Vector listeners = new Vector();

    /**
     * Wraps the given model and subscribes to its data-change events.
     *
     * @param underlying the model to wrap
     */
    public SortListModel(ListModel underlying) {
        this.underlying = underlying;
        underlying.addDataChangedListener(this);
    }

    /**
     * Translates a position in the filtered view into an index of the
     * underlying model.
     *
     * @return the underlying index, or -1 if a filter is active and
     *         {@code index} is outside the filtered view
     */
    private int getFilterOffset(int index) {
        if (filter == null) {
            return index;
        }
        // Both bounds are checked: a negative index used to reach
        // elementAt() and throw instead of yielding -1.
        if (index >= 0 && index < filter.size()) {
            return ((Integer) filter.elementAt(index)).intValue();
        }
        return -1;
    }

    /**
     * Translates an index of the underlying model into a position in the
     * filtered view.
     *
     * @return the filtered position, or -1 if a filter is active and the
     *         item is not part of the filtered view
     */
    private int getUnderlyingOffset(int index) {
        if (filter == null) {
            return index;
        }
        return filter.indexOf(new Integer(index));
    }

    /**
     * Restricts the view to items whose text starts with {@code str},
     * ignoring case, then notifies listeners that the data changed.
     *
     * @param str the prefix to match; null is treated as the empty
     *            prefix, which matches every non-null item
     */
    public void filter(String str) {
        String prefix = (str == null) ? "" : str.toUpperCase();
        // Build the new index list aside and publish it only once complete,
        // so a failure mid-scan cannot leave a half-built filter behind.
        Vector matches = new Vector();
        int size = underlying.getSize();
        for (int iter = 0; iter < size; iter++) {
            Object element = underlying.getItemAt(iter);
            // toString() instead of a String cast: null items are skipped and
            // non-String items are matched by their text.
            if (element != null
                    && element.toString().toUpperCase().startsWith(prefix)) {
                matches.addElement(new Integer(iter));
            }
        }
        filter = matches;
        dataChanged(DataChangedListener.CHANGED, -1);
    }

    /**
     * Returns the item at the given position of the (filtered) view, or
     * null if that position does not map to an underlying item.
     */
    public Object getItemAt(int index) {
        int offset = getFilterOffset(index);
        if (offset < 0) {
            return null;
        }
        return underlying.getItemAt(offset);
    }

    /** Returns the number of items visible through the proxy. */
    public int getSize() {
        if (filter == null) {
            return underlying.getSize();
        }
        return filter.size();
    }

    /**
     * Returns the filtered position of the underlying selection, or 0 if
     * the selected item is not part of the filtered view.
     */
    public int getSelectedIndex() {
        return Math.max(0, getUnderlyingOffset(underlying.getSelectedIndex()));
    }

    /** Selects, in the underlying model, the item at the given view position. */
    public void setSelectedIndex(int index) {
        underlying.setSelectedIndex(getFilterOffset(index));
    }

    public void addDataChangedListener(DataChangedListener l) {
        listeners.addElement(l);
    }

    public void removeDataChangedListener(DataChangedListener l) {
        listeners.removeElement(l);
    }

    /** Selection listeners are registered directly on the underlying model. */
    public void addSelectionListener(SelectionListener l) {
        underlying.addSelectionListener(l);
    }

    public void removeSelectionListener(SelectionListener l) {
        underlying.removeSelectionListener(l);
    }

    /** Adds the item to the underlying model; the active filter is not updated. */
    public void addItem(Object item) {
        underlying.addItem(item);
    }

    /**
     * Removes the item at the given position of the (filtered) view.
     * The position is translated first, like in getItemAt and
     * setSelectedIndex, so the item removed is the one at that view
     * position. Positions that map to no item are ignored.
     */
    public void removeItem(int index) {
        int offset = getFilterOffset(index);
        if (offset < 0) {
            return;
        }
        underlying.removeItem(offset);
    }

    /**
     * Forwards a change to the listeners of this proxy. A non-negative
     * index is translated to its filtered position first; changes to
     * items outside the filtered view are not forwarded.
     */
    public void dataChanged(int type, int index) {
        if (index > -1) {
            index = getUnderlyingOffset(index);
            if (index < 0) {
                return;
            }
        }
        for (int iter = 0; iter < listeners.size(); iter++) {
            ((DataChangedListener) listeners.elementAt(iter)).dataChanged(type, index);
        }
    }
}