I am using a Spark session to save a DataFrame to a Hive table. The code is as below.
df.write.mode(SaveMode.Append).format("orc").insertInto("table")
The data comes to Spark from Kafka, and a huge amount of it can arrive throughout the day. Does the Spark DataFrame save internally perform Hive compaction? If not, what is the best way to do compaction at regular intervals without affecting the table insertions?
In your example you should add partitionBy, since the amount of data can be huge:
df.write.mode(SaveMode.Append).format("orc").partitionBy("age")
Or you can also achieve it as below.
The way I have done this is to first register a temp table in the Spark job itself and then leverage the sql method of the HiveContext to create a new table in Hive using the data from the temp table. For example, if I have a DataFrame df and a HiveContext hc, the general process is:
df.registerTempTable("my_temp_table")
hc.sql("Insert into overwrite table_name PARTITION SELECT a,b, PARTITION_col from my_temp_table")
public class HiveCompaction {
private static SparkConf sparkConf;
private static JavaSparkContext sc;
private static SparkSession sqlContext = springutil.getBean("testSparkSession");
private static HashMap<Object, Object> partitionColumns;
public static void compact(String table, Dataset<Row> dataToCompact) {
logger.info("Started Compaction for - " + table);
if (!partitionColumns.containsKey(table)) {
compact_table_without_partition(table, dataToCompact);
} else {
compact_table_with_partition(table, dataToCompact, partitionColumns);
}
logger.info("Data Overwritten in HIVE table : " + table + " successfully");
}
private static void compact_table_with_partition(String table, Dataset<Row> dataToCompact,
Map<Object, Object> partitionData) {
String[] partitions = ((String) partitionData.get(table)).split(",");
List<Map<Object, Object>> partitionMap = getPartitionsToCompact(dataToCompact, Arrays.asList(partitions));
for (Map mapper : partitionMap) {
// sqlContext.sql("REFRESH TABLE staging.dummy_table");
String query = "select * from " + table + " where " + frameQuery(" and ", mapper);
Dataset<Row> originalTable = sqlContext.sql(query.toString());
if (originalTable.count() == 0) {
dataToCompact.write().mode("append").format("parquet").insertInto(table);
} else {
String location = getHdfsFileLocation(table);
String uuid = getUUID();
updateTable(table, dataToCompact, originalTable, uuid);
String destinationPath = framePath(location, frameQuery("/", mapper), uuid);
sqlContext.sql("Alter table " + table + " partition(" + frameQuery(",", mapper) + ") set location '"
+ destinationPath + "'");
}
}
}
private static void compact_table_without_partition(String table, Dataset<Row> dataToCompact) {
String query = "select * from " + table;
Dataset<Row> originalTable = sqlContext.sql(query.toString());
if (originalTable.count() == 0) {
dataToCompact.write().mode("append").format("parquet").insertInto(table);
} else {
String location = getHdfsFileLocation(table);
String uuid = getUUID();
String destinationPath = framePath(location, null, uuid);
updateTable(table, dataToCompact, originalTable, uuid);
sqlContext.sql("Alter table " + table + " set location '" + destinationPath + "'");
}
}
private static void updateTable(String table, Dataset<Row> dataToCompact, Dataset<Row> originalTable, String uuid) {
Seq<String> joinColumnSeq = getPrimaryKeyColumns();
Dataset<Row> unModifiedRecords = originalTable.join(dataToCompact, joinColumnSeq, "leftanti");
Dataset<Row> dataToInsert1 = dataToCompact.withColumn("uuid", functions.lit(uuid));
Dataset<Row> dataToInsert2 = unModifiedRecords.withColumn("uuid", functions.lit(uuid));
dataToInsert1.write().mode("append").format("parquet").insertInto(table + "_compacted");
dataToInsert2.write().mode("append").format("parquet").insertInto(table + "_compacted");
}
private static String getHdfsFileLocation(String table) {
Dataset<Row> tableDescription = sqlContext.sql("describe formatted " + table + "_compacted");
List<Row> rows = tableDescription.collectAsList();
String location = null;
for (Row r : rows) {
if (r.get(0).equals("Location")) {
location = r.getString(1);
break;
}
}
return location;
}
private static String frameQuery(String delimiter, Map mapper) {
StringBuilder modifiedQuery = new StringBuilder();
int i = 1;
for (Object key : mapper.keySet()) {
modifiedQuery.append(key + "=");
modifiedQuery.append(mapper.get(key));
if (mapper.size() > i)
modifiedQuery.append(delimiter);
i++;
}
return modifiedQuery.toString();
}
private static String framePath(String location, String framedpartition, String uuid) {
StringBuilder loc = new StringBuilder(location);
loc.append("/");
if (StringUtils.isNotEmpty(framedpartition)) {
loc.append(framedpartition);
loc.append("/");
}
loc.append("uuid=");
loc.append(uuid);
logger.info(loc.toString());
return loc.toString();
}
public static Seq<String> getColumnSeq(List<String> joinColumns) {
List<String> cols = new ArrayList<>(joinColumns.size());
for (int i = 0; i < joinColumns.size(); i++) {
cols.add(joinColumns.get(i).toLowerCase());
}
return JavaConverters.asScalaBufferConverter(cols).asScala().readOnly();
}
private static String getUUID() {
Random rand = new Random();
int randNum = rand.nextInt(200);
// timestamp plus a random suffix; not a true UUID, but unique enough to name a new location
String uuid = DateTimeFormatter.ofPattern("yyyyMMddHHmmSSS").format(LocalDateTime.now())
+ String.valueOf(randNum);
return uuid;
}
private static List<Map<Object, Object>> getPartitionsToCompact(Dataset<Row> filteredRecords,
List<String> partitions) {
Column[] columns = new Column[partitions.size()];
int index = 0;
for (String c : partitions) {
columns[index] = new Column(c);
index++;
}
Dataset<Row> partitionsToCompact = filteredRecords.select(columns)
.distinct(); // TODO: add filter condition for selecting known partitions
JavaRDD<Map<Object, Object>> querywithPartitions = partitionsToCompact.toJavaRDD().map(row -> {
return convertRowToMap(row);
});
return querywithPartitions.collect();
}
private static Map<Object, Object> convertRowToMap(Row row) {
StructField[] fields = row.schema().fields();
List<StructField> structFields = Arrays.asList(fields);
Map<Object, Object> a = structFields.stream()
.collect(Collectors.toMap(e -> ((StructField) e).name(), e -> row.getAs(e.name())));
return a;
}
private static Seq<String> getPrimaryKeyColumns() {
// NOTE: populate this list with the table's primary-key columns before running a compaction
ArrayList<String> primaryKeyColumns = new ArrayList<String>();
Seq<String> joinColumnSeq = getColumnSeq(primaryKeyColumns);
return joinColumnSeq;
}
/*
* public static void initSpark(String jobname) { sparkConf = new
* SparkConf().setAppName(jobname); sparkConf.setMaster("local[3]");
* sparkConf.set("spark.driver.allowMultipleContexts", "true"); sc = new
* JavaSparkContext(); sqlContext = new SQLContext(sc); }
*/
public static HashMap<Object, Object> getParitionColumns() {
HashMap<Object, Object> paritionColumns = new HashMap<Object, Object>();
paritionColumns.put((Object) "staging.dummy_table", "trade_date,dwh_business_date,region_cd");
return paritionColumns;
}
public static void initialize(String table) {
// initSpark("Hive Table Compaction -" + table);
partitionColumns = getParitionColumns();
}
}
Usage:
String table = "staging.dummy_table";
HiveCompaction.initialize(table);
Dataset<Row> dataToCompact = sparkSession.sql("select * from staging.dummy_table");
HiveCompaction.compact(table, dataToCompact);
sparkSession.sql("select * from staging.dummy_table_compacted").show();
System.out.println("Compaction successful");
Related
I am new to Apache Spark and am trying to run a custom nearest neighbor algorithm on an RDD that has been partitioned into 2 parts using a custom partitioner. The JavaPairRDD contains the graph details and the random objects created on the graph.
According to my logic, I am building subgraphs for each partition and running a custom algorithm on each subgraph. It seems to be working, "although not properly". I am not sure if this is the correct way to apply an action in each partition; a sketch of an alternative per-partition pattern is shown after my code below. I am adding my code and the results as well. Comments and suggestions are highly appreciated.
// <Partition_Index_Key, Map<Source_vertex, Map<Destination Vertex, Tuple2<Edge_Length, ArrayList of Random Objects>>
JavaPairRDD<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>> adjVertForSubgraphsRDD = jscontext
.parallelizePairs(adjacentVerticesForSubgraphs)
.partitionBy(new CustomPartitioner(CustomPartitionSize));
//applying foreachPartition action on JavaPairRDD
adjVertForSubgraphsRDD.foreachPartition(
new VoidFunction<Iterator<Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>>>>() {
/**
*
*/
private static final long serialVersionUID = 1L;
@Override
public void call(
Iterator<Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>>> tupleRow)
throws Exception {
int sourceVertex;
int destVertex;
double edgeLength;
int roadObjectId;
boolean roadObjectType;
double distanceFromStart;
CoreGraph subgraph0 = new CoreGraph();
CoreGraph subgraph1 = new CoreGraph();
while (tupleRow.hasNext()) {
// read each tuple exactly once; calling tupleRow.next() again would skip elements
Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>> tuple = tupleRow.next();
Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>> newMap = tuple._2();
if ((Integer.parseInt(String.valueOf(tuple._1())) == 0)) {
for (Object srcVertex : newMap.keySet()) {
for (Object dstVertex : newMap.get(srcVertex).keySet()) {
if (newMap.get(srcVertex).get(dstVertex)._2() != null) {
sourceVertex = Integer.parseInt(String.valueOf(srcVertex));
destVertex = Integer.parseInt(String.valueOf(dstVertex));
edgeLength = newMap.get(srcVertex).get(dstVertex)._1();
subgraph0.addEdge(sourceVertex, destVertex, edgeLength);
for (int i = 0; i < newMap.get(srcVertex).get(dstVertex)._2()
.size(); i++) {
int currentEdgeId = subgraph0.getEdgeId(sourceVertex, destVertex);
roadObjectId = newMap.get(srcVertex).get(dstVertex)._2().get(i)
.getObjectId();
roadObjectType = newMap.get(srcVertex).get(dstVertex)._2().get(i)
.getType();
distanceFromStart = newMap.get(srcVertex).get(dstVertex)._2().get(i)
.getDistanceFromStartNode();
RoadObject rn0 = new RoadObject();
rn0.setObjId(roadObjectId);
rn0.setType(roadObjectType);
rn0.setDistanceFromStartNode(distanceFromStart);
subgraph0.addObjectOnEdge(currentEdgeId, rn0);
}
} else {
sourceVertex = Integer.parseInt(String.valueOf(srcVertex));
destVertex = Integer.parseInt(String.valueOf(dstVertex));
edgeLength = newMap.get(srcVertex).get(dstVertex)._1();
subgraph0.addEdge(sourceVertex, destVertex, edgeLength);
}
}
}
} else if ((Integer.parseInt(String.valueOf(tuple._1())) == 1)) {
for (Object srcVertex : newMap.keySet()) {
for (Object dstVertex : newMap.get(srcVertex).keySet()) {
if (newMap.get(srcVertex).get(dstVertex)._2() != null) {
sourceVertex = Integer.parseInt(String.valueOf(srcVertex));
destVertex = Integer.parseInt(String.valueOf(dstVertex));
edgeLength = newMap.get(srcVertex).get(dstVertex)._1();
subgraph1.addEdge(sourceVertex, destVertex, edgeLength);
for (int i = 0; i < newMap.get(srcVertex).get(dstVertex)._2()
.size(); i++) {
int currentEdgeId = subgraph1.getEdgeId(sourceVertex, destVertex);
roadObjectId = newMap.get(srcVertex).get(dstVertex)._2().get(i)
.getObjectId();
roadObjectType = newMap.get(srcVertex).get(dstVertex)._2().get(i)
.getType();
distanceFromStart = newMap.get(srcVertex).get(dstVertex)._2().get(i)
.getDistanceFromStartNode();
RoadObject rn1 = new RoadObject();
rn1.setObjId(roadObjectId);
rn1.setType(roadObjectType);
rn1.setDistanceFromStartNode(distanceFromStart);
subgraph1.addObjectOnEdge(currentEdgeId, rn1);
}
} else {
sourceVertex = Integer.parseInt(String.valueOf(srcVertex));
destVertex = Integer.parseInt(String.valueOf(dstVertex));
edgeLength = newMap.get(srcVertex).get(dstVertex)._1();
subgraph1.addEdge(sourceVertex, destVertex, edgeLength);
}
}
}
}
}
// Straight forward nearest neighbor algorithm from each true to false.
ANNNaive ann = new ANNNaive();
System.err.println("-------------------------------");
Map<Integer, Integer> nearestNeighorPairsSubg0 = ann.compute(subgraph0, true);
System.out.println("for subgraph0");
System.out.println(nearestNeighorPairsSubg0);
System.err.println("-------------------------------");
System.err.println("-------------------------------");
Map<Integer, Integer> nearestNeighorPairsSubg1 = ann.compute(subgraph1, true);
System.out.println("for subgraph1");
System.out.println(nearestNeighorPairsSubg1);
System.err.println("-------------------------------");
}
});
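As mentioned above, here is a minimal sketch of an alternative per-partition pattern: mapPartitionsWithIndex builds one subgraph per partition and returns each partition's result to the driver instead of printing it inside foreachPartition. It assumes the same CoreGraph, ANNNaive and RoadObject classes and the adjVertForSubgraphsRDD built above, uses org.apache.spark.api.java.function.Function2, and elides the subgraph population:
Function2<Integer, Iterator<Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>>>, Iterator<Map<Integer, Integer>>> buildAndCompute =
    (partitionIndex, rows) -> {
        CoreGraph subgraph = new CoreGraph();
        while (rows.hasNext()) {
            Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>> tuple = rows.next();
            // ... populate subgraph from tuple._2() exactly as in the loops above ...
        }
        // one nearest-neighbor result per partition
        return Collections.singletonList(new ANNNaive().compute(subgraph, true)).iterator();
    };
JavaRDD<Map<Integer, Integer>> perPartitionResults =
    adjVertForSubgraphsRDD.mapPartitionsWithIndex(buildAndCompute, true);
List<Map<Integer, Integer>> results = perPartitionResults.collect();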
I'm trying to build a method that deletes entries whose attributes are not null, but I keep failing with the predicates. I don't know how to implement the predicate properly, because Hazelcast doesn't accept "NOT NULL" or "IS NULL" as a where clause. Any idea how I can find the values I need in the map and delete them?
the method
public int removeEntry(String CacheName, String claveReq, int id_interno_pe, String cod_nrbe_en, int num_sec_ac) {
int counter = 0;
IMap<String, ResponseSerializablePlus> map = clientInstance.getMap(CacheName);
// Predicate claveReqQuery = Predicates.equal("claveReq", claveReq);
// Predicate idInternoQuery = Predicates.equal("id_interno_pe", id_interno_pe);
// Predicate codNrbeQuery = Predicates.equal("cod_nrbe_en", cod_nrbe_en);
// Predicate numSecQuery = Predicates.equal("num_sec_ac", num_sec_ac);
// Predicate query = Predicates.and(idInternoQuery,codNrbeQuery,numSecQuery);
Predicate query = Predicates.sql("id_interno_pe IS NOT NULL");
if (!map.isEmpty()) {
for (String key : map.keySet(query)) {
System.out.println("Entry "+map.get(key).toString()+" Found");
map.delete(key); // delete by key; IMap.delete expects the key, not the value
counter++;
}
System.out.println("Map Size ->"+map.size());
System.out.println("Deleted entries -> "+counter);
return counter;
}else {
System.out.println("No matches");
return 0;
}
}
the main class ResponseSerializablePlus
public class ResponseSerializablePlus implements IdentifiedDataSerializable{
private int id_interno_pe;
private String cod_nrbe_en;
private int num_sec_ac;
private int statusCode;
private HashMap<String,List<String>> headers;
private byte[] content;
public ResponseSerializablePlus(int id_interno_pe, String cod_nrbe_en, int num_sec_ac, int statusCode,
HashMap<String, List<String>> headers, byte[] content) {
this.id_interno_pe = id_interno_pe;
this.cod_nrbe_en = cod_nrbe_en;
this.num_sec_ac = num_sec_ac;
this.statusCode = statusCode;
this.headers = headers;
this.content = content;
}
public ResponseSerializablePlus() {
}
public void writeData(ObjectDataOutput out) throws IOException {
out.writeInt(id_interno_pe);
out.writeString(cod_nrbe_en);
out.writeInt(num_sec_ac);
out.writeInt(statusCode);
out.writeObject(headers);
out.writeByteArray(content);
}
public void readData(ObjectDataInput in) throws IOException {
this.id_interno_pe = in.readInt();
this.cod_nrbe_en = in.readString();
this.num_sec_ac = in.readInt();
this.statusCode = in.readInt();
this.headers = in.readObject();
this.content = in.readByteArray();
}
public int getFactoryId() {
return ResponseSerializablePlusFactory.FACTORY_ID;
}
public int getClassId() {
return ResponseSerializablePlusFactory.RESPONSE_SERIALIZABLE_PLUS_CLASS;
}
@Override
public String toString() {
return "ResponseSerializablePlus [id_interno_pe=" + id_interno_pe + ", cod_nrbe_en=" + cod_nrbe_en
+ ", num_sec_ac=" + num_sec_ac + ", statusCode=" + statusCode + ", headers=" + headers + ", content="
+ Arrays.toString(content) + "]";
}
}
It's indeed not supported, and it's documented, along with the alternative option. Have a look here: https://github.com/hazelcast/hazelcast/blob/3cb8ce1fc3bb848aced6c87a30bf7b31aec16cf7/hazelcast/src/main/java/com/hazelcast/query/Predicates.java#L492-L497
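If I read that section of Predicates.java correctly, the suggested alternative is to compare against NULL directly instead of using IS [NOT] NULL. A minimal sketch (keeping the raw Predicate style of the question, and deleting by key rather than by value) might be:
Predicate query = Predicates.sql("id_interno_pe != NULL");
for (String key : map.keySet(query)) {
    map.delete(key); // delete by key; passing the value object would not remove anything
}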
I have followed https://gist.github.com/xrstf/b48a970098a8e76943b9 to integrate Nutch and Elasticsearch. Everything is working fine and the data is stored in the HBase 'webpage' table, but I am not able to fetch the data in Elasticsearch. I want to know how to fetch the data in Elasticsearch.
Below is my code:
package com.process;
/*
import package will be here
*/
public class HbaseToElastic extends Configured implements
org.apache.hadoop.util.Tool {
static class Mapper extends TableMapper<Text, IndexWritable> {
public static String CLUSTER;
public static String SEARCH_HOST;
public static String SEARCH_PORT;
public static String SEARCH_INDEX_NAME;
public static String SEARCHtYPE;
public static int BULKSIZE;
public static String TABLENAME;
public static String FAMILY;
private static List<String> SPORTS_KEYWORDS;
private static List<String> BUSINESS_KEYWORDS;
private static List<String> GOSSIP_KEYWORDS;
private static List<String> CRIME_KEYWORDS;
private static Map<String, Map<String, String>> STATE_MAP = new HashMap<String, Map<String, String>>();
private static Map<String, String> CITY_MAP = new HashMap<String, String>();
private static Mapper mapper = new Mapper();
static {
try {
System.out.println("done1");
DetectorFactory.loadProfile("./profiles");
System.out.println("done2");
} catch (final LangDetectException e) {
System.out.println("done3");
e.printStackTrace();
}
}
Configuration hbaseConf = null;
HTable table = null;
List<Put> hbasePutErrorList = new ArrayList<Put>();
/**
* Clean up the hbase table object
*/
@Override
protected void cleanup(final Context context) throws IOException,
InterruptedException {
super.cleanup(context);
table.put(hbasePutErrorList);
table.close();
hbasePutErrorList.clear();
}
/**
* Initialize various variables
*/
@Override
protected void setup(
final org.apache.hadoop.mapreduce.Mapper<ImmutableBytesWritable, Result, Text, IndexWritable>.Context context)
throws IOException, InterruptedException {
final Configuration conf = context.getConfiguration();
CLUSTER = conf.get("cluster");
SEARCH_HOST = conf.get("search_host");
SEARCH_PORT = conf.get("search_port");
SEARCH_INDEX_NAME = conf.get("search_index_name");
SEARCHtYPE = conf.get("search_type");
BULKSIZE = conf.getInt("search_bulk_size", 500);
TABLENAME = conf.get("table_name");
FAMILY = conf.get("family");
hbaseConf = HBaseConfiguration.create();
hbaseConf.set("hbase.zookeeper.quorum",
conf.get("hbase.zookeeper.quorum"));
hbaseConf.set("hbase.zookeeper.property.clientPort",
conf.get("hbase.zookeeper.property.clientPort"));
hbaseConf.set("hbase.rpc.timeout", conf.get("hbase.rpc.timeout"));
hbaseConf.set("hbase.regionserver.lease.period",
conf.get("hbase.regionserver.lease.period"));
hbaseConf.set("hbase.master", conf.get("hbase.master"));
table = new HTable(hbaseConf, conf.get("table_name"));
SPORTS_KEYWORDS = new ArrayList<String>();
BUSINESS_KEYWORDS = new ArrayList<String>();
GOSSIP_KEYWORDS = new ArrayList<String>();
CRIME_KEYWORDS = new ArrayList<String>();
String keywrods = conf.get("sportskeywords");
String[] keyarr = keywrods.split(",");
for (final String key : keyarr) {
SPORTS_KEYWORDS.add(key.trim());
}
keywrods = conf.get("businesskeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
BUSINESS_KEYWORDS.add(key.trim());
}
keywrods = conf.get("gossipkeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
GOSSIP_KEYWORDS.add(key.trim());
}
keywrods = conf.get("crimekeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
CRIME_KEYWORDS.add(key.trim());
}
final String stateMap = conf.get("statemap");
final Gson g = new Gson();
STATE_MAP = g.fromJson(stateMap, Map.class);
}
/**
* map function
*/
@Override
public void map(final ImmutableBytesWritable row, final Result result,
final Context context) throws IOException, InterruptedException {
try {
final byte b = 0;
int deleteFlag = 0;
final String keyString = Bytes.toString(row.get());
final Map<String, Object> mapobject = new HashMap<String, Object>();
for (final KeyValue kv : result.raw()) {
final String key = (new String(kv.getQualifier()));
final String value = (new String(kv.getValue()));
mapobject.put(key, value);
}
final Gson g = new Gson();
if (checkValidType(mapobject)) {
refineMetaTags(mapobject);
if (refineDescription(mapobject)) {
assignCity(mapobject);
if (checkTitleImage(mapobject)) {
if (setLang(mapobject)) {
setCorrectCategory(mapobject);
correctDuplicateTitle(mapobject);
final String json = g.toJson(mapobject);
context.write(new Text(keyString),
new IndexWritable(json, b));
deleteFlag = 1;
}
}
}
}
if (deleteFlag == 0) {
final Put put = new Put(Bytes.toBytes(keyString));
put.add(Bytes.toBytes("cf"), Bytes.toBytes("ErrorFlag"),
Bytes.toBytes("1"));
hbasePutErrorList.add(put);
}
} catch (final Exception e) {
e.printStackTrace();
}
}
/**
* Remove duplicate statement in the title
*
* @param mapobject
*/
private void correctDuplicateTitle(final Map<String, Object> mapobject) {
final String duplicateTitle = mapobject.get("title").toString();
final String stripedTitleArr[] = duplicateTitle.split(" ", 4);
if (stripedTitleArr.length == 4) {
final String subString = stripedTitleArr[0] + " "
+ stripedTitleArr[1] + " " + stripedTitleArr[2];
if (stripedTitleArr[3].contains(subString)) {
mapobject.put("title", duplicateTitle
.substring(duplicateTitle.indexOf(subString,
subString.length() - 1)));
mapobject.put("title", stripedTitleArr[3]
.substring(stripedTitleArr[3].indexOf(subString)));
}
}
}
/**
* Set category based on the various category specific keyword
*
* @param mapobject
*/
private void setCorrectCategory(final Map<String, Object> mapobject) {
final String url = mapobject.get("url") + "";
final String cat = mapobject.get("tags") + "";
if ("sports".equalsIgnoreCase(cat)
|| "cricket".equalsIgnoreCase(cat)) {
if (!(url.toLowerCase().contains("sport")
|| url.toLowerCase().contains("खेल")
|| url.toLowerCase().contains("cric") || url
.toLowerCase().contains("क्रिकेट"))) {
final String desc = mapobject.get("description").toString();
boolean isSports = false;
int count = 0;
for (final String keyword : SPORTS_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count > 1) {
isSports = true;
}
if (!isSports) {
mapobject.put("tags", "national");
}
if (isSports
&& (desc.contains("क्रिकेट")
|| url.toLowerCase().contains("cric")
|| desc.contains("टॉस")
|| desc.contains("वनडे") || desc
.contains("बल्लेबाज"))) {
mapobject.put("tags", "cricket");
}
}
} else if ("business".equalsIgnoreCase(cat)) {
if ((url.toLowerCase().contains("sport") || url.toLowerCase()
.contains("खेल"))) {
mapobject.put("tags", "sports");
} else if (url.toLowerCase().contains("cric")
|| url.toLowerCase().contains("क्रिकेट")) {
mapobject.put("tags", "cricket");
} else if (!(url.toLowerCase().contains("busines")
|| url.toLowerCase().contains("व्यापार")
|| url.toLowerCase().contains("economy")
|| url.toLowerCase().contains("finance")
|| url.toLowerCase().contains("बिजनेस")
|| url.toLowerCase().contains("market")
|| url.toLowerCase().contains("karobar") || url
.contains("कारोबार"))) {
final String desc = mapobject.get("description").toString();
int count = 0;
for (final String keyword : BUSINESS_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count < 2) {
mapobject.put("tags", "national");
}
}
} else if ("gossip".equalsIgnoreCase(cat)) {
if ((url.toLowerCase().contains("sport") || url.toLowerCase()
.contains("खेल"))) {
mapobject.put("tags", "sports");
} else if (url.toLowerCase().contains("cric")
|| url.toLowerCase().contains("क्रिकेट")) {
mapobject.put("tags", "cricket");
} else if (url.toLowerCase().contains("busines")) {
mapobject.put("tags", "business");
} else if (!(url.toLowerCase().contains("masala")
|| url.toLowerCase().contains("gossip")
|| url.toLowerCase().contains("gupshup") || url
.toLowerCase().contains("garam"))) {
final String desc = mapobject.get("description").toString();
int count = 0;
for (final String keyword : GOSSIP_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count < 2) {
mapobject.put("tags", "national");
}
}
} else if ("crime".equalsIgnoreCase(cat)) {
if ((url.toLowerCase().contains("sport") || url.toLowerCase()
.contains("खेल"))) {
mapobject.put("tags", "sports");
} else if (url.toLowerCase().contains("cric")
|| url.toLowerCase().contains("क्रिकेट")) {
mapobject.put("tags", "cricket");
} else if (url.toLowerCase().contains("busines")) {
mapobject.put("tags", "business");
} else if (!(url.toLowerCase().contains("crime")
|| url.toLowerCase().contains("terrorist")
|| url.toLowerCase().contains("abuse")
|| url.toLowerCase().contains("forgery")
|| url.toLowerCase().contains("assault")
|| url.toLowerCase().contains("violence")
|| url.toLowerCase().contains("rape")
|| url.toLowerCase().contains("teasing")
|| url.toLowerCase().contains("molestation")
|| url.toLowerCase().contains("scandal") || url
.toLowerCase().contains("murder"))) {
final String desc = mapobject.get("description").toString();
int count = 0;
for (final String keyword : CRIME_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count < 2) {
mapobject.put("tags", "national");
}
}
} else if (cat != null && cat.startsWith("local")) {
}
}
/**
* Check valid type of the HTML pages
*
* @param mapobject
* @return
*/
private boolean checkValidType(final Map<String, Object> mapobject) {
if (mapobject.containsKey("type")
&& !(mapobject.get("type").toString().contains("image") || mapobject
.get("type").toString().contains("rss"))) {
return true;
}
return false;
}
/**
* Refine the description based on its length and content; if a usable
* description is not present, fall back to the metatag description.
*
* @param mapobject
* @return {@link Boolean}
*/
private boolean refineDescription(final Map<String, Object> mapobject) {
if (mapobject.containsKey("description")
&& mapobject.get("description").toString().length() > 75
&& !mapobject.get("description").toString().contains(";}")
&& !mapobject.get("description").toString()
.contains("<cite>")
&& !mapobject.get("description").toString()
.contains("href=")
&& !mapobject.get("description").toString()
.contains("All rights reserved")) {
return true;
} else if (mapobject.containsKey("metatag.description")
&& mapobject.get("metatag.description").toString().length() > 75
&& !mapobject.get("metatag.description").toString()
.contains(";}")
&& !mapobject.get("metatag.description").toString()
.contains("<cite>")) {
mapobject.put("description",
mapobject.get("metatag.description"));
return true;
}
return false;
}
/**
* Refine the meta tags: keep only English keywords of at most a few words each;
* if none are present, build keywords from the HTML title, and if still none
* are found, derive them from the URL, excluding numbers.
*
* @param mapobject
*/
private void refineMetaTags(final Map<String, Object> mapobject) {
String metaTag = "";
int tagFlag = 0;
if (mapobject.containsKey("metatag.keywords")) {
final String metaTags[] = mapobject.get("metatag.keywords")
.toString().replaceAll("\\|", ",").split(",");
String domain = null;
StringBuilder temp = null;
for (final String metaTag2 : metaTags) {
if (mapobject.containsKey("host")) {
domain = mapobject.get("host") + "";
if (domain.split("\\.").length > 1
&& (metaTag2
.contains(domain.split("\\.")[domain
.split("\\.").length - 2]) || metaTag2
.contains(domain.split("\\.")[0])))
{
continue;
}
}
String[] arr = metaTag2.split(" ");
arr = removeUnicodeWords(arr);
if (arr.length > 0 && arr.length < 5) {
temp = new StringBuilder();
for (final String str : arr) {
temp.append(str);
temp.append(" ");
}
if (metaTag.length() + temp.length() < 70) {
metaTag = metaTag + "," + temp.toString();
}
}
}
if (metaTag.startsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(1, metaTag.length());
}
}
if (metaTag.length() < 1 && mapobject.containsKey("title")) {
/**
* Extracting tags from the title tag if the length of the
* keyword is greater than 4
*/
final String title = (String) mapobject.get("title");
final String splitTitle[] = title.split(" ");
int count = 0;
for (int i = 0; i < splitTitle.length; i++) {
if (splitTitle[i].length() > 4
&& !splitTitle[i].matches("^[\\u0900-\\u097F].*")) {
metaTag = metaTag + splitTitle[i] + ",";
count++;
if (count == 5) {
break;
}
}
}
if (metaTag.split(",").length > 3) {
if (metaTag.endsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(0, metaTag.length() - 1);
}
} else {
metaTag = "";
}
}
if (metaTag.length() < 1) {
/**
* Extracting the tags from the url if the length of the keyword
* is greater than 4
*/
final String splitUrl[] = mapobject.get("url").toString()
.split("/");
final String lastSplitValue = splitUrl[splitUrl.length - 1];
final String tagList[] = generateTokens(lastSplitValue);
if (tagList != null) {
int count = 0;
for (int i = 0; i < tagList.length; i++) {
if (tagList[i].length() > 4
&& !tagList[i].matches("^[\\u0900-\\u097F].*")) {
metaTag = metaTag + tagList[i] + ",";
count++;
if (count == 5) {
break;
}
}
}
}
if (metaTag.endsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(0, metaTag.length() - 1);
}
}
if (metaTag.length() > 0) {
metaTag = metaTag.replaceAll("\\[", "");
metaTag = metaTag.replaceAll("\"", "");
metaTag = metaTag.replaceAll(";", "");
metaTag = metaTag.replaceAll(":", "");
metaTag = metaTag.replaceAll("\u0027", "");
metaTag = metaTag.replaceAll("\u003d", "");
metaTag = metaTag.replaceAll("\u0026", "");
tagFlag = 1;
}
mapobject.put("TagFlag", tagFlag);
mapobject.put("metatag.keywords", metaTag);
}
/**
* Remove unicode character
*
* @param arr
* @return
*/
private String[] removeUnicodeWords(final String[] arr) {
final List<String> returnArr = new ArrayList<String>();
for (final String str : arr) {
if (str != null && str.trim().length() > 3
&& !str.matches("^[\\u0900-\\u097F].*")
&& !(str.matches("^[0-9].*"))) {
returnArr.add(str.trim());
}
}
final String[] retrnArr = new String[returnArr.size()];
returnArr.toArray(retrnArr);
return retrnArr;
}
/**
* Generate Token list with the help of the lucene analyzer
*
* @param lastSplitValue
* @return the list of keywords, or null if fewer than four tokens were found
*/
private String[] generateTokens(String lastSplitValue) {
final List<String> list = new ArrayList<String>();
lastSplitValue = lastSplitValue.replace("\\.", " ").replace("%20",
" ");
try {
final Version matchVersion = Version.LUCENE_45;
final Analyzer analyzer = new HindiAnalyzer(matchVersion);
final TokenStream ts = analyzer.tokenStream("field",
new StringReader(lastSplitValue));
ts.reset();
while (ts.incrementToken()) {
final CharTermAttribute cta = ts
.getAttribute(CharTermAttribute.class);
if (cta.toString().length() > 4
&& !cta.toString().matches("^[0-9].*")) {
list.add(cta.toString());
}
}
ts.end();
ts.close();
analyzer.close();
} catch (final Exception e) {
e.printStackTrace();
}
if (list.size() > 3) {
return list.toArray(new String[list.size()]);
} else {
return null;
}
}
/**
* Detects the language of the title and stores it in the map
*
* @param mapobject
* @return {@link Boolean}
*/
private boolean setLang(final Map<String, Object> mapobject) {
final String title = mapobject.get("title").toString();
final String description = mapobject.get("title").toString();
String language = "";
try {
language = mapper.detect(title);
mapper.detect(description);
} catch (final LangDetectException e) {
System.out.println("\n title with error is - " + title);
System.out.println("\n description with error is - "
+ description);
e.printStackTrace();
/*
* String title = mapobject.get("title").toString(); language =
* mapobject.get("lang") + ""; language = language.trim(); if
* (language.trim().equalsIgnoreCase("hi") ||
* language.trim().startsWith("en") ||
* language.trim().equalsIgnoreCase("lt")) { String[] titleArr =
* title.trim().split(" "); int i = 0; for (String titlePart :
* titleArr) { if
* (titlePart.trim().matches("^[\\u0900-\\u097F].*")) { i++; } }
* if (i >= titleArr.length * 0.5) { mapobject.put("lang",
* "hi"); } else { mapobject.put("lang", "lt"); } return true; }
*/
return false;
}
if (language.trim().equalsIgnoreCase("hi")
|| language.trim().startsWith("en")
|| language.trim().equalsIgnoreCase("lt")) {
mapobject.put("lang", language);
return true;
}
return false;
}
private String detect(final String text) throws LangDetectException {
final Detector detector = DetectorFactory.create();
detector.append(text);
return detector.detect();
}
/**
* Checks whether to include the doc based on its title; picks the best title
* from the anchor-tag titles (preferring Hindi) and also extracts the image
* from the anchor-tag href attribute
*
* @param mapobject
* of the key value pair
* @return {@link Boolean}
*/
private boolean checkTitleImage(final Map<String, Object> mapobject) {
final TreeSet<String> set = new TreeSet<String>(new SetSort());
final Gson gson = new Gson();
JsonArray array = null;
JsonObject object2 = null;
if (mapobject.containsKey("anchor")
&& mapobject.get("anchor") != null) {
final String arr = (String) mapobject.get("anchor");
try {
array = gson.fromJson(arr, JsonArray.class);
for (final JsonElement jsonElement : array) {
try {
object2 = gson.fromJson(jsonElement.getAsString(),
JsonObject.class);
} catch (final Exception e) {
if (object2 == null) {
object2 = new JsonObject();
object2.addProperty("title",
jsonElement.getAsString());
object2.addProperty("href", "");
object2.addProperty("alt", "");
}
}
if (object2 != null) {
assignTitleImage(mapobject, set, object2);
}
object2 = null;
}
} catch (final ClassCastException e) {
object2 = gson.fromJson(arr, JsonObject.class);
assignTitleImage(mapobject, set, object2);
} catch (final Exception e) {
e.printStackTrace();
}
if (!set.isEmpty()) {
int loop = 0;
final List<String> tempList = new LinkedList<String>();
for (final String string : set) {
final String title = string;
tempList.add(title.trim());
loop++;
if (loop == 2) {
break;
}
}
if (!tempList.isEmpty()) {
if (tempList.get(0).matches("^[\\u0900-\\u097F].*")) {
mapobject.put("title", tempList.get(0));
} else if (tempList.size() > 1
&& !(tempList.get(0)
.matches("^[\\u0900-\\u097F].*"))
&& tempList.get(1).matches(
"^[\\u0900-\\u097F].*")) {
mapobject.put("title", tempList.get(1));
} else {
mapobject.put("title", tempList.get(0));
}
}
}
}
if (mapobject.containsKey("title")
&& mapobject.get("title").toString().length() > 0
&& mapobject.get("title").toString().split(" ").length > 2
&& mapobject.get("title").toString().split(" ").length < 20
&& !mapobject.get("title").toString().contains("<")) {
if (set.isEmpty()) {
mapobject.put("title",
getTitleRefined(mapobject.get("title") + ""));
}
return true;
}
return false;
}
/**
* @param mapobject
* @param set
* @param object2
*/
private void assignTitleImage(final Map<String, Object> mapobject,
final TreeSet<String> set, final JsonObject object2) {
if (!mapobject.containsKey("ImgH1")
&& !mapobject.containsKey("ImgH2")) {
if (object2.get("href") != null
&& object2.get("href").getAsString().length() > 0
&& (object2.get("href").getAsString().toLowerCase()
.contains(".jpg")
|| object2.get("href").getAsString()
.toLowerCase().contains(".jpeg") || object2
.get("href").getAsString().toLowerCase()
.contains(".gif"))) {
putImages(mapobject, object2.get("href").getAsString()
.trim(), mapobject.get("tags").toString().trim()
.toLowerCase());
}
}
if (object2.get("title") != null
&& object2.get("title").getAsString().length() > 0
&& object2.get("title").getAsString().split(" ").length > 2
&& object2.get("title").getAsString().split(" ").length < 20
&& !object2.get("title").getAsString().contains("<")) {
final String newTitle = getTitleRefined(object2.get("title")
.getAsString());
set.add(newTitle.trim());
}
}
/**
* Refines the title based on specific bad keywords found during observation
*
* @param title
* @return refined title
*/
private String getTitleRefined(String title) {
title = title.replaceAll("\u0027", "");
title = title.replaceAll("\u0026", "");
title = title.replaceAll("\u003d", "");
if (title.contains("-")) {
if (title.trim().split("-").length > 1
&& !title.trim().split("-")[1].trim().matches(
"^[\\u0900-\\u097F].*")) {
return title.trim().split("-")[0].trim();
}
} else if (title.contains(":")) {
if (!title.trim().split(":")[0].trim().matches(
"^[\\u0900-\\u097F].*")
&& title.trim().split(":").length > 1) {
return title.trim().split(":")[1].trim();
}
}
return title;
}
/**
* Creates the path for the images
*
* @param map2
* of the key value pair
* @param imageUrl
* @param category
*/
private void putImages(final Map<String, Object> map2,
final String imageUrl, final String category) {
try {
map2.put("ImgSrc", StringEscapeUtils.unescapeHtml(imageUrl)
.trim());
if (map2.containsKey("ImgSrc") && map2.get("ImgSrc") != null
&& map2.get("ImgSrc").toString().length() > 0) {
map2.put(
"ImgSrc",
StringEscapeUtils.unescapeHtml(map2.get("ImgSrc")
.toString())
+ "##RAFTAAR##"
+ imageUrl.trim());
} else {
return;
}
String imgNamearr[] = null;
try {
imgNamearr = imageUrl.split("/");
} catch (final Exception e) {
e.printStackTrace();
}
String imgName = null;
try {
imgName = imgNamearr[imgNamearr.length - 1];
} catch (final Exception e) {
e.printStackTrace();
}
final String imagePath = "/"
+ String.valueOf(imgName.charAt(0));
imgName = imgName.replaceAll(" ", "_").replaceAll("%20", "_");
if (imgName.split(".jpg").length > 0) {
imgName = imgName.split(".jpg")[0];
imgName = imgName + ".jpg";
}
map2.put("ImgH1", "h1/" + category + imagePath + "/" + imgName);
map2.put("ImgH2", "h2/" + category + imagePath + "/" + imgName);
} catch (final Exception e) {
e.printStackTrace();
}
}
/**
* Inserts the data to the elasticsearch
*
* @param mapobject
* @param key
* unique id generally it is the unique url
*/
public static void insertToElastic(final Map<String, Object> mapobject,
final String key) {
final Settings settings = ImmutableSettings.settingsBuilder()
.put("cluster.name", CLUSTER).build();/*
* change ccluster.name
* to cluster
*/
final Client client = new TransportClient(settings)
.addTransportAddress(new InetSocketTransportAddress(
SEARCH_HOST, Integer.parseInt(SEARCH_PORT)));
client.prepareIndex(SEARCH_INDEX_NAME, SEARCHtYPE, key)
.setSource(mapobject).execute().actionGet();
client.close();
}
/**
* Assign the city to the news without city
*
* @param mapobject
*/
private static void assignCity(final Map<String, Object> mapobject) {
String category = mapobject.get("tags").toString();
if (category.endsWith("/")) {
boolean flag = true;
final String catArr[] = category.split("/");
if (catArr.length == 2) {
final String state = catArr[1];
CITY_MAP = STATE_MAP.get(state);
for (final Entry<String, String> e : CITY_MAP.entrySet()) {
final String description = mapobject.get("description")
.toString();
if (description.contains(e.getValue())) {
category = category + e.getKey();
mapobject.put("tags", category);
flag = false;
break;
}
}
}
if (flag) {
mapobject.put("tags", "national");
}
}
}
}
/**
* Update the data to hbase
*
* @param tableName
* @param rowKey
* @param family
* @param qualifier
* @param value
* @param conf
*/
public static void updateIntoHbase(final String tableName,
final String rowKey, final String family, final String qualifier,
final String value, final Configuration conf) {
HTable table = null;
try {
table = new HTable(conf, tableName);
} catch (final IOException e) {
e.printStackTrace();
}
final Put put = new Put(Bytes.toBytes(rowKey));
put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier),
Bytes.toBytes(value));
try {
table.put(put);
table.close();
} catch (final IOException e) {
e.printStackTrace();
}
}
/**
* Return the map of all states and cities
*
* @param fileName
* @return
*/
private static Map<String, Map<String, String>> returnMap(
final String fileName) {
final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
BufferedReader br = null;
try {
br = new BufferedReader(new FileReader(fileName));
String line;
while ((line = br.readLine()) != null) {
final String arr[] = line.split("\t", 3);
if (arr.length == 3) {
if (map.containsKey(arr[0])) {
final Map<String, String> m = map.get(arr[0]);
m.put(arr[1], arr[2]);
} else {
final Map<String, String> m = new HashMap<String, String>();
m.put(arr[1], arr[2]);
map.put(arr[0], m);
}
}
}
} catch (final FileNotFoundException e) {
e.printStackTrace();
} catch (final IOException e) {
e.printStackTrace();
} finally {
if (br != null) {
try {
br.close();
} catch (final Exception e) {
e.printStackTrace();
}
}
}
return map;
}
public static void main(final String[] args) throws Exception {
int c = 0;
c = ToolRunner.run(new Configuration(), new HbaseToElastic(), args);
System.exit(c);
}
}
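To check whether documents are actually reaching the index, a minimal sketch of a search against the same Elasticsearch 1.x TransportClient API used by insertToElastic above could look like this; the cluster name, host, port, index and type values are placeholders that should mirror the job configuration:
Settings settings = ImmutableSettings.settingsBuilder()
        .put("cluster.name", "my-cluster").build();
Client client = new TransportClient(settings)
        .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));
SearchResponse response = client.prepareSearch("my_index")
        .setTypes("my_type")
        .setQuery(QueryBuilders.matchAllQuery())
        .setSize(10)
        .execute().actionGet();
for (SearchHit hit : response.getHits().getHits()) {
    System.out.println(hit.getId() + " -> " + hit.getSourceAsString());
}
client.close();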
I'm working with a Cassandra cluster with 4 nodes (on Amazon). I have a keyspace test_ks_r2 with replication factor = 2; I write and update with consistency level = ALL and read with consistency level = ONE. The code is using the DataStax Java driver (version 2.1.5):
// schema of user column family
CREATE TABLE user (
id text PRIMARY KEY,
password text,
username text
);
// ResultSet class
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.HashMap;
import java.util.Map;
public class ResultSet {
private List<String> idList = new ArrayList<String>();
private Map<String, List<String>> data = new HashMap<String, List<String>>();
public void setIdList(List<String> v) {idList = v;}
public List<String> getIdList() { return idList;}
public void setData(Map<String, List<String>> v) {data = v;}
public Map<String, List<String>> getData() {return data;}
}
// test cassandra class
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.HashMap;
import java.util.Map;
import com.datastax.driver.core.*;
import com.datastax.driver.core.Cluster.Builder;
import com.datastax.driver.core.querybuilder.QueryBuilder;
class TestCassandra {
private Session session; // assumed to be created elsewhere when connecting to the cluster
private ConsistencyLevel getConsistencyLevel(String consistency) {
System.out.println("consistency level string: " + consistency);
if(consistency.equals("one")) {
System.out.println("using consistency level: ONE");
return ConsistencyLevel.ONE;
}
else if(consistency.equals("two")) {
System.out.println("using consistency level: TWO");
return ConsistencyLevel.TWO;
}
else if(consistency.equals("quorum")) {
System.out.println("using consistency level: QUORUM");
return ConsistencyLevel.QUORUM;
}
else if(consistency.equals("all")) {
System.out.println("using consistency level: ALL");
return ConsistencyLevel.ALL;
}
else {
System.out.println("using default consistency level: ONE");
return ConsistencyLevel.ONE;
}
}
public void testWriteUpdateRead(int n, String mode, int delay) {
if (mode.equals("QueryBuilder")) {
System.out.println("test write & read with QueryBuilder ...");
} else if (mode.equals("PreparedStatement")) {
System.out.println("test write & read with PreparedStatement ...");
} else {
System.exit(-1);
}
ResultSet resultSet = write(n, mode, getConsistencyLevel("all"));
read(resultSet.getIdList(), resultSet.getData(), getConsistencyLevel("one"));
// update data
update(resultSet, mode, getConsistencyLevel("all"), delay);
// read again
read(resultSet.getIdList(), resultSet.getData(), getConsistencyLevel("one"));
}
public ResultSet update(ResultSet resultSet, String mode, ConsistencyLevel consistencyLevel, int delay) {
PreparedStatement ps = session.prepare("UPDATE user SET password = ? WHERE id = ?")
.setConsistencyLevel(consistencyLevel);
List<String> idList = resultSet.getIdList();
Map<String, List<String>> data = resultSet.getData();
for (String idRead: idList) {
try {
Thread.sleep(delay);
}
catch (Exception e) {
System.out.println(e);
}
String newPassword = "new password" + idRead;
if (mode.equals("QueryBuilder")) {
session.execute(
QueryBuilder.update("user")
.where(QueryBuilder.eq("id", idRead))
.with(QueryBuilder.set("password", newPassword))
.setConsistencyLevel(consistencyLevel)
);
} else {
com.datastax.driver.core.ResultSet result = session.execute(ps.bind(newPassword, idRead)); // fully qualified to avoid clashing with the custom ResultSet class above
System.out.println(result);
}
List<String> itemValues = new ArrayList<String>();
itemValues.add("empty");
itemValues.add(newPassword);
data.put(idRead, itemValues);
}
System.out.println("Update " + Objects.toString(idList.size(), null) + " times");
resultSet.setData(data);
return resultSet;
}
public ResultSet write(int n, String mode, ConsistencyLevel consistencyLevel) {
PreparedStatement ps = session
.prepare("INSERT INTO user (id, username, password) VALUES (?, ?, ?)")
.setConsistencyLevel(consistencyLevel);
String idWrite = "";
List<String> idList = new ArrayList<String>();
Map<String, List<String>> data = new HashMap<String, List<String>>();
for (int i=0; i<n;i++) {
idWrite = Objects.toString(System.currentTimeMillis(),null);
String username = "username" + idWrite;
String password = "password" + idWrite;
List<String> itemValues = new ArrayList<String>();
itemValues.add(username);
itemValues.add(password);
if (mode.equals("QueryBuilder")) {
session.execute(
QueryBuilder.insertInto("user")
.value("id", idWrite)
.value("username", username)
.value("password", password)
.setConsistencyLevel(consistencyLevel)
);
} else {
session.execute(ps.bind(idWrite, username, password));
}
idList.add(idWrite);
data.put(idWrite, itemValues);
}
ResultSet resultSet = new ResultSet();
resultSet.setIdList(idList);
resultSet.setData(data);
System.out.println("Write " + Objects.toString(n, null) + " times");
return resultSet;
}
public void read(List<String> idList, Map<String, List<String>> data, ConsistencyLevel consistencyLevel) {
int readCount;
int success = 0;
// prepare the statement once; re-preparing it on every iteration is unnecessary
PreparedStatement stmt = session.prepare("SELECT * FROM user WHERE id = ?").setConsistencyLevel(consistencyLevel);
for (String idRead: idList) {
List<String> itemValues = data.get(idRead);
try {
readCount = 0;
Row resultRow = null;
for (Row row : session.execute(stmt.bind(idRead))) {
readCount++;
resultRow = row;
}
// there should be only 1 read, in such case, this is a successful read of a recent write
if (readCount==1) {
if(itemValues.get(1).equals(resultRow.getString("password"))) {
success++;
} else {
System.out.println("password: " + itemValues.get(1));
System.out.println("password in database: " + resultRow.getString("password"));
}
}
} catch (Exception e) {
success = success + 0; // do not increase success since there is an exception
}
}
System.out.println("Successfully read back " + Objects.toString(success, null) + " times");
}
}
Here is my result
Write 40 times
consistency level string: one
using consistency level: ONE
Successfully read back 40 times
consistency level string: all
using consistency level: ALL
Update 40 times
consistency level string: one
using consistency level: ONE
Successfully read back 32 times
Updates appear to fail about ~30% of the time. What am I doing wrong?
Can OrmLite recognize differences between my POCO and my schema and automatically add (or remove) columns as necessary to force the schema to remain in sync with my POCO?
If this ability doesn't exist, is there a way for me to query the db for the table schema so that I may manually perform the syncing? I found this, but I'm using the version of OrmLite that installs with ServiceStack, and for the life of me, I cannot find a namespace that has the TableInfo classes.
I created an extension method to automatically add missing columns to my tables. Been working great so far. Caveat: the code for getting the column names is SQL Server specific.
namespace System.Data
{
public static class IDbConnectionExtensions
{
private static List<string> GetColumnNames(IDbConnection db, string tableName)
{
var columns = new List<string>();
using (var cmd = db.CreateCommand())
{
cmd.CommandText = "exec sp_columns " + tableName;
var reader = cmd.ExecuteReader();
while (reader.Read())
{
var ordinal = reader.GetOrdinal("COLUMN_NAME");
columns.Add(reader.GetString(ordinal));
}
reader.Close();
}
return columns;
}
public static void AlterTable<T>(this IDbConnection db) where T : new()
{
var model = ModelDefinition<T>.Definition;
// just create the table if it doesn't already exist
if (db.TableExists(model.ModelName) == false)
{
db.CreateTable<T>(overwrite: false);
return;
}
// find each of the missing fields
var columns = GetColumnNames(db, model.ModelName);
var missing = ModelDefinition<T>.Definition.FieldDefinitions
.Where(field => columns.Contains(field.FieldName) == false)
.ToList();
// add a new column for each missing field
foreach (var field in missing)
{
var alterSql = string.Format("ALTER TABLE {0} ADD {1} {2}",
model.ModelName,
field.FieldName,
db.GetDialectProvider().GetColumnTypeDefinition(field.FieldType)
);
Console.WriteLine(alterSql);
db.ExecuteSql(alterSql);
}
}
}
}
No, there is currently no support for auto-migration of RDBMS schemas vs. POCOs in ServiceStack's OrmLite.
There are currently a few threads being discussed in OrmLite's issues that are exploring the different ways to add this.
Here is a slightly modified version of the code from cornelha to work with PostgreSQL. I removed this fragment:
//private static List<string> GetColumnNames(object poco)
//{
// var list = new List<string>();
// foreach (var prop in poco.GetType().GetProperties())
// {
// list.Add(prop.Name);
// }
// return list;
//}
and used the IOrmLiteDialectProvider.NamingStrategy.GetTableName and IOrmLiteDialectProvider.NamingStrategy.GetColumnName methods to convert table and column names from PascalCase to the this_kind_of_notation used by OrmLite when creating tables in PostgreSQL.
public static class IDbConnectionExtensions
{
private static List<string> GetColumnNames(IDbConnection db, string tableName, IOrmLiteDialectProvider provider)
{
var columns = new List<string>();
using (var cmd = db.CreateCommand())
{
cmd.CommandText = getCommandText(tableName, provider);
var tbl = new DataTable();
tbl.Load(cmd.ExecuteReader());
for (int i = 0; i < tbl.Columns.Count; i++)
{
columns.Add(tbl.Columns[i].ColumnName);
}
}
return columns;
}
private static string getCommandText(string tableName, IOrmLiteDialectProvider provider)
{
if (provider == PostgreSqlDialect.Provider)
return string.Format("select * from {0} limit 1", tableName);
else return string.Format("select top 1 * from {0}", tableName);
}
public static void AlterTable<T>(this IDbConnection db, IOrmLiteDialectProvider provider) where T : new()
{
var model = ModelDefinition<T>.Definition;
var table = new T();
var namingStrategy = provider.NamingStrategy;
// just create the table if it doesn't already exist
var tableName = namingStrategy.GetTableName(model.ModelName);
if (db.TableExists(tableName) == false)
{
db.CreateTable<T>(overwrite: false);
return;
}
// find each of the missing fields
var columns = GetColumnNames(db, model.ModelName, provider);
var missing = ModelDefinition<T>.Definition.FieldDefinitions
.Where(field => columns.Contains(namingStrategy.GetColumnName(field.FieldName)) == false)
.ToList();
// add a new column for each missing field
foreach (var field in missing)
{
var columnName = namingStrategy.GetColumnName(field.FieldName);
var alterSql = string.Format("ALTER TABLE {0} ADD COLUMN {1} {2}",
tableName,
columnName,
db.GetDialectProvider().GetColumnTypeDefinition(field.FieldType)
);
Console.WriteLine(alterSql);
db.ExecuteSql(alterSql);
}
}
}
I implemented an UpdateTable function. The basic idea is:
Rename current table on database.
Let OrmLite create the new schema.
Copy the relevant data from the old table to the new.
Drop the old table.
Github Repo: https://github.com/peheje/Extending-NServiceKit.OrmLite
Condensed code:
public interface ISqlProvider
{
string RenameTableSql(string currentName, string newName);
string GetColumnNamesSql(string tableName);
string InsertIntoSql(string intoTableName, string fromTableName, string commaSeparatedColumns);
string DropTableSql(string tableName);
}
public static class DbUpdate
{
public static void UpdateTable<T>(IDbConnection connection, ISqlProvider sqlProvider) where T : new()
{
connection.CreateTableIfNotExists<T>();
var model = ModelDefinition<T>.Definition;
string tableName = model.Name;
string tableNameTmp = tableName + "Tmp";
string renameTableSql = sqlProvider.RenameTableSql(tableName, tableNameTmp);
connection.ExecuteNonQuery(renameTableSql);
connection.CreateTable<T>();
string getModelColumnsSql = sqlProvider.GetColumnNamesSql(tableName);
var modelColumns = connection.SqlList<string>(getModelColumnsSql);
string getDbColumnsSql = sqlProvider.GetColumnNamesSql(tableNameTmp);
var dbColumns = connection.SqlList<string>(getDbColumnsSql);
List<string> activeFields = dbColumns.Where(dbColumn => modelColumns.Contains(dbColumn)).ToList();
string activeFieldsCommaSep = ListToCommaSeparatedString(activeFields);
string insertIntoSql = sqlProvider.InsertIntoSql(tableName, tableNameTmp, activeFieldsCommaSep);
connection.ExecuteSql(insertIntoSql);
string dropTableSql = sqlProvider.DropTableSql(tableNameTmp);
//connection.ExecuteSql(dropTableSql); //maybe you want to clean up yourself, else uncomment
}
private static String ListToCommaSeparatedString(List<String> source)
{
var sb = new StringBuilder();
for (int i = 0; i < source.Count; i++)
{
sb.Append(source[i]);
if (i < source.Count - 1)
{
sb.Append(", ");
}
}
return sb.ToString();
}
}
MySql implementation:
public class MySqlProvider : ISqlProvider
{
public string RenameTableSql(string currentName, string newName)
{
return "RENAME TABLE `" + currentName + "` TO `" + newName + "`;";
}
public string GetColumnNamesSql(string tableName)
{
return "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '" + tableName + "';";
}
public string InsertIntoSql(string intoTableName, string fromTableName, string commaSeparatedColumns)
{
return "INSERT INTO `" + intoTableName + "` (" + commaSeparatedColumns + ") SELECT " + commaSeparatedColumns + " FROM `" + fromTableName + "`;";
}
public string DropTableSql(string tableName)
{
return "DROP TABLE `" + tableName + "`;";
}
}
Usage:
using (var db = dbFactory.OpenDbConnection())
{
DbUpdate.UpdateTable<SimpleData>(db, new MySqlProvider());
}
Haven't tested with FKs. Can't handle renaming properties.
I needed to implement something similar and found the post by Scott very helpful. I decided to make a small change which makes it much more agnostic. Since I only use Sqlite and MSSQL, I made the getCommandText method very simple, but it can be extended. I used a simple DataTable to get the columns. This solution works perfectly for my requirements.
public static class IDbConnectionExtensions
{
private static List<string> GetColumnNames(IDbConnection db, string tableName,IOrmLiteDialectProvider provider)
{
var columns = new List<string>();
using (var cmd = db.CreateCommand())
{
cmd.CommandText = getCommandText(tableName, provider);
var tbl = new DataTable();
tbl.Load(cmd.ExecuteReader());
for (int i = 0; i < tbl.Columns.Count; i++)
{
columns.Add(tbl.Columns[i].ColumnName);
}
}
return columns;
}
private static string getCommandText(string tableName, IOrmLiteDialectProvider provider)
{
if(provider == SqliteDialect.Provider)
return string.Format("select * from {0} limit 1", tableName);
else return string.Format("select top 1 * from {0}", tableName);
}
private static List<string> GetColumnNames(object poco)
{
var list = new List<string>();
foreach (var prop in poco.GetType().GetProperties())
{
list.Add(prop.Name);
}
return list;
}
public static void AlterTable<T>(this IDbConnection db, IOrmLiteDialectProvider provider) where T : new()
{
var model = ModelDefinition<T>.Definition;
var table = new T();
// just create the table if it doesn't already exist
if (db.TableExists(model.ModelName) == false)
{
db.CreateTable<T>(overwrite: false);
return;
}
// find each of the missing fields
var columns = GetColumnNames(db, model.ModelName,provider);
var missing = ModelDefinition<T>.Definition.FieldDefinitions
.Where(field => columns.Contains(field.FieldName) == false)
.ToList();
// add a new column for each missing field
foreach (var field in missing)
{
var alterSql = string.Format("ALTER TABLE {0} ADD {1} {2}",
model.ModelName,
field.FieldName,
db.GetDialectProvider().GetColumnTypeDefinition(field.FieldType)
);
Console.WriteLine(alterSql);
db.ExecuteSql(alterSql);
}
}
}
So I took user44's answer and modified the AlterTable method to make it a bit more efficient.
Instead of looping and running one SQL query per field/column, I merge them into a single statement with some simple text parsing (MySQL commands!).
public static void AlterTable<T>(this IDbConnection db, IOrmLiteDialectProvider provider) where T : new()
{
var model = ModelDefinition<T>.Definition;
var table = new T();
var namingStrategy = provider.NamingStrategy;
// just create the table if it doesn't already exist
var tableName = namingStrategy.GetTableName(model.ModelName);
if (db.TableExists(tableName) == false)
{
db.CreateTable<T>(overwrite: false);
return;
}
// find each of the missing fields
var columns = GetColumnNames(db, model.ModelName, provider);
var missing = ModelDefinition<T>.Definition.FieldDefinitions
.Where(field => columns.Contains(namingStrategy.GetColumnName(field.FieldName)) == false)
.ToList();
string alterSql = "";
string addSql = "";
// add a new column for each missing field
foreach (var field in missing)
{
var alt = db.GetDialectProvider().ToAddColumnStatement(typeof(T), field); // Should be made more efficient, one query for all changes instead of many
int index = alt.IndexOf("ADD ");
alterSql = alt.Substring(0, index);
addSql += alt.Substring(alt.IndexOf("ADD COLUMN")).Replace(";", "") + ", ";
}
if (addSql.Length > 2)
addSql = addSql.Substring(0, addSql.Length - 2);
string fullSql = alterSql + addSql;
Console.WriteLine(fullSql);
db.ExecuteSql(fullSql);
}