Using Apache POI to parse text documents and search for keywords inside them
I have created a simple resume parser that parses each document (.doc, .docx) under a given folder and its sub-folders to find the given keywords. Please read the Javadoc of HWPFDocument.
Here is the code:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.poi.hwpf.HWPFDocument;
public class ResumeParser {
HWPFDocument document;
FileInputStream fileInputStream;
String rootFolderPath;
Map<String, List<String>> displayContent = new HashMap<String, List<String>>();
List<String> keywordList;
public ResumeParser(String rootFolderPath, List<String> keywordList) throws IOException {
this.rootFolderPath = rootFolderPath;
this.keywordList = keywordList;
startParsing();
}
public void startParsing() throws IOException {
createDisplayContent(rootFolderPath);
printContent(displayContent);
}
/**
 * Creates the display content. It parses each file under the given folder
 * and also in its sub-folders.
 *
 * @param rootFolderPath
 *            the root folder path
 */
public void createDisplayContent(String rootFolderPath) {
File fileL = new File(rootFolderPath);
File[] fileArrL = fileL.listFiles();
for (int i = 0; i < fileArrL.length; i++) {
File fileTempL = fileArrL[i];
if (fileTempL.isFile()) {
String name = fileTempL.getName();
// use lastIndexOf so file names containing extra dots still yield the extension
String extension = name.substring(name.lastIndexOf('.') + 1).toUpperCase();
if (extension.equals("DOC") || extension.equals("DOCX")) {
parseDocFile(fileTempL.getAbsolutePath());
}
} else if (fileTempL.isDirectory()) {
createDisplayContent(fileTempL.getAbsolutePath());
}
}
}
public void parseDocFile(String filePath) {
try {
fileInputStream = new FileInputStream(filePath);
document = new HWPFDocument(fileInputStream);
String text = document.getText().toString().toUpperCase();
Iterator<String> iteratorL = keywordList.iterator();
List<String> matchKeywordListL = new ArrayList<String>();
String keywordL = "";
while (iteratorL.hasNext()) {
keywordL = iteratorL.next().toUpperCase();
if (text.contains(keywordL)) {
matchKeywordListL.add(keywordL.toLowerCase());
}
}
displayContent.put(filePath, matchKeywordListL);
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
// close the stream so parsed files are not left open
if (fileInputStream != null) {
try {
fileInputStream.close();
} catch (IOException ignored) {
}
}
}
}
public void printContent(Map<String, List<String>> displayContent) {
Iterator<String> iteratorL = displayContent.keySet().iterator();
String keyL = "";
while (iteratorL.hasNext()) {
keyL = iteratorL.next();
System.out.println("File Name: " + keyL + " Match Keywords: " + displayContent.get(keyL));
}
}
public static void main(String args[]) throws IOException {
List<String> keywordListL = new ArrayList<String>();
keywordListL.add("Java");
keywordListL.add("PHP");
keywordListL.add("Andriod");
keywordListL.add("John");
new ResumeParser("D:\\Doc", keywordListL);
}
}
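Note that HWPFDocument reads only the legacy binary .doc format, so the .docx branch above needs the XWPF classes from POI's poi-ooxml artifact. Here is a minimal sketch of the parseDocxFile method referenced above, to be added to ResumeParser, assuming poi-ooxml is on the classpath (the method name is my own):

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

public void parseDocxFile(String filePath) {
    try (FileInputStream fis = new FileInputStream(filePath)) {
        // XWPFWordExtractor flattens the .docx body to plain text
        XWPFDocument docx = new XWPFDocument(fis);
        String text = new XWPFWordExtractor(docx).getText().toUpperCase();
        List<String> matchKeywordListL = new ArrayList<String>();
        for (String keyword : keywordList) {
            if (text.contains(keyword.toUpperCase())) {
                matchKeywordListL.add(keyword.toLowerCase());
            }
        }
        displayContent.put(filePath, matchKeywordListL);
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}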
I have a multi-node Spark cluster and I submit my Spark program on the node where the master resides.
When the job is submitted to the worker nodes, the HOSTNAME parameter comes back null. Here is the line where the property is read as null.
System.getenv("HOSTNAME") is not being read on the worker nodes.
System.out.println("line 76 System.getenv(HOSTNAME)=" + System.getenv("HOSTNAME"));
AUDIT_USER and AUDIT_PASSWORD are also null when read (they are both in the properties file).
If I submit the job on one node I have no issues with these parameters, but if I submit the job in standalone mode with 6 nodes I get this issue.
I have created the same folder for the properties file on all nodes.
Here is my code. Could you please let me know why System.getenv is returning null and why my properties are null?
package com.fb.cpd.myapp;
import java.io.Serializable;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.Future;
import org.apache.commons.configuration.ConfigurationConverter;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.configuration.reloading.FileChangedReloadingStrategy;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.DefaultDecoder;
import kafka.serializer.StringDecoder;
public class GenericLogic implements Serializable {
/**
*
*/
private static final long serialVersionUID = 1L;
private static final Logger logger = LogManager.getLogger(GenericLogic.class);
private PropertiesConfiguration props;
private Producer<String, String> producer = null;
private Future<RecordMetadata> receipt = null;
private RecordMetadata receiptInfo = null;
private ConnectToRDBMS auditor = null;
private ConnectToRDBMS df = null;
private static String myId = null;
private Map<TopicAndPartition, Long> getOffsets(String topic) throws SQLException {
String appName = "myapp";
String TopicName = topic;
Map<TopicAndPartition, Long> topicMap = new HashMap<>();
System.out.println("line 64 before making connection");
try {
props = new PropertiesConfiguration("/app/lock/conf/empty.properties");
} catch (ConfigurationException e) {
// TODO Auto-generated catch block
System.out.println("Line 70");
e.printStackTrace();
}
try {
System.out.println("line 76 System.getenv(HOSTNAME)=" + System.getenv("HOSTNAME"));
auditor = new ConnectToRDBMS(System.getenv("HOSTNAME"), "lockSparkCollector", null, null, null, null, null,
0, props.getString("AUDIT_USER"), props.getString("AUDIT_PASSWORD"),
props.getString("AUDIT_DB_URL"));
} catch (SQLException e) {
logger.error("ASSERT: run() ERROR CONNECTING TO AUDIT DB " + e.getMessage());
}
System.out.println("line 64 after making connection");
Statement stmt = null;
String query = "select va_application, topic_name, partition_id, from_offset,until_offset from lock_spark_offsets where va_application = "
+ "'" + appName + "'" + " and topic_name= " + "'" + TopicName + "'";
System.out.println("query" + query);
System.out.println("before query exection");
try {
stmt = auditor.dbConnection.createStatement();
System.out.println("line 81");
ResultSet rs = stmt.executeQuery(query);
System.out.println("line 83");
while (rs.next()) {
System.out.println("pass 1 of Resultset");
System.out.println("getOffsets=" + topic.trim() + " " + rs.getInt("partition_id") + " "
+ rs.getString("until_offset") + " " + rs.getString("until_offset"));
Integer partition = rs.getInt("partition_id");
TopicAndPartition tp = new TopicAndPartition(topic.trim(), partition);
System.out.println("102");
topicMap.put(tp, Long.parseLong(rs.getString("until_offset")));
System.out.println("105");
}
System.out.println("after populating topic map");
} catch (SQLException e) {
System.out.println("printing exception");
e.printStackTrace();
} finally {
if (stmt != null) {
System.out.println("closing statement");
stmt.close();
}
}
return topicMap;
}
public void setDefaultProperties() {
FileChangedReloadingStrategy strategy = new FileChangedReloadingStrategy();
strategy.setRefreshDelay(10000);
System.out.println("Line 45");
// supply the properties file.
try {
props = new PropertiesConfiguration("/app/lock/conf/empty.properties");
} catch (ConfigurationException e) {
// TODO Auto-generated catch block
System.out.println("Line 51");
e.printStackTrace();
}
props.setReloadingStrategy(strategy);
System.out.println("Line 56");
// Producer configs
if (!props.containsKey("acks")) {
props.setProperty("acks", "1");
}
if (!props.containsKey("retries")) {
props.setProperty("retries", "1000");
}
if (!props.containsKey("compression.type")) {
props.setProperty("compression.type", "gzip");
}
if (!props.containsKey("request.timeout.ms")) {
props.setProperty("request.timeout.ms", "600000");
}
if (!props.containsKey("batch.size")) {
props.setProperty("batch.size", "32768");
}
if (!props.containsKey("buffer.memory")) {
props.setProperty("buffer.memory", "134217728");
}
if (!props.containsKey("block.on.buffer.full")) {
props.setProperty("block.on.buffer.full", "true");
}
if (!props.containsKey("SHUTDOWN")) {
props.setProperty("SHUTDOWN", "false");
}
if (!props.containsKey("producer.topic")) {
props.setProperty("producer.topic", "mytopic1");
}
Properties producer_props = ConfigurationConverter.getProperties(props);
producer_props.setProperty("bootstrap.servers", props.getString("target.bootstrap.servers"));
producer_props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
producer_props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); // ????
this.producer = new KafkaProducer<String, String>(producer_props);
System.out.println("Line 107");
}
public void PublishMessages(String st) {
try {
System.out.println("Line 111");
String key = UUID.randomUUID().toString().replace("-", "");
System.out.println("Started Producing...");
receipt = producer.send(new ProducerRecord<String, String>(props.getString("producer.topic"), key, // Key
st));
System.out.println("After Completion of Producing Producing");
} catch (Exception e) {
e.printStackTrace();
System.out.println("Exception in PublishMessages ");
}
}
public void DBConnect() {
try {
auditor = new ConnectToRDBMS(System.getenv("HOSTNAME"), "myapp", props.getString("consumer.topic"), null,
null, null, null, 0, props.getString("AUDIT_USER"), props.getString("AUDIT_PASSWORD"),
props.getString("AUDIT_DB_URL"));
} catch (SQLException e) {
logger.error("ASSERT: run() ERROR CONNECTING TO AUDIT DB " + e.getMessage());
return;
}
}
private void writeToDB(Long startTime, Integer partnId, String fromOffset, String untilOffset, Integer count) {
this.auditor.audit(startTime, partnId, fromOffset, untilOffset, count);
}
/**
 * @param args
 */
public static void main(String[] args) {
String topicNames = "MySourceTopic";
GenericLogic ec = new GenericLogic();
Map<TopicAndPartition, Long> topicMap = null;
try {
topicMap = ec.getOffsets("MySourceTopic");
} catch (SQLException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
boolean clusterMode = false;
Integer batchDuration = Integer.parseInt("30000");
JavaSparkContext sparkConf = new JavaSparkContext("abcd.net:7077", "Kafka-Spark-Integration");
sparkConf.getConf().set("spark.local.ip", "lock-dt-a4d.xyz.com");
sparkConf.getConf().set("spark.eventLog.enabled", "false");
sparkConf.getConf().set("spark.shuffle.blockTransferService", "nio");
JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, new Duration(10000));
Map<String, String> kafkaParams = new HashMap<String, String>();
String pollInterval = "10000";
String zookeeper = "lock-dt-a5d.xyz.com:2181,lock-dt-a6d.xyz.com:2181";
kafkaParams.put("metadata.broker.list", "lock-dt-a5d.xyz.com:9092,lock-dt-a6d.xyz.com:9092");
kafkaParams.put("group.id", "Consumer");
kafkaParams.put("client.id", "Consumer");
kafkaParams.put("zookeeper.connect", zookeeper);
JavaInputDStream<byte[]> directKafkaStream = KafkaUtils.createDirectStream(jsc, String.class, byte[].class,
StringDecoder.class, DefaultDecoder.class, byte[].class, kafkaParams, topicMap,
(Function<MessageAndMetadata<String, byte[]>, byte[]>) MessageAndMetadata::message);
directKafkaStream.foreachRDD(rdd -> {
if (rdd.isEmpty()) {
System.out.println("No events polled in last " + pollInterval + " milli seconds");
return;
}
rdd.foreachPartition(itr -> {
Integer partnId = TaskContext.get().partitionId();
Long systime = System.nanoTime();
Map<String, String> hmap = new HashMap<String, String>();
GenericLogic ec2 = new GenericLogic();
ec2.setDefaultProperties();
ec2.DBConnect();
try {
while (itr.hasNext()) {
byte[] record = itr.next(); // consume the record; without next() this loop never advances
System.out.println("232");
}
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
});
});
jsc.start();
jsc.awaitTermination();
}
}
I started the slaves with start-slaves.sh.
That's the issue. I have to start the workers by supplying the master address.
Can you please let us know the OS of all the nodes, and whether you have ensured that nothing on the master node is exporting HOSTNAME? Answering your question will be easier if you let us know your OS details.
It may not be directly related to your context, but just for information: System.getenv("HOSTNAME") may not provide the hostname on all platforms (for example Ubuntu or Mac).
Better still, why not export HOSTNAME yourself?
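If exporting the variable on every node is not practical, a hedged alternative is to fall back to the JDK's own lookup, which does not depend on the shell environment. A minimal sketch (the helper name is hypothetical):

import java.net.InetAddress;
import java.net.UnknownHostException;

// Prefer the HOSTNAME environment variable, fall back to InetAddress
static String resolveHostname() {
    String hostname = System.getenv("HOSTNAME");
    if (hostname == null) {
        try {
            hostname = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            hostname = "unknown-host"; // placeholder when resolution fails
        }
    }
    return hostname;
}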
Note: I am assuming you have already checked that props is not null or empty?
If not, debug and check whether the properties file is loaded, and if it is loaded, that it is not an empty properties file, i.e. that the properties really were read from the file.
Looking at your problem (not only the environment variable but the properties are also not coming back), there may be something wrong with the properties file or its location on the different machines.
If it is not an exact copy placed on each machine, please also check that the file is valid for Linux (not written and edited on Windows and then dropped onto Linux).
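A quick way to verify this on each worker is to log what was actually found at the path before using it; a minimal diagnostic sketch, using the same path as in the question, that could be dropped near the top of getOffsets():

import java.io.File;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;

File propsFile = new File("/app/lock/conf/empty.properties");
System.out.println("exists=" + propsFile.exists()
        + " readable=" + propsFile.canRead()
        + " length=" + propsFile.length());
try {
    PropertiesConfiguration props = new PropertiesConfiguration(propsFile);
    // isEmpty() is true when the file was found but no properties were parsed
    System.out.println("loaded, isEmpty=" + props.isEmpty()
            + ", AUDIT_USER=" + props.getString("AUDIT_USER"));
} catch (ConfigurationException e) {
    System.out.println("properties file could not be loaded: " + e.getMessage());
}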
So I created a custom appender for Log4j that essentially does the same thing as the SocketAppender, but with UDP as opposed to TCP. However, I cannot get the log to display the date. I think there might be an issue with the .properties file, but I'm not sure what. Especially since I know the .properties file is working to some extent, because that is how the test program knows which appender to use; however, it is not using the layout specified in the .properties file. Any help/suggestions would be greatly appreciated!
Appender code:
import java.io.IOException;
import java.net.DatagramPacket;
import java.net.DatagramSocket;
import java.net.InetAddress;
import java.net.SocketException;
import java.net.UnknownHostException;
import org.apache.log4j.AppenderSkeleton;
import org.apache.log4j.spi.LoggingEvent;
public /*static*/ class UDPAppender extends AppenderSkeleton {
@Override
protected void append(LoggingEvent event) {
// Note: this builds its own message and never calls layout.format(event),
// which is why the ConversionPattern (and its %d date) from the .properties
// file never shows up in the output.
String message = null;
StringBuilder formatedMessage = new StringBuilder();
formatedMessage.append("Level:");
formatedMessage.append(event.getLevel());
formatedMessage.append(", FILE:");
formatedMessage.append(event.getLocationInformation().getFileName());
formatedMessage.append(", Line:");
formatedMessage.append(event.getLocationInformation().getLineNumber());
formatedMessage.append(", ERROR: ");
formatedMessage.append(event.getMessage().toString());
message = formatedMessage.toString();
DatagramSocket clientSocket = null;
//creating DatagramSocket to send logger info to socket; if not created, will print Stack Trace
try {
clientSocket = new DatagramSocket();
} catch (SocketException e) {
e.printStackTrace();
}
InetAddress IPAddress = null;
//get InetAddress for sending packet
try {
IPAddress = InetAddress.getByName("localhost");
} catch (UnknownHostException e) {
e.printStackTrace();
}
byte[] sendData = message.getBytes(); //convert from String to bytes so that message can be sent through datagrampacket
DatagramPacket sendPacket = new DatagramPacket(sendData, sendData.length, IPAddress, 27770); //address packet
try {
clientSocket.send(sendPacket); //send packet to the socket addressed above
} catch (IOException e) {
e.printStackTrace();
}
clientSocket.close(); //need to close
}
public void close() {
}
public boolean requiresLayout() {
return true;
}
}
Test code:
import org.apache.log4j.*;
public class Tester {
private static final Logger log2 = Logger.getLogger("REMOTE");
public static void main(String[] args) {
//configures using the .properties file user creates
PropertyConfigurator.configure("log4j.properties");
log2.info("This is info message");
System.out.println("Logging complete");
//prints to console to let user know logging completed successfully
LogManager.shutdown();
//closes logs once logging is complete
}
}
.properties file:
log4j.logger.REMOTE=DEBUG, REMOTE
log4j.appender.REMOTE=custom.UDPAppender
log4j.appender.REMOTE.layout=org.apache.log4j.PatternLayout
log4j.appender.REMOTE.layout.ConversionPattern=%d{MMMM dd HH:mm:ss}, %d{yyyy} %F %L %m %t
I was actually able to just convert the timestamp to a date/time with the following code (this needs java.util.Date, java.text.DateFormat and java.text.SimpleDateFormat imported):
long date = event.getTimeStamp();
Date dateObj = new Date(date);
DateFormat df = new SimpleDateFormat("MM-dd-yyyy HH:mm:ss");
StringBuilder formatedMessage = new StringBuilder();
formatedMessage.append("Time:");
formatedMessage.append(df.format(dateObj));
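For completeness, the other fix is to use the layout that PropertyConfigurator already attached to the appender: because requiresLayout() returns true, AppenderSkeleton populates the inherited layout field, and formatting the event through it applies the ConversionPattern from the .properties file, %d dates included. A minimal sketch of how append() could start:

@Override
protected void append(LoggingEvent event) {
    // layout is inherited from AppenderSkeleton and set from the configuration;
    // format() applies the ConversionPattern, including the %d date specifiers
    String message = (layout != null)
            ? layout.format(event)
            : event.getRenderedMessage();
    // ... send `message` through the DatagramSocket as before ...
}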
I want to create a custom appender using Log4j in Java programmatically which stores the logs in memory (a certain size will be defined, e.g. DEFAULT_SIZE = 20) and, as soon as the memory is full (there are 20 logs in memory), creates a JSON array of those logs and sends it to the server. Along with the log message I also want information such as class, file name, line number, method name, level, etc.
Help is appreciated!
Update:
Below is the source code I developed for this. The problem I am facing is that I get the line number, method name and file name as "?". I don't understand the reason.
package com.any.logger;
import java.util.ArrayList;
import org.apache.log4j.AppenderSkeleton;
import org.apache.log4j.spi.LoggingEvent;
public class Log4jAppender extends AppenderSkeleton {
private static final int DEFAULT_BUFFER_SIZE = 20;
private ArrayList<LoggingEvent> buffer;
private int bufferSize;
public Log4jAppender() {
this.bufferSize = Log4jAppender.DEFAULT_BUFFER_SIZE;
this.buffer = new ArrayList<LoggingEvent>();
}
public void close() {
}
public boolean requiresLayout() {
return false;
}
@Override
protected void append(LoggingEvent loggingEvent) {
// Capture the location information eagerly, on the logging thread.
// LoggingEvent computes it lazily from the current call stack, so asking
// for it later on the flush thread yields "?" for file, line and method.
loggingEvent.getLocationInformation();
synchronized (buffer) {
buffer.add(loggingEvent);
if (buffer.size() >= bufferSize) {
try {
Log4jService service = new Log4jService(new ArrayList<LoggingEvent>(buffer));
service.start();
} catch (Exception e) {
System.err.println("Failed to write logs");
e.printStackTrace();
} finally {
buffer.clear();
}
}
}
}
}
package com.any.logger;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.log4j.Level;
import org.apache.log4j.spi.LoggingEvent;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
#SuppressWarnings("rawtypes")
public class Log4jService implements Runnable {
private ArrayList<LoggingEvent> buffer;
private Thread t;
private boolean flag;
public Log4jService(ArrayList<LoggingEvent> buffer) {
this.buffer = new ArrayList<LoggingEvent>(buffer);
this.flag = false;
}
public void start() {
if (!flag) {
this.flag = true;
this.t = new Thread(this);
this.t.start();
}
}
public void run() {
if (flag) {
String json = getLogJson(buffer);
}
}
/**
 * Method to get the JSON log.
 *
 * @param logEvents arrayList of log events
 * @return a String which is a JSON document
 */
private String getLogJson(ArrayList<LoggingEvent> logEvents) {
System.out.println("In getLogJson");
JSONObject logger = new JSONObject();
try {
JSONArray logs = new JSONArray();
String message = "";
String time = "";
String level = "";
String tag = "";
String file = "";
String exceptionName = "";
String exceptionReason = "";
String line = "";
String clas = "";
String method = "";
int index = 0;
SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
for (Iterator i = logEvents.iterator(); i.hasNext();) {
LoggingEvent logEvent = (LoggingEvent) i.next();
boolean chk = logEvent.locationInformationExists();
System.out.println("Check :: " + chk);
JSONObject log = new JSONObject();
message = logEvent.getMessage().toString();
time = String.valueOf(formatter.format(logEvent.getTimeStamp()));
level = logEvent.getLevel().toString();
if (chk) {
file = logEvent.getLocationInformation().getFileName();
line = logEvent.getLocationInformation().getLineNumber();
method = logEvent.getLocationInformation().getMethodName();
// getClass() would name LoggingEvent itself; the logging class comes
// from the location information
clas = logEvent.getLocationInformation().getClassName();
}
if (logEvent.getLevel() == Level.ERROR || logEvent.getLevel() == Level.FATAL) {
exceptionReason = message;
}
log.put("message", message);
log.put("time", time);
log.put("level", level);
log.put("tag", tag);
log.put("file", file);
log.put("exceptionName", exceptionName);
log.put("exceptionReason", exceptionReason);
log.put("line", line);
log.put("class", clas);
log.put("method", method);
logs.put(index, log);
index++;
}
JSONObject logsObj = new JSONObject();
logsObj.put("logs", logs);
logger.put("log", logsObj);
System.out.println("Logs Array :: " + logsObj);
System.out.println("Logger Object :: " + logger);
} catch (Exception e) {
e.printStackTrace();
}
return logger.toString();
}
}
I would look at the source code for log4j2 and use an existing appender as a template for my custom appender. SocketAppender and TCPSocketManager may be good places to start.
http://logging.apache.org/log4j/2.0/log4j-core/xref/org/apache/logging/log4j/core/net/TCPSocketManager.html
Currently the buffering is based on the number of bytes, not event count. If you follow the existing appender/manager pattern then in your custom appender manager you will need to have a large enough buffer and flush every 20 events. (Perhaps also flush after some timeout? You may only get 19 events and nothing after that...)
You could also do the buffering in the appender: keep a list of event objects there and only pass them to the manager to be written after some threshold is reached, as sketched below.
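To make the "flush after some timeout" idea concrete, here is an illustrative count-plus-timeout buffer, independent of any particular log4j version (all names here are hypothetical):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class EventBuffer<E> {
    private final List<E> buffer = new ArrayList<E>();
    private static final int THRESHOLD = 20;
    private final ScheduledExecutorService timer =
            Executors.newSingleThreadScheduledExecutor();

    public EventBuffer() {
        // periodic safety flush so a partially filled buffer is not held forever
        timer.scheduleAtFixedRate(this::flush, 30, 30, TimeUnit.SECONDS);
    }

    public synchronized void add(E event) {
        buffer.add(event);
        if (buffer.size() >= THRESHOLD) {
            flush();
        }
    }

    private synchronized void flush() {
        if (buffer.isEmpty()) {
            return;
        }
        List<E> batch = new ArrayList<E>(buffer);
        buffer.clear();
        send(batch);
    }

    private void send(List<E> batch) {
        // hand the batch to whatever actually writes it out
        System.out.println("flushing " + batch.size() + " events");
    }
}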
The default layout for SocketAppender uses Java serialization, so you'll need to roll your own JSON layout. The log event object has all the details you mentioned.
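Since the code in the question targets log4j 1.x, a JSON layout there can be a small Layout subclass; a minimal sketch (the field names are illustrative, not any standard):

import org.apache.log4j.Layout;
import org.apache.log4j.spi.LoggingEvent;
import org.json.JSONException;
import org.json.JSONObject;

public class JsonLayout extends Layout {
    @Override
    public String format(LoggingEvent event) {
        JSONObject json = new JSONObject();
        try {
            json.put("time", event.getTimeStamp());
            json.put("level", event.getLevel().toString());
            json.put("message", event.getRenderedMessage());
            json.put("file", event.getLocationInformation().getFileName());
            json.put("line", event.getLocationInformation().getLineNumber());
            json.put("method", event.getLocationInformation().getMethodName());
            json.put("class", event.getLocationInformation().getClassName());
        } catch (JSONException e) {
            return "{}" + Layout.LINE_SEP;
        }
        return json.toString() + Layout.LINE_SEP;
    }

    @Override
    public boolean ignoresThrowable() {
        return true; // throwable rendering left to the appender in this sketch
    }

    @Override
    public void activateOptions() {
        // no options to activate
    }
}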
I am using the ASM bytecode library to instrument Java classes using premain.
How do we get the name of the method being executed?
Thanks in advance...
In short, there is a name parameter on the corresponding visit*() method call. You'll have to provide a more specific example to get a more detailed answer.
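For example, a minimal sketch with a plain ClassVisitor: the name argument handed to visitMethod is the method name (the class name "java.lang.String" below is just an example input):

import java.io.IOException;
import org.objectweb.asm.ClassReader;
import org.objectweb.asm.ClassVisitor;
import org.objectweb.asm.MethodVisitor;
import org.objectweb.asm.Opcodes;

public class MethodNamePrinter extends ClassVisitor {
    public MethodNamePrinter() {
        super(Opcodes.ASM5);
    }

    @Override
    public MethodVisitor visitMethod(int access, String name, String desc,
            String signature, String[] exceptions) {
        // `name` is exactly the method name; `desc` encodes its signature
        System.out.println("method: " + name + desc);
        return super.visitMethod(access, name, desc, signature, exceptions);
    }

    public static void main(String[] args) throws IOException {
        new ClassReader("java.lang.String").accept(new MethodNamePrinter(), 0);
    }
}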
Attached is my code for your reference:
package com.eg.agent;
import java.lang.instrument.Instrumentation;
public class Agent {
private Agent(Instrumentation instrumentation){
}
public static void premain(String agentArgs, Instrumentation inst)
{
EgClassFileTransformer egClassFileTransformer = new EgClassFileTransformer(agentArgs, inst);
}
}
package com.eg.agent;
import java.lang.instrument.ClassFileTransformer;
import java.lang.instrument.IllegalClassFormatException;
import java.lang.instrument.Instrumentation;
import java.security.ProtectionDomain;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.objectweb.asm.ClassReader;
public class EgClassFileTransformer implements ClassFileTransformer {
protected String agentArgString = "";
protected Instrumentation instrumentation;
public EgClassFileTransformer(String agentArgs, Instrumentation inst){
agentArgString = agentArgs;
instrumentation = inst;
instrumentation.addTransformer(this);
}
public byte[] transform(ClassLoader loader, String className, Class<?> classBeingRedefined,
ProtectionDomain protectionDomain, byte[] classfileBuffer) throws IllegalClassFormatException
{
//System.out.println("ClassName :"+className);
InputStream in = loader.getResourceAsStream(className.replace('.', '/') + ".class");
try{
ClassReader classReader=new ClassReader(in);
String superClassName = classReader.getSuperName();
String[] interfaces = classReader.getInterfaces();
if(interfaces!=null && interfaces.length > 0){
for(int k=0;k<interfaces.length;k++){
String inface = interfaces[k];
System.out.println(" \t interface :"+inface);
}
}
//System.out.println("superClassName :"+superClassName);
ArrayList<String> thisList = new ArrayList<String>();
thisList.add(superClassName);
ArrayList<String> superList = printSuperClassNames(superClassName, thisList);
System.out.println("className :"+className +" ==>"+ " superList :"+superList);
} catch (IOException e) {
//e.printStackTrace();
System.out.println("[EXECEPTION] ..."+className);
}
return null;
}
public static ArrayList<String> printSuperClassNames(String className, ArrayList<String> list)
{
ClassReader cr = null;
try {
cr = new ClassReader(className);
} catch (IOException e) {
e.printStackTrace();
// cannot read the class bytes; return what we have to avoid an NPE below
return list;
}
String superName = cr.getSuperName();
//System.out.println("[superName]"+superName);
if(superName!=null && !superName.equals("java/lang/Object"))
{
list.add(superName);
String superClass = superName.replace('.', '/');
printSuperClassNames(superClass, list);
}
return list;
}
}