Our program displays a tree control showing the metadata structure of the XML file the user is using as a datasource. It displays all elements and attributes in use in the XML file, like this:
Employees
    Employee
        FirstName
        LastName
        Orders
            Order
                OrderId
For the case where the user does not pass us an XSD file, we need to walk the XML file and build up the metadata structure.
The full code for this is at SaxonQuestions.zip, TestBuildTreeWithSchema.java and is also listed below.
The code below works, but it has a problem. Suppose under Employee there's a SpouseName element that is only populated if the employee is married. What if my sample data file contains only unmarried employees? Then the code below has no way to know that a SpouseName element exists.
So my question is: how can I read the schema directly instead of using the code below? If I read the schema, I get every element and attribute, including the optional ones. I also get the type, and the schema optionally has a description for each node, which I get too. Therefore, I need to read the schema itself. How can I do that?
And a secondary question: why is the type for an int BigInteger instead of Integer or Long? I see this with Employee/@EmployeeID in SouthWind.xml & SouthWind.xsd.
TestBuildTreeWithSchema.java
import net.sf.saxon.s9api.*;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
public class TestBuildTreeWithSchema {
public static void main(String[] args) throws Exception {
XmlDatasource datasource = new XmlDatasource(
new FileInputStream(new File("files", "SouthWind.xml").getCanonicalPath()),
new FileInputStream(new File("files", "SouthWind.xsd").getCanonicalPath()));
// get the root element
XdmNode rootNode = null;
for (XdmNode node : datasource.getXmlRootNode().children()) {
if (node.getNodeKind() == XdmNodeKind.ELEMENT) {
rootNode = node;
break;
}
}
TestBuildTreeWithSchema buildTree = new TestBuildTreeWithSchema(rootNode);
Element root = buildTree.addNode();
System.out.println("Schema:");
printElement("", root);
}
private static void printElement(String indent, Element element) {
System.out.println(indent + "<" + element.name + "> (" + (element.type == null ? "null" : element.type.getSimpleName()) + ")");
indent += " ";
for (Attribute attr : element.attributes)
System.out.println(indent + "=" + attr.name + " (" + (attr.type == null ? "null" : attr.type.getSimpleName()) + ")");
for (Element child : element.children)
printElement(indent, child);
}
protected XdmItem currentNode;
public TestBuildTreeWithSchema(XdmItem currentNode) {
this.currentNode = currentNode;
}
private Element addNode() throws SaxonApiException {
String name = ((XdmNode)currentNode).getNodeName().getLocalName();
// Question:
// Is this the best way to determine that this element has data (as opposed to child elements)?
Boolean elementHasData;
try {
((XdmNode) currentNode).getTypedValue();
elementHasData = true;
} catch (Exception ex) {
elementHasData = false;
}
// Questions:
// Is this the best way to get the type of the element value?
// Why BigInteger instead of Long for int?
Class valueClass = ! elementHasData ? null : ((XdmAtomicValue)((XdmNode)currentNode).getTypedValue()).getValue().getClass();
Element element = new Element(name, valueClass, null);
// add in attributes
XdmSequenceIterator currentSequence;
if ((currentSequence = moveTo(Axis.ATTRIBUTE)) != null) {
do {
name = ((XdmNode) currentNode).getNodeName().getLocalName();
// Questions:
// Is this the best way to get the type of the attribute value?
// Why BigInteger instead of Long for int?
valueClass = ((XdmAtomicValue)((XdmNode)currentNode).getTypedValue()).getValue().getClass();
Attribute attr = new Attribute(name, valueClass, null);
element.attributes.add(attr);
} while (moveToNextInCurrentSequence(currentSequence));
moveTo(Axis.PARENT);
}
// add in children elements
if ((currentSequence = moveTo(Axis.CHILD)) != null) {
do {
Element child = addNode();
// if we don't have this, add it
Element existing = element.getChildByName(child.name);
if (existing == null)
element.children.add(child);
else
// add in any children this does not have
existing.addNewItems (child);
} while (moveToNextInCurrentSequence(currentSequence));
moveTo(Axis.PARENT);
}
return element;
}
// moves to element or attribute
private XdmSequenceIterator moveTo(Axis axis) {
XdmSequenceIterator en = ((XdmNode) currentNode).axisIterator(axis);
boolean gotIt = false;
while (en.hasNext()) {
currentNode = en.next();
if (((XdmNode) currentNode).getNodeKind() == XdmNodeKind.ELEMENT || ((XdmNode) currentNode).getNodeKind() == XdmNodeKind.ATTRIBUTE) {
gotIt = true;
break;
}
}
if (gotIt) {
if (axis == Axis.ATTRIBUTE || axis == Axis.CHILD || axis == Axis.NAMESPACE)
return en;
return null;
}
return null;
}
// moves to next element or attribute
private Boolean moveToNextInCurrentSequence(XdmSequenceIterator currentSequence)
{
if (currentSequence == null)
return false;
while (currentSequence.hasNext())
{
currentNode = currentSequence.next();
if (((XdmNode)currentNode).getNodeKind() == XdmNodeKind.ELEMENT || ((XdmNode)currentNode).getNodeKind() == XdmNodeKind.ATTRIBUTE)
return true;
}
return false;
}
static class Node {
String name;
Class type;
String description;
public Node(String name, Class type, String description) {
this.name = name;
this.type = type;
this.description = description;
}
}
static class Element extends Node {
List<Element> children;
List<Attribute> attributes;
public Element(String name, Class type, String description) {
super(name, type, description);
children = new ArrayList<>();
attributes = new ArrayList<>();
}
public Element getChildByName(String name) {
for (Element child : children) {
if (child.name.equals(name))
return child;
}
return null;
}
public void addNewItems(Element child) {
for (Attribute attrAdd : child.attributes) {
boolean haveIt = false;
for (Attribute attrExist : attributes)
if (attrExist.name.equals(attrAdd.name)) {
haveIt = true;
break;
}
if (!haveIt)
attributes.add(attrAdd);
}
for (Element elemAdd : child.children) {
Element exist = null;
for (Element elemExist : children)
if (elemExist.name.equals(elemAdd.name)) {
exist = elemExist;
break;
}
if (exist == null)
children.add(elemAdd);
else
exist.addNewItems(elemAdd);
}
}
}
static class Attribute extends Node {
public Attribute(String name, Class type, String description) {
super(name, type, description);
}
}
}
XmlDatasource.java
import com.saxonica.config.EnterpriseConfiguration;
import com.saxonica.ee.s9api.SchemaValidatorImpl;
import net.sf.saxon.Configuration;
import net.sf.saxon.lib.FeatureKeys;
import net.sf.saxon.s9api.*;
import net.sf.saxon.type.SchemaException;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import javax.xml.transform.Source;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamSource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
public class XmlDatasource {
/** the DOM all searches are against */
private XdmNode xmlRootNode;
private XPathCompiler xPathCompiler;
/** key == the prefix; value == the uri mapped to that prefix */
private HashMap<String, String> prefixToUriMap = new HashMap<>();
/** key == the uri mapped to that prefix; value == the prefix */
private HashMap<String, String> uriToPrefixMap = new HashMap<>();
public XmlDatasource (InputStream xmlData, InputStream schemaFile) throws SAXException, SchemaException, SaxonApiException, IOException {
boolean haveSchema = schemaFile != null;
// call this before any instantiation of Saxon classes.
Configuration config = createEnterpriseConfiguration();
if (haveSchema) {
Source schemaSource = new StreamSource(schemaFile);
config.addSchemaSource(schemaSource);
}
Processor processor = new Processor(config);
DocumentBuilder doc_builder = processor.newDocumentBuilder();
XMLReader reader = createXMLReader();
InputSource xmlSource = new InputSource(xmlData);
SAXSource saxSource = new SAXSource(reader, xmlSource);
if (haveSchema) {
SchemaValidator validator = new SchemaValidatorImpl(processor);
doc_builder.setSchemaValidator(validator);
}
xmlRootNode = doc_builder.build(saxSource);
xPathCompiler = processor.newXPathCompiler();
if (haveSchema)
xPathCompiler.setSchemaAware(true);
declareNameSpaces();
}
public XdmNode getXmlRootNode() {
return xmlRootNode;
}
public XPathCompiler getxPathCompiler() {
return xPathCompiler;
}
/**
* Create an XMLReader set to disallow XXE attacks.
* @return a safe XMLReader.
*/
public static XMLReader createXMLReader() throws SAXException {
XMLReader reader = XMLReaderFactory.createXMLReader();
// stop XXE https://www.owasp.org/index.php/XML_External_Entity_(XXE)_Prevention_Cheat_Sheet#JAXP_DocumentBuilderFactory.2C_SAXParserFactory_and_DOM4J
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
reader.setFeature("http://xml.org/sax/features/external-general-entities", false);
reader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
return reader;
}
private void declareNameSpaces() throws SaxonApiException {
// saxon has some of their functions set up with this.
prefixToUriMap.put("saxon", "http://saxon.sf.net");
uriToPrefixMap.put("http://saxon.sf.net", "saxon");
XdmValue list = xPathCompiler.evaluate("//namespace::*", xmlRootNode);
if (list == null || list.size() == 0)
return;
for (int index=0; index<list.size(); index++) {
XdmNode node = (XdmNode) list.itemAt(index);
String prefix = node.getNodeName() == null ? "" : node.getNodeName().getLocalName();
// xml, xsd, & xsi are XML structure ones, not ones used in the XML
if (prefix.equals("xml") || prefix.equals("xsd") || prefix.equals("xsi"))
continue;
// use default prefix if prefix is empty.
if (prefix == null || prefix.isEmpty())
prefix = "def";
// this returns repeats, so if a repeat, go on to next.
if (prefixToUriMap.containsKey(prefix))
continue;
String uri = node.getStringValue();
if (uri != null && !uri.isEmpty()) {
xPathCompiler.declareNamespace(prefix, uri);
prefixToUriMap.put(prefix, uri);
uriToPrefixMap.put(uri, prefix);
}
}
}
public static EnterpriseConfiguration createEnterpriseConfiguration()
{
EnterpriseConfiguration configuration = new EnterpriseConfiguration();
configuration.supplyLicenseKey(new BufferedReader(new java.io.StringReader(deobfuscate(key))));
configuration.setConfigurationProperty(FeatureKeys.SUPPRESS_XPATH_WARNINGS, Boolean.TRUE);
return configuration;
}
}
Thanks for the clarifications. I think your real goal is to find a way to parse and process an XML Schema in Java without having to treat the XSD as an ordinary XML document (it is an ordinary XML document, but processing it using the standard facilities is not easy).
On that basis, I think this thread should help: In Java, how do I parse an xml schema (xsd) to learn what's valid at a given element?
Personally, I've never found any library that does a better job than the EMF XSD model. It's complex, but comprehensive.
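For illustration, here is a minimal sketch of loading a schema with the EMF XSD model (the schema file name is taken from the question; the calls are from the org.eclipse.xsd API, with error handling omitted):
import org.eclipse.emf.common.util.URI;
import org.eclipse.emf.ecore.resource.ResourceSet;
import org.eclipse.emf.ecore.resource.impl.ResourceSetImpl;
import org.eclipse.xsd.XSDElementDeclaration;
import org.eclipse.xsd.XSDSchema;
import org.eclipse.xsd.util.XSDResourceFactoryImpl;
import org.eclipse.xsd.util.XSDResourceImpl;
public class ReadXsdWithEmf {
    public static void main(String[] args) {
        // register the XSD resource factory for *.xsd files
        ResourceSet resourceSet = new ResourceSetImpl();
        resourceSet.getResourceFactoryRegistry().getExtensionToFactoryMap()
                .put("xsd", new XSDResourceFactoryImpl());
        // load the schema document as an EMF resource
        XSDResourceImpl resource = (XSDResourceImpl) resourceSet.getResource(
                URI.createFileURI("files/SouthWind.xsd"), true);
        XSDSchema schema = resource.getSchema();
        // walk the global element declarations, including optional ones;
        // decl.getAnnotation() would expose xs:documentation where present
        for (XSDElementDeclaration decl : schema.getElementDeclarations()) {
            System.out.println(decl.getName() + " : "
                    + (decl.getTypeDefinition() == null
                            ? "?" : decl.getTypeDefinition().getName()));
        }
    }
}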
I have followed https://gist.github.com/xrstf/b48a970098a8e76943b9 to integrate Nutch and Elasticsearch. Everything is working fine: the data is stored in the HBase 'webpage' table, but I am not able to fetch the data into Elasticsearch. I want to know how to fetch the data into Elasticsearch.
Below is my code:
package com.process;
/*
import package will be here
*/
public class HbaseToElastic extends Configured implements
org.apache.hadoop.util.Tool {
static class Mapper extends TableMapper<Text, IndexWritable> {
public static String CLUSTER;
public static String SEARCH_HOST;
public static String SEARCH_PORT;
public static String SEARCH_INDEX_NAME;
public static String SEARCHtYPE;
public static int BULKSIZE;
public static String TABLENAME;
public static String FAMILY;
private static List<String> SPORTS_KEYWORDS;
private static List<String> BUSINESS_KEYWORDS;
private static List<String> GOSSIP_KEYWORDS;
private static List<String> CRIME_KEYWORDS;
private static Map<String, Map<String, String>> STATE_MAP = new HashMap<String, Map<String, String>>();
private static Map<String, String> CITY_MAP = new HashMap<String, String>();
private static Mapper mapper = new Mapper();
static {
try {
System.out.println("done1");
DetectorFactory.loadProfile("./profiles");
System.out.println("done2");
} catch (final LangDetectException e) {
System.out.println("done3");
e.printStackTrace();
}
}
Configuration hbaseConf = null;
HTable table = null;
List<Put> hbasePutErrorList = new ArrayList<Put>();
/**
* Clean up the hbase table object
*/
@Override
protected void cleanup(final Context context) throws IOException,
InterruptedException {
super.cleanup(context);
table.put(hbasePutErrorList);
table.close();
hbasePutErrorList.clear();
}
/**
* Initialize various variables
*/
@Override
protected void setup(
final org.apache.hadoop.mapreduce.Mapper<ImmutableBytesWritable, Result, Text, IndexWritable>.Context context)
throws IOException, InterruptedException {
final Configuration conf = context.getConfiguration();
CLUSTER = conf.get("cluster");
SEARCH_HOST = conf.get("search_host");
SEARCH_PORT = conf.get("search_port");
SEARCH_INDEX_NAME = conf.get("search_index_name");
SEARCHtYPE = conf.get("search_type");
BULKSIZE = conf.getInt("search_bulk_size", 500);
TABLENAME = conf.get("table_name");
FAMILY = conf.get("family");
hbaseConf = HBaseConfiguration.create();
hbaseConf.set("hbase.zookeeper.quorum",
conf.get("hbase.zookeeper.quorum"));
hbaseConf.set("hbase.zookeeper.property.clientPort",
conf.get("hbase.zookeeper.property.clientPort"));
hbaseConf.set("hbase.rpc.timeout", conf.get("hbase.rpc.timeout"));
hbaseConf.set("hbase.regionserver.lease.period",
conf.get("hbase.regionserver.lease.period"));
hbaseConf.set("hbase.master", conf.get("hbase.master"));
table = new HTable(hbaseConf, conf.get("table_name"));
SPORTS_KEYWORDS = new ArrayList<String>();
BUSINESS_KEYWORDS = new ArrayList<String>();
GOSSIP_KEYWORDS = new ArrayList<String>();
CRIME_KEYWORDS = new ArrayList<String>();
String keywrods = conf.get("sportskeywords");
String[] keyarr = keywrods.split(",");
for (final String key : keyarr) {
SPORTS_KEYWORDS.add(key.trim());
}
keywrods = conf.get("businesskeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
BUSINESS_KEYWORDS.add(key.trim());
}
keywrods = conf.get("gossipkeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
GOSSIP_KEYWORDS.add(key.trim());
}
keywrods = conf.get("crimekeywords");
keyarr = keywrods.split(",");
for (final String key : keyarr) {
CRIME_KEYWORDS.add(key.trim());
}
final String stateMap = conf.get("statemap");
final Gson g = new Gson();
STATE_MAP = g.fromJson(stateMap, Map.class);
}
/**
* map function
*/
@Override
public void map(final ImmutableBytesWritable row, final Result result,
final Context context) throws IOException, InterruptedException {
try {
final byte b = 0;
int deleteFlag = 0;
final String keyString = Bytes.toString(row.get());
final Map<String, Object> mapobject = new HashMap<String, Object>();
for (final KeyValue kv : result.raw()) {
final String key = (new String(kv.getQualifier()));
final String value = (new String(kv.getValue()));
mapobject.put(key, value);
}
final Gson g = new Gson();
if (checkValidType(mapobject)) {
refineMetaTags(mapobject);
if (refineDescription(mapobject)) {
assignCity(mapobject);
if (checkTitleImage(mapobject)) {
if (setLang(mapobject)) {
setCorrectCategory(mapobject);
correctDuplicateTitle(mapobject);
final String json = g.toJson(mapobject);
context.write(new Text(keyString),
new IndexWritable(json, b));
deleteFlag = 1;
}
}
}
}
if (deleteFlag == 0) {
final Put put = new Put(Bytes.toBytes(keyString));
put.add(Bytes.toBytes("cf"), Bytes.toBytes("ErrorFlag"),
Bytes.toBytes("1"));
hbasePutErrorList.add(put);
}
} catch (final Exception e) {
e.printStackTrace();
}
}
/**
* Removes duplicate statements in the title
*
* @param mapobject
*/
private void correctDuplicateTitle(final Map<String, Object> mapobject) {
final String duplicateTitle = mapobject.get("title").toString();
final String stripedTitleArr[] = duplicateTitle.split(" ", 4);
if (stripedTitleArr.length == 4) {
final String subString = stripedTitleArr[0] + " "
+ stripedTitleArr[1] + " " + stripedTitleArr[2];
if (stripedTitleArr[3].contains(subString)) {
mapobject.put("title", duplicateTitle
.substring(duplicateTitle.indexOf(subString,
subString.length() - 1)));
mapobject.put("title", stripedTitleArr[3]
.substring(stripedTitleArr[3].indexOf(subString)));
}
}
}
/**
* Sets the category based on the various category-specific keywords
*
* @param mapobject
*/
private void setCorrectCategory(final Map<String, Object> mapobject) {
final String url = mapobject.get("url") + "";
final String cat = mapobject.get("tags") + "";
if ("sports".equalsIgnoreCase(cat)
|| "cricket".equalsIgnoreCase(cat)) {
if (!(url.toLowerCase().contains("sport")
|| url.toLowerCase().contains("खेल")
|| url.toLowerCase().contains("cric") || url
.toLowerCase().contains("क्रिकेट"))) {
final String desc = mapobject.get("description").toString();
boolean isSports = false;
int count = 0;
for (final String keyword : SPORTS_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count > 1) {
isSports = true;
}
if (!isSports) {
mapobject.put("tags", "national");
}
if (isSports
&& (desc.contains("क्रिकेट")
|| url.toLowerCase().contains("cric")
|| desc.contains("टॉस")
|| desc.contains("वनडे") || desc
.contains("बल्लेबाज"))) {
mapobject.put("tags", "cricket");
}
}
} else if ("business".equalsIgnoreCase(cat)) {
if ((url.toLowerCase().contains("sport") || url.toLowerCase()
.contains("खेल"))) {
mapobject.put("tags", "sports");
} else if (url.toLowerCase().contains("cric")
|| url.toLowerCase().contains("क्रिकेट")) {
mapobject.put("tags", "cricket");
} else if (!(url.toLowerCase().contains("busines")
|| url.toLowerCase().contains("व्यापार")
|| url.toLowerCase().contains("economy")
|| url.toLowerCase().contains("finance")
|| url.toLowerCase().contains("बिजनेस")
|| url.toLowerCase().contains("market")
|| url.toLowerCase().contains("karobar") || url
.contains("कारोबार"))) {
final String desc = mapobject.get("description").toString();
int count = 0;
for (final String keyword : BUSINESS_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count < 2) {
mapobject.put("tags", "national");
}
}
} else if ("gossip".equalsIgnoreCase(cat)) {
if ((url.toLowerCase().contains("sport") || url.toLowerCase()
.contains("खेल"))) {
mapobject.put("tags", "sports");
} else if (url.toLowerCase().contains("cric")
|| url.toLowerCase().contains("क्रिकेट")) {
mapobject.put("tags", "cricket");
} else if (url.toLowerCase().contains("busines")) {
mapobject.put("tags", "business");
} else if (!(url.toLowerCase().contains("masala")
|| url.toLowerCase().contains("gossip")
|| url.toLowerCase().contains("gupshup") || url
.toLowerCase().contains("garam"))) {
final String desc = mapobject.get("description").toString();
int count = 0;
for (final String keyword : GOSSIP_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count < 2) {
mapobject.put("tags", "national");
}
}
} else if ("crime".equalsIgnoreCase(cat)) {
if ((url.toLowerCase().contains("sport") || url.toLowerCase()
.contains("खेल"))) {
mapobject.put("tags", "sports");
} else if (url.toLowerCase().contains("cric")
|| url.toLowerCase().contains("क्रिकेट")) {
mapobject.put("tags", "cricket");
} else if (url.toLowerCase().contains("busines")) {
mapobject.put("tags", "business");
} else if (!(url.toLowerCase().contains("crime")
|| url.toLowerCase().contains("terrorist")
|| url.toLowerCase().contains("abuse")
|| url.toLowerCase().contains("forgery")
|| url.toLowerCase().contains("assault")
|| url.toLowerCase().contains("violence")
|| url.toLowerCase().contains("rape")
|| url.toLowerCase().contains("teasing")
|| url.toLowerCase().contains("molestation")
|| url.toLowerCase().contains("scandal") || url
.toLowerCase().contains("murder"))) {
final String desc = mapobject.get("description").toString();
int count = 0;
for (final String keyword : CRIME_KEYWORDS) {
if (desc.contains(keyword)) {
count++;
}
}
if (count < 2) {
mapobject.put("tags", "national");
}
}
} else if (cat != null && cat.startsWith("local")) {
}
}
/**
* Checks for a valid type of HTML page
*
* @param mapobject
* @return
*/
private boolean checkValidType(final Map<String, Object> mapobject) {
if (mapobject.containsKey("type")
&& !(mapobject.get("type").toString().contains("image") || mapobject
.get("type").toString().contains("rss"))) {
return true;
}
return false;
}
/**
* Refines the description according to its length and content; if no
* usable description is present, gets the description from the metatag
* description
*
* @param mapobject
* @return {@link Boolean}
*/
private boolean refineDescription(final Map<String, Object> mapobject) {
if (mapobject.containsKey("description")
&& mapobject.get("description").toString().length() > 75
&& !mapobject.get("description").toString().contains(";}")
&& !mapobject.get("description").toString()
.contains("<cite>")
&& !mapobject.get("description").toString()
.contains("href=")
&& !mapobject.get("description").toString()
.contains("All rights reserved")) {
return true;
} else if (mapobject.containsKey("metatag.description")
&& mapobject.get("metatag.description").toString().length() > 75
&& !mapobject.get("metatag.description").toString()
.contains(";}")
&& !mapobject.get("metatag.description").toString()
.contains("<cite>")) {
mapobject.put("description",
mapobject.get("metatag.description"));
return true;
}
return false;
}
/**
* Refines the meta tags: keeps only English keywords of a few words each;
* if none are present, builds the keywords from the HTML title field, and
* failing that from the url, excluding numbers from the keywords
*
* @param mapobject
*/
private void refineMetaTags(final Map<String, Object> mapobject) {
String metaTag = "";
int tagFlag = 0;
if (mapobject.containsKey("metatag.keywords")) {
final String metaTags[] = mapobject.get("metatag.keywords")
.toString().replaceAll("\\|", ",").split(",");
String domain = null;
StringBuilder temp = null;
for (final String metaTag2 : metaTags) {
if (mapobject.containsKey("host")) {
domain = mapobject.get("host") + "";
if (domain.split("\\.").length > 1
&& (metaTag2
.contains(domain.split("\\.")[domain
.split("\\.").length - 2]) || metaTag2
.contains(domain.split("\\.")[0])))
{
continue;
}
}
String[] arr = metaTag2.split(" ");
arr = removeUnicodeWords(arr);
if (arr.length > 0 && arr.length < 5) {
temp = new StringBuilder();
for (final String str : arr) {
temp.append(str);
temp.append(" ");
}
if (metaTag.length() + temp.length() < 70) {
metaTag = metaTag + "," + temp.toString();
}
}
}
if (metaTag.startsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(1, metaTag.length());
}
}
if (metaTag.length() < 1 && mapobject.containsKey("title")) {
/**
* Extracting tags from the title tag if the length of the
* keyword is greater than 4
*/
final String title = (String) mapobject.get("title");
final String splitTitle[] = title.split(" ");
int count = 0;
for (int i = 0; i < splitTitle.length; i++) {
if (splitTitle[i].length() > 4
&& !splitTitle[i].matches("^[\\u0900-\\u097F].*")) {
metaTag = metaTag + splitTitle[i] + ",";
count++;
if (count == 5) {
break;
}
}
}
if (metaTag.split(",").length > 3) {
if (metaTag.endsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(0, metaTag.length() - 1);
}
} else {
metaTag = "";
}
}
if (metaTag.length() < 1) {
/**
* Extracting the tags from the url if the length of the keyword
* is greater than 4
*/
final String splitUrl[] = mapobject.get("url").toString()
.split("/");
final String lastSplitValue = splitUrl[splitUrl.length - 1];
final String tagList[] = generateTokens(lastSplitValue);
if (tagList != null) {
int count = 0;
for (int i = 0; i < tagList.length; i++) {
if (tagList[i].length() > 4
&& !tagList[i].matches("^[\\u0900-\\u097F].*")) {
metaTag = metaTag + tagList[i] + ",";
count++;
if (count == 5) {
break;
}
}
}
}
if (metaTag.endsWith(",")) {
metaTag = metaTag.trim();
metaTag = metaTag.substring(0, metaTag.length() - 1);
}
}
if (metaTag.length() > 0) {
metaTag = metaTag.replaceAll("\\[", "");
metaTag = metaTag.replaceAll("\"", "");
metaTag = metaTag.replaceAll(";", "");
metaTag = metaTag.replaceAll(":", "");
metaTag = metaTag.replaceAll("\u0027", "");
metaTag = metaTag.replaceAll("\u003d", "");
metaTag = metaTag.replaceAll("\u0026", "");
tagFlag = 1;
}
mapobject.put("TagFlag", tagFlag);
mapobject.put("metatag.keywords", metaTag);
}
/**
* Removes Devanagari (Unicode) and numeric words
*
* @param arr
* @return
*/
private String[] removeUnicodeWords(final String[] arr) {
final List<String> returnArr = new ArrayList<String>();
for (final String str : arr) {
if (str != null && str.trim().length() > 3
&& !str.matches("^[\\u0900-\\u097F].*")
&& !(str.matches("^[0-9].*"))) {
returnArr.add(str.trim());
}
}
final String[] retrnArr = new String[returnArr.size()];
returnArr.toArray(retrnArr);
return retrnArr;
}
/**
* Generates the token list with the help of the Lucene analyzer
*
* @param lastSplitValue
* @return the list of keywords, or null if fewer than four tokens were found
*/
private String[] generateTokens(String lastSplitValue) {
final List<String> list = new ArrayList<String>();
lastSplitValue = lastSplitValue.replace("\\.", " ").replace("%20",
" ");
try {
final Version matchVersion = Version.LUCENE_45;
final Analyzer analyzer = new HindiAnalyzer(matchVersion);
final TokenStream ts = analyzer.tokenStream("field",
new StringReader(lastSplitValue));
ts.reset();
while (ts.incrementToken()) {
final CharTermAttribute cta = ts
.getAttribute(CharTermAttribute.class);
if (cta.toString().length() > 4
&& !cta.toString().matches("^[0-9].*")) {
list.add(cta.toString());
}
}
ts.end();
ts.close();
analyzer.close();
} catch (final Exception e) {
e.printStackTrace();
}
if (list.size() > 3) {
return list.toArray(new String[list.size()]);
} else {
return null;
}
}
/**
* Checks the title and assigns its language based on language detection
*
* @param mapobject
* @return {@link Boolean}
*/
private boolean setLang(final Map<String, Object> mapobject) {
final String title = mapobject.get("title").toString();
final String description = mapobject.get("title").toString();
String language = "";
try {
language = mapper.detect(title);
mapper.detect(description);
} catch (final LangDetectException e) {
System.out.println("\n title with error is - " + title);
System.out.println("\n description with error is - "
+ description);
e.printStackTrace();
/*
* String title = mapobject.get("title").toString(); language =
* mapobject.get("lang") + ""; language = language.trim(); if
* (language.trim().equalsIgnoreCase("hi") ||
* language.trim().startsWith("en") ||
* language.trim().equalsIgnoreCase("lt")) { String[] titleArr =
* title.trim().split(" "); int i = 0; for (String titlePart :
* titleArr) { if
* (titlePart.trim().matches("^[\\u0900-\\u097F].*")) { i++; } }
* if (i >= titleArr.length * 0.5) { mapobject.put("lang",
* "hi"); } else { mapobject.put("lang", "lt"); } return true; }
*/
return false;
}
if (language.trim().equalsIgnoreCase("hi")
|| language.trim().startsWith("en")
|| language.trim().equalsIgnoreCase("lt")) {
mapobject.put("lang", language);
return true;
}
return false;
}
private String detect(final String text) throws LangDetectException {
final Detector detector = DetectorFactory.create();
detector.append(text);
return detector.detect();
}
/**
* Checks whether to include the doc based on its title; from the anchor
* tag titles, chooses the title with the largest number of words,
* preferring Hindi, and also gets the image from the anchor tag href
* attribute
*
* @param mapobject
*            of the key value pair
* @return {@link Boolean}
*/
private boolean checkTitleImage(final Map<String, Object> mapobject) {
final TreeSet<String> set = new TreeSet<String>(new SetSort());
final Gson gson = new Gson();
JsonArray array = null;
JsonObject object2 = null;
if (mapobject.containsKey("anchor")
&& mapobject.get("anchor") != null) {
final String arr = (String) mapobject.get("anchor");
try {
array = gson.fromJson(arr, JsonArray.class);
for (final JsonElement jsonElement : array) {
try {
object2 = gson.fromJson(jsonElement.getAsString(),
JsonObject.class);
} catch (final Exception e) {
if (object2 == null) {
object2 = new JsonObject();
object2.addProperty("title",
jsonElement.getAsString());
object2.addProperty("href", "");
object2.addProperty("alt", "");
}
}
if (object2 != null) {
assignTitleImage(mapobject, set, object2);
}
object2 = null;
}
} catch (final ClassCastException e) {
object2 = gson.fromJson(arr, JsonObject.class);
assignTitleImage(mapobject, set, object2);
} catch (final Exception e) {
e.printStackTrace();
}
if (!set.isEmpty()) {
int loop = 0;
final List<String> tempList = new LinkedList<String>();
for (final String string : set) {
final String title = string;
tempList.add(title.trim());
loop++;
if (loop == 2) {
break;
}
}
if (!tempList.isEmpty()) {
if (tempList.get(0).matches("^[\\u0900-\\u097F].*")) {
mapobject.put("title", tempList.get(0));
} else if (tempList.size() > 1
&& !(tempList.get(0)
.matches("^[\\u0900-\\u097F].*"))
&& tempList.get(1).matches(
"^[\\u0900-\\u097F].*")) {
mapobject.put("title", tempList.get(1));
} else {
mapobject.put("title", tempList.get(0));
}
}
}
}
if (mapobject.containsKey("title")
&& mapobject.get("title").toString().length() > 0
&& mapobject.get("title").toString().split(" ").length > 2
&& mapobject.get("title").toString().split(" ").length < 20
&& !mapobject.get("title").toString().contains("<")) {
if (set.isEmpty()) {
mapobject.put("title",
getTitleRefined(mapobject.get("title") + ""));
}
return true;
}
return false;
}
/**
* @param mapobject
* @param set
* @param object2
*/
private void assignTitleImage(final Map<String, Object> mapobject,
final TreeSet<String> set, final JsonObject object2) {
if (!mapobject.containsKey("ImgH1")
&& !mapobject.containsKey("ImgH2")) {
if (object2.get("href") != null
&& object2.get("href").getAsString().length() > 0
&& (object2.get("href").getAsString().toLowerCase()
.contains(".jpg")
|| object2.get("href").getAsString()
.toLowerCase().contains(".jpeg") || object2
.get("href").getAsString().toLowerCase()
.contains(".gif"))) {
putImages(mapobject, object2.get("href").getAsString()
.trim(), mapobject.get("tags").toString().trim()
.toLowerCase());
}
}
if (object2.get("title") != null
&& object2.get("title").getAsString().length() > 0
&& object2.get("title").getAsString().split(" ").length > 2
&& object2.get("title").getAsString().split(" ").length < 20
&& !object2.get("title").getAsString().contains("<")) {
final String newTitle = getTitleRefined(object2.get("title")
.getAsString());
set.add(newTitle.trim());
}
}
/**
* Refines the title based on specific bad keywords noticed during
* observation
*
* @param title
* @return refined title
*/
private String getTitleRefined(String title) {
title = title.replaceAll("\u0027", "");
title = title.replaceAll("\u0026", "");
title = title.replaceAll("\u003d", "");
if (title.contains("-")) {
if (title.trim().split("-").length > 1
&& !title.trim().split("-")[1].trim().matches(
"^[\\u0900-\\u097F].*")) {
return title.trim().split("-")[0].trim();
}
} else if (title.contains(":")) {
if (!title.trim().split(":")[0].trim().matches(
"^[\\u0900-\\u097F].*")
&& title.trim().split(":").length > 1) {
return title.trim().split(":")[1].trim();
}
}
return title;
}
/**
* Creates the path for the images
*
* @param map2
*            of the key value pair
* @param imageUrl
* @param category
*/
private void putImages(final Map<String, Object> map2,
final String imageUrl, final String category) {
try {
map2.put("ImgSrc", StringEscapeUtils.unescapeHtml(imageUrl)
.trim());
if (map2.containsKey("ImgSrc") && map2.get("ImgSrc") != null
&& map2.get("ImgSrc").toString().length() > 0) {
map2.put(
"ImgSrc",
StringEscapeUtils.unescapeHtml(map2.get("ImgSrc")
.toString())
+ "##RAFTAAR##"
+ imageUrl.trim());
} else {
return;
}
String imgNamearr[] = null;
try {
imgNamearr = imageUrl.split("/");
} catch (final Exception e) {
e.printStackTrace();
}
String imgName = null;
try {
imgName = imgNamearr[imgNamearr.length - 1];
} catch (final Exception e) {
e.printStackTrace();
}
final String imagePath = "/"
+ String.valueOf(imgName.charAt(0));
imgName = imgName.replaceAll(" ", "_").replaceAll("%20", "_");
if (imgName.split(".jpg").length > 0) {
imgName = imgName.split(".jpg")[0];
imgName = imgName + ".jpg";
}
map2.put("ImgH1", "h1/" + category + imagePath + "/" + imgName);
map2.put("ImgH2", "h2/" + category + imagePath + "/" + imgName);
} catch (final Exception e) {
e.printStackTrace();
}
}
/**
* Inserts the data into Elasticsearch
*
* @param mapobject
* @param key
*            unique id; generally it is the unique url
*/
public static void insertToElastic(final Map<String, Object> mapobject,
final String key) {
final Settings settings = ImmutableSettings.settingsBuilder()
        .put("cluster.name", CLUSTER).build(); // set cluster.name to your cluster's name
final Client client = new TransportClient(settings)
.addTransportAddress(new InetSocketTransportAddress(
SEARCH_HOST, Integer.parseInt(SEARCH_PORT)));
client.prepareIndex(SEARCH_INDEX_NAME, SEARCHtYPE, key)
.setSource(mapobject).execute().actionGet();
client.close();
}
/**
* Assigns the city to news items without one, updating the category
*
* @param mapobject
*/
private static void assignCity(final Map<String, Object> mapobject) {
String category = mapobject.get("tags").toString();
if (category.endsWith("/")) {
boolean flag = true;
final String catArr[] = category.split("/");
if (catArr.length == 2) {
final String state = catArr[1];
CITY_MAP = STATE_MAP.get(state);
for (final Entry<String, String> e : CITY_MAP.entrySet()) {
final String description = mapobject.get("description")
.toString();
if (description.contains(e.getValue())) {
category = category + e.getKey();
mapobject.put("tags", category);
flag = false;
break;
}
}
}
if (flag) {
mapobject.put("tags", "national");
}
}
}
}
/**
* Updates the data in HBase
*
* @param tableName
* @param rowKey
* @param family
* @param qualifier
* @param value
* @param conf
*/
public static void updateIntoHbase(final String tableName,
final String rowKey, final String family, final String qualifier,
final String value, final Configuration conf) {
HTable table = null;
try {
table = new HTable(conf, tableName);
} catch (final IOException e) {
e.printStackTrace();
}
final Put put = new Put(Bytes.toBytes(rowKey));
put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier),
Bytes.toBytes(value));
try {
table.put(put);
table.close();
} catch (final IOException e) {
e.printStackTrace();
}
}
/**
* Returns the map of all states and cities
*
* @param fileName
* @return
*/
private static Map<String, Map<String, String>> returnMap(
final String fileName) {
final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
BufferedReader br = null;
try {
br = new BufferedReader(new FileReader(fileName));
String line;
while ((line = br.readLine()) != null) {
final String arr[] = line.split("\t", 3);
if (arr.length == 3) {
if (map.containsKey(arr[0])) {
Map<String, String> m = new HashMap<String, String>();
m = map.get(arr[0]);
m.put(arr[1], arr[2]);
} else {
final Map<String, String> m = new HashMap<String, String>();
m.put(arr[1], arr[2]);
map.put(arr[0], m);
}
}
}
} catch (final FileNotFoundException e) {
e.printStackTrace();
} catch (final IOException e) {
e.printStackTrace();
} finally {
if (br != null) {
try {
br.close();
} catch (final Exception e) {
e.printStackTrace();
}
}
}
return map;
}
public static void main(final String[] args) throws Exception {
int c = 0;
c = ToolRunner.run(new Configuration(), new HbaseToElastic(), args);
System.exit(c);
}
}
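For reference, reading a stored document back with the same generation of the TransportClient API used in insertToElastic might look like the sketch below. It reuses the static names from the code above and only illustrates the API; it is not a verified fix for the integration:
// a sketch mirroring insertToElastic's client setup; not a verified fix
public static String fetchFromElastic(final String key) {
    final Settings settings = ImmutableSettings.settingsBuilder()
            .put("cluster.name", CLUSTER).build();
    final Client client = new TransportClient(settings)
            .addTransportAddress(new InetSocketTransportAddress(
                    SEARCH_HOST, Integer.parseInt(SEARCH_PORT)));
    try {
        // GET by id, using the same index/type/key as the insert
        final GetResponse response = client
                .prepareGet(SEARCH_INDEX_NAME, SEARCHtYPE, key)
                .execute().actionGet();
        return response.isExists() ? response.getSourceAsString() : null;
    } finally {
        client.close();
    }
}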
I want my java program to constantly scan for new files in a particular folder.
I am expecting something like:
while (true) {
String csvFile = "D:\\myfiles\\done\\*.*";
br = new BufferedReader(new FileReader(csvFile));
// DO Some thing
}
Is there a good way to do this?
You probably do not want to execute the loop over and over, as this would cause poor performance. You can watch for directory changes instead, which suspends the working thread while no new files are available (excerpt from the linked article):
WatchKey key;
try {
key = watcher.take(); // blocking call, similar to sleep()
} catch (InterruptedException x) {
return; // this happens if your thread got interrupted
}
// do something with the received key
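For context, the watcher used above would be created and registered roughly like this minimal sketch (the directory is the one from the question):
WatchService watcher = FileSystems.getDefault().newWatchService();
Path dir = Paths.get("D:\\myfiles\\done");
dir.register(watcher, StandardWatchEventKinds.ENTRY_CREATE); // watch for new files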
You can use File to get all the files.
Check this:
File folder = new File("D:/myfiles/done");
File[] files = folder.listFiles();
for(File f : files){
// Read f in here
}
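Note that listFiles() returns everything on every scan, so to react only to new files you need to remember what you have already seen between scans. A sketch, with an arbitrary 5-second interval and InterruptedException handling omitted:
// remember which files were already processed between scans
Set<String> seen = new HashSet<>();
while (true) {
    for (File f : folder.listFiles()) {
        if (f.isFile() && seen.add(f.getName())) {
            // seen.add returns true only the first time: f is new, process it here
        }
    }
    Thread.sleep(5000); // poll instead of busy-looping
}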
This FileObserver class looks quite promising:
/**
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright (c) 2007 Sun Microsystems Inc. All Rights Reserved
*
* The contents of this file are subject to the terms
* of the Common Development and Distribution License
* (the License). You may not use this file except in
* compliance with the License.
*
* You can obtain a copy of the License at
* https://opensso.dev.java.net/public/CDDLv1.0.html or
* opensso/legal/CDDLv1.0.txt
* See the License for the specific language governing
* permission and limitations under the License.
*
* When distributing Covered Code, include this CDDL
* Header Notice in each file and include the License file
* at opensso/legal/CDDLv1.0.txt.
* If applicable, add the following below the CDDL Header,
* with the fields enclosed by brackets [] replaced by
* your own identifying information:
* "Portions Copyrighted [year] [name of copyright owner]"
*
* $Id: FileObserver.java,v 1.3 2008/06/25 05:44:08 qcheng Exp $
*
*/
package com.sun.identity.sm.flatfile;
import com.iplanet.am.util.SystemProperties;
import com.sun.identity.shared.Constants;
import com.sun.identity.shared.debug.Debug;
import com.sun.identity.sm.SMSObjectListener;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
class FileObserver extends Thread {
private static Debug debug = Debug.getInstance("amSMSEvent");
private Map snapShot;
private int interval;
private boolean running;
private FlatFileEventManager eventManager;
FileObserver(FlatFileEventManager eventManager) {
setDaemon(true);
getPollingInterval();
this.eventManager = eventManager;
}
private void getPollingInterval() {
String time = SystemProperties.get(
Constants.CACHE_POLLING_TIME_PROPERTY);
interval = Constants.DEFAULT_CACHE_POLLING_TIME;
if (time != null) {
try {
interval = Integer.parseInt(time);
} catch (NumberFormatException nfe) {
debug.error(
"FileObserver.getCachePollingInterval", nfe);
}
}
interval = interval * 60 * 1000;
}
/**
* Returns <code>true</code> if thread is running.
*
* @return <code>true</code> if thread is running.
*/
public boolean isRunning() {
return running;
}
/**
* Stops the thread.
*/
public void stopThread() {
running = false;
}
public void run() {
running = true;
snapShot = getCurrentSnapShot();
try {
while (running) {
/*
* This flag set to false in the begin of the thread.
* when a node is added/delete from the file system, we need
* to toggle this flag which in turn ask the
* SMSEnhancedFlatFileObject to rebuild the directory tree.
*/
boolean needReloadRootNode = false;
sleep(interval);
Map newSnapShot = getCurrentSnapShot();
if (snapShot != null) {
for (Iterator i = newSnapShot.keySet().iterator();
i.hasNext();
) {
String filename = (String)i.next();
if (snapShot.containsKey(filename)) {
long prev =((Long)snapShot.get(filename))
.longValue();
long curr =((Long)newSnapShot.get(filename))
.longValue();
if (prev != curr) {
eventManager.notify(getDN(filename),
SMSObjectListener.MODIFY);
}
} else {
if (!needReloadRootNode) {
eventManager.reloadRootNode();
needReloadRootNode = true;
}
eventManager.notify(getDN(filename),
SMSObjectListener.ADD);
}
}
for (Iterator i = snapShot.keySet().iterator();
i.hasNext();
) {
String filename = (String)i.next();
if (!newSnapShot.containsKey(filename)) {
if (!needReloadRootNode) {
eventManager.reloadRootNode();
needReloadRootNode = true;
}
eventManager.notify(getDN(filename),
SMSObjectListener.DELETE);
}
}
}
snapShot = newSnapShot;
}
} catch (InterruptedException e) {
debug.warning("FileObserver.run", e);
}
}
private String getDN(String filename) {
BufferedReader buff = null;
String dn = null;
try{
buff = new BufferedReader(new FileReader(filename));
String line = buff.readLine();
if ((line != null) && line.startsWith("#")) {
dn = line.substring(1);
}
} catch (IOException e) {
debug.warning("FileObserver.getDN", e);
} finally {
if (buff != null) {
try {
buff.close();
} catch (IOException ex) {
//ignored
}
}
}
return dn;
}
private Map getCurrentSnapShot() {
Map snapshot = null;
String baseDir = SystemProperties.get(
SMSFlatFileObjectBase.SMS_FLATFILE_ROOTDIR_PROPERTY);
File dir = new File(baseDir);
String[] files = dir.list();
// if the war is not configured we may not get any files here.
if (files != null && files.length > 0) {
snapshot = new HashMap(files.length *2);
for (int i = 0; i < files.length; i++) {
String filename = baseDir + "/" + files[i];
File f = new File(filename);
if (!f.isDirectory()) {
snapshot.put(filename, new Long(f.lastModified()));
}
}
}
return snapshot;
}
}
If you prefer a library, you can also check the Apache Commons IO FileAlterationObserver.
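A minimal sketch of that approach, with the directory path taken from the question and a 5-second polling interval chosen arbitrarily:
import java.io.File;
import org.apache.commons.io.monitor.FileAlterationListenerAdaptor;
import org.apache.commons.io.monitor.FileAlterationMonitor;
import org.apache.commons.io.monitor.FileAlterationObserver;
public class CommonsIoWatchDemo {
    public static void main(String[] args) throws Exception {
        FileAlterationObserver observer =
                new FileAlterationObserver(new File("D:\\myfiles\\done"));
        observer.addListener(new FileAlterationListenerAdaptor() {
            @Override
            public void onFileCreate(File file) {
                System.out.println("New file: " + file.getName());
            }
        });
        // the monitor polls the observer on its own thread at the given interval
        FileAlterationMonitor monitor = new FileAlterationMonitor(5000, observer);
        monitor.start();
    }
}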
This may be useful.
import static java.nio.file.StandardWatchEventKinds.ENTRY_CREATE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_DELETE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_MODIFY;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.WatchEvent;
import java.nio.file.WatchKey;
import java.nio.file.WatchService;
public class DirectoryWatchDemo {
public static void main(String[] args) {
try {
WatchService watcher = FileSystems.getDefault().newWatchService();
Path dir = Paths.get("D:\\Mani_Test\\");
dir.register(watcher, ENTRY_CREATE, ENTRY_DELETE, ENTRY_MODIFY);
System.out.println("Watch Service registered for dir: " + dir.getFileName());
while (true) {
WatchKey key;
try {
key = watcher.take();
} catch (InterruptedException ex) {
ex.printStackTrace();
return;
}
for (WatchEvent<?> event : key.pollEvents()) {
WatchEvent.Kind<?> kind = event.kind();
WatchEvent<Path> ev = (WatchEvent<Path>) event;
Path fileName = ev.context();
System.out.println(kind.name() + ": " + fileName);
}
boolean valid = key.reset();
if (!valid) {
break;
}
}
} catch (IOException ex) {
System.err.println(ex);
}
}
}
A File object can be used for a folder as well. Here is the code:
String filePath = "D:\\myfiles\\done\\";
File dir = new File(filePath);
if (dir.isDirectory())
{
File[] fileList = dir.listFiles();
if (fileList.length > 0)
{
// Your code here
}
}
I use GML (3.1.1) XSDs in my application. I want to download all GML XSDs in version 3.1.1 as, for example, a zip file. In other words: the base XSD is here, and I want to download that XSD with all its imports in a zip file or something like a zip file. Is there any application which supports that?
I've found this downloader, but it doesn't work for me (I think this application does not support relative paths in imports, which occur in gml.xsd 3.1.1). Any ideas?
QTAssistant's XSR (I am associated with it) has an easy to use function that allows one to automatically import and refactor XSD content as local files from all sorts of sources. In the process it'll update schema location references, etc.
I've made a simple screen capture of the steps involved in achieving a task like this which should demonstrate its usability.
Based on the solution of mschwehl, I made an improved class to achieve the fetch. It is well suited to the question. See https://github.com/mfalaize/schema-fetcher
You can achieve this using SOAP UI.
Follow these steps:
1. Create a project using the WSDL.
2. Choose your interface and open it in the interface viewer.
3. Navigate to the tab 'WSDL Content'.
4. Use the last icon under the tab 'WSDL Content': 'Export the entire WSDL and included/imported files to a local directory'.
5. Select the folder where you want the XSDs to be exported to.
Note: SOAPUI will remove all relative paths and will save all XSDs to the same folder.
I have written a simple Java main that does the job and changes the schema locations to relative URLs:
package dl;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.Authenticator;
import java.net.PasswordAuthentication;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class SchemaPersister {
private static final String EXPORT_FILESYSTEM_ROOT = "C:/export/xsd";
// some caching of the http-responses
private static Map<String,String> _httpContentCache = new HashMap<String,String>();
public static void main(String[] args) {
try {
new SchemaPersister().doIt();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
private void doIt() throws Exception {
// // if you need an inouse-Proxy
// final String authUser = "xxxxx";
// final String authPassword = "xxxx"
//
// System.setProperty("http.proxyHost", "xxxxx");
// System.setProperty("http.proxyPort", "xxxx");
// System.setProperty("http.proxyUser", authUser);
// System.setProperty("http.proxyPassword", authPassword);
//
// Authenticator.setDefault(
// new Authenticator() {
// public PasswordAuthentication getPasswordAuthentication() {
// return new PasswordAuthentication(authUser, authPassword.toCharArray());
// }
// }
// );
//
Set <SchemaElement> allElements = new HashSet<SchemaElement>() ;
// URL url = new URL("file:/C:/xauslaender-nachrichten-administration.xsd");
URL url = new URL("http://www.osci.de/xauslaender141/xauslaender-nachrichten-bamf-abh.xsd");
allElements.add ( new SchemaElement(url));
for (SchemaElement e: allElements) {
System.out.println("processing " + e);
e.doAll();
}
System.out.println("done!");
}
class SchemaElement {
private URL _url;
private String _content;
public List <SchemaElement> _imports ;
public List <SchemaElement> _includes ;
public SchemaElement(URL url) {
this._url = url;
}
public void checkIncludesAndImportsRecursive() throws Exception {
InputStream in = new ByteArrayInputStream(downloadContent() .getBytes("UTF-8"));
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.parse(in);
List<Node> includeNodeList = null;
List<Node> importNodeList = null;
includeNodeList = getXpathAttribute(doc,"/*[local-name()='schema']/*[local-name()='include']");
_includes = new ArrayList <SchemaElement> ();
for ( Node element: includeNodeList) {
Node sl = element.getAttributes().getNamedItem("schemaLocation");
if (sl == null) {
System.out.println(_url + " defines one import but no schemaLocation");
continue;
}
String asStringAttribute = sl.getNodeValue();
URL url = buildUrl(asStringAttribute,_url);
SchemaElement tmp = new SchemaElement(url);
tmp.setSchemaLocation(asStringAttribute);
tmp.checkIncludesAndImportsRecursive();
_includes.add(tmp);
}
importNodeList = getXpathAttribute(doc,"/*[local-name()='schema']/*[local-name()='import']");
_imports = new ArrayList <SchemaElement> ();
for ( Node element: importNodeList) {
Node sl = element.getAttributes().getNamedItem("schemaLocation");
if (sl == null) {
System.out.println(_url + " defines one import but no schemaLocation");
continue;
}
String asStringAttribute = sl.getNodeValue();
URL url = buildUrl(asStringAttribute,_url);
SchemaElement tmp = new SchemaElement(url);
tmp.setSchemaLocation(asStringAttribute);
tmp.checkIncludesAndImportsRecursive();
_imports.add(tmp);
}
in.close();
}
private String schemaLocation;
private void setSchemaLocation(String schemaLocation) {
this.schemaLocation = schemaLocation;
}
// http://stackoverflow.com/questions/10159186/how-to-get-parent-url-in-java
private URL buildUrl(String asStringAttribute, URL parent) throws Exception {
if (asStringAttribute.startsWith("http")) {
return new URL(asStringAttribute);
}
if (asStringAttribute.startsWith("file")) {
return new URL(asStringAttribute);
}
// relative URL
URI parentUri = parent.toURI().getPath().endsWith("/") ? parent.toURI().resolve("..") : parent.toURI().resolve(".");
return new URL(parentUri.toURL().toString() + asStringAttribute );
}
public void doAll() throws Exception {
System.out.println("READ ELEMENTS");
checkIncludesAndImportsRecursive();
System.out.println("PRINTING DEPENDENCYS");
printRecursive(0);
System.out.println("GENERATE OUTPUT");
patchAndPersistRecursive(0);
}
public void patchAndPersistRecursive(int level) throws Exception {
File f = new File(EXPORT_FILESYSTEM_ROOT + File.separator + this.getXDSName() );
System.out.println("FILENAME: " + f.getAbsolutePath());
if (_imports.size() > 0) {
for (int i = 0; i < level; i++) {
System.out.print(" ");
}
System.out.println("IMPORTS");
for (SchemaElement kid : _imports) {
kid.patchAndPersistRecursive(level+1);
}
}
if (_includes.size() > 0) {
for (int i = 0; i < level; i++) {
System.out.print(" ");
}
System.out.println("INCLUDES");
for (SchemaElement kid : _includes) {
kid.patchAndPersistRecursive(level+1);
}
}
String contentTemp = downloadContent();
for (SchemaElement i : _imports ) {
if (i.isHTTP()) {
contentTemp = contentTemp.replace(
"<xs:import schemaLocation=\"" + i.getSchemaLocation() ,
"<xs:import schemaLocation=\"" + i.getXDSName() );
}
}
for (SchemaElement i : _includes ) {
if (i.isHTTP()) {
contentTemp = contentTemp.replace(
"<xs:include schemaLocation=\"" + i.getSchemaLocation(),
"<xs:include schemaLocation=\"" + i.getXDSName() );
}
}
FileOutputStream fos = new FileOutputStream(f);
fos.write(contentTemp.getBytes("UTF-8"));
fos.close();
System.out.println("File written: " + f.getAbsolutePath() );
}
public void printRecursive(int level) {
for (int i = 0; i < level; i++) {
System.out.print(" ");
}
System.out.println(_url.toString());
if (this._imports.size() > 0) {
for (int i = 0; i < level; i++) {
System.out.print(" ");
}
System.out.println("IMPORTS");
for (SchemaElement kid : this._imports) {
kid.printRecursive(level+1);
}
}
if (this._includes.size() > 0) {
for (int i = 0; i < level; i++) {
System.out.print(" ");
}
System.out.println("INCLUDES");
for (SchemaElement kid : this._includes) {
kid.printRecursive(level+1);
}
}
}
String getSchemaLocation() {
return schemaLocation;
}
/**
* removes http:// and replaces / with _
* @return
*/
private String getXDSName() {
String tmp = schemaLocation;
// Root on local File-System -- just grap the last part of it
if (tmp == null) {
tmp = _url.toString().replaceFirst(".*/([^/?]+).*", "$1");
}
if ( isHTTP() ) {
tmp = tmp.replace("http://", "");
tmp = tmp.replace("/", "_");
} else {
tmp = tmp.replace("/", "_");
tmp = tmp.replace("\\", "_");
}
return tmp;
}
private boolean isHTTP() {
return _url.getProtocol().startsWith("http");
}
private String downloadContent() throws Exception {
if (_content == null) {
System.out.println("reading content from " + _url.toString());
if (_httpContentCache.containsKey(_url.toString())) {
this._content = _httpContentCache.get(_url.toString());
System.out.println("Cache hit! " + _url.toString());
} else {
System.out.println("Download " + _url.toString());
Scanner scan = new Scanner(_url.openStream(), "UTF-8");
if (isHTTP()) {
this._content = scan.useDelimiter("\\A").next();
} else {
this._content = scan.useDelimiter("\\Z").next();
}
scan.close();
if (this._content != null) {
_httpContentCache.put(_url.toString(), this._content);
}
}
}
if (_content == null) {
throw new NullPointerException("Content of " + _url.toString() + " is null");
}
return _content;
}
private List<Node> getXpathAttribute(Document doc, String path) throws Exception {
List <Node> returnList = new ArrayList <Node> ();
XPathFactory xPathfactory = XPathFactory.newInstance();
XPath xpath = xPathfactory.newXPath();
{
XPathExpression expr = xpath.compile(path);
NodeList nodeList = (NodeList) expr.evaluate(doc, XPathConstants.NODESET );
for (int i = 0 ; i < nodeList.getLength(); i++) {
Node n = nodeList.item(i);
returnList.add(n);
}
}
return returnList;
}
@Override
public String toString() {
if (_url != null) {
return _url.toString();
}
return super.toString();
}
}
}
I created a Python tool to recursively download XSDs with relative paths in import tags (e.g. <import schemaLocation="../../../../abc)
https://github.com/n-a-t-e/xsd_download
After downloading the schema you can use xmllint to validate an XML document.
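For example, a validation run might look like xmllint --noout --schema base.xsd instance.xml, where the file names are placeholders for the downloaded root schema and your document.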
I am using org.apache.xmlbeans.impl.tool.SchemaResourceManager from the xmlbeans project. This class is quick and easy to use.
for example:
SchemaResourceManager manager = new SchemaResourceManager(new File(dir));
manager.process(schemaUris, emptyArray(), false, true, true);
manager.writeCache();
This class has a main method that documents the different options available.
How can I get log4j to delete old rotating log files? I know I can set up automated jobs (cron for UNIX, scheduled task for Windows), but I want it cross-platform, and I want it in our application's log configuration as a part of our application, rather than in separate code outside in OS-specific scripting languages. Our application is not written in OS scripting languages, and I don't want to do this part of it in them.
RollingFileAppender does this. You just need to set maxBackupIndex to the maximum number of backup files to keep.
Logs rotate for a reason: so that you only keep so many log files around. In log4j.xml you can add this to your appender node:
<param name="MaxBackupIndex" value="20"/>
The value tells log4j to keep only 20 rotated log files around. You can limit this to 5 if you want, or even 1. If your application isn't logging much data and you have 20 log files spanning the last 8 months but only need a week's worth of logs, then you should tweak your log4j.xml "MaxBackupIndex" and "MaxFileSize" params.
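For context, a complete appender definition might look like this sketch (the appender name, file path, and size values are assumptions, not values from the question):
<appender name="fileAppender" class="org.apache.log4j.RollingFileAppender">
    <param name="File" value="logs/app.log"/>
    <param name="MaxFileSize" value="10MB"/>
    <param name="MaxBackupIndex" value="20"/>
    <layout class="org.apache.log4j.PatternLayout">
        <param name="ConversionPattern" value="%d %-5p %c - %m%n"/>
    </layout>
</appender>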
Alternatively, if you are using a properties file (instead of the xml) and wish to save 15 files (for example)
log4j.appender.[appenderName].MaxBackupIndex = 15
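Spelled out in full, a properties-based configuration might look like this sketch (the appender name "file" and the path/size values are assumptions):
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=logs/app.log
log4j.appender.file.MaxFileSize=10MB
log4j.appender.file.MaxBackupIndex=15
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d %-5p %c - %m%n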
DailyRollingFileAppender has no built-in option to control deleting old log files. But you can write your own custom Appender that deletes old log files in much the same way as setting maxBackupIndex does for RollingFileAppender.
Simple instructions can be found here.
From the linked instructions:
If you are trying to use the Apache Log4j DailyRollingFileAppender for a daily log file, you may want to specify the maximum number of files which should be kept, just like RollingFileAppender supports with maxBackupIndex. But the current version of Log4j (Apache log4j 1.2.16) does not provide any mechanism to delete old log files if you are using DailyRollingFileAppender. I made small modifications to the original version of DailyRollingFileAppender to add a maxBackupIndex property, so it is possible to clean up old log files which may not be required for future usage.
You can achieve it using a custom log4j appender.
MaxNumberOfDays - sets how many days of rotated log files to keep.
CompressBackups - archives old logs with a zip extension.
package com.example.package;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Layout;
import org.apache.log4j.helpers.LogLog;
import org.apache.log4j.spi.LoggingEvent;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.Optional;
import java.util.TimeZone;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
public class CustomLog4jAppender extends FileAppender {
private static final int TOP_OF_TROUBLE = -1;
private static final int TOP_OF_MINUTE = 0;
private static final int TOP_OF_HOUR = 1;
private static final int HALF_DAY = 2;
private static final int TOP_OF_DAY = 3;
private static final int TOP_OF_WEEK = 4;
private static final int TOP_OF_MONTH = 5;
private String datePattern = "'.'yyyy-MM-dd";
private String compressBackups = "false";
private String maxNumberOfDays = "7";
private String scheduledFilename;
private long nextCheck = System.currentTimeMillis() - 1;
private Date now = new Date();
private SimpleDateFormat sdf;
private RollingCalendar rc = new RollingCalendar();
private static final TimeZone gmtTimeZone = TimeZone.getTimeZone("GMT");
public CustomLog4jAppender() {
}
public CustomLog4jAppender(Layout layout, String filename, String datePattern) throws IOException {
super(layout, filename, true);
this.datePattern = datePattern;
activateOptions();
}
public void setDatePattern(String pattern) {
datePattern = pattern;
}
public String getDatePattern() {
return datePattern;
}
@Override
public void activateOptions() {
super.activateOptions();
if (datePattern != null && fileName != null) {
now.setTime(System.currentTimeMillis());
sdf = new SimpleDateFormat(datePattern);
int type = computeCheckPeriod();
printPeriodicity(type);
rc.setType(type);
File file = new File(fileName);
scheduledFilename = fileName + sdf.format(new Date(file.lastModified()));
} else {
LogLog.error("Either File or DatePattern options are not set for appender [" + name + "].");
}
}
private void printPeriodicity(int type) {
String appender = "Log4J Appender: ";
switch (type) {
case TOP_OF_MINUTE:
LogLog.debug(appender + name + " to be rolled every minute.");
break;
case TOP_OF_HOUR:
LogLog.debug(appender + name + " to be rolled on top of every hour.");
break;
case HALF_DAY:
LogLog.debug(appender + name + " to be rolled at midday and midnight.");
break;
case TOP_OF_DAY:
LogLog.debug(appender + name + " to be rolled at midnight.");
break;
case TOP_OF_WEEK:
LogLog.debug(appender + name + " to be rolled at start of week.");
break;
case TOP_OF_MONTH:
LogLog.debug(appender + name + " to be rolled at start of every month.");
break;
default:
LogLog.warn("Unknown periodicity for appender [" + name + "].");
}
}
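// Infers the rollover period by formatting the epoch with the date pattern and checking
// which calendar unit (minute, hour, half-day, day, week, month) first changes the result.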
private int computeCheckPeriod() {
RollingCalendar rollingCalendar = new RollingCalendar(gmtTimeZone, Locale.ENGLISH);
Date epoch = new Date(0);
if (datePattern != null) {
for (int i = TOP_OF_MINUTE; i <= TOP_OF_MONTH; i++) {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat(datePattern);
simpleDateFormat.setTimeZone(gmtTimeZone);
String r0 = simpleDateFormat.format(epoch);
rollingCalendar.setType(i);
Date next = new Date(rollingCalendar.getNextCheckMillis(epoch));
String r1 = simpleDateFormat.format(next);
if (!r0.equals(r1)) {
return i;
}
}
}
return TOP_OF_TROUBLE;
}
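// Closes the current log file, renames it to the scheduled (dated) backup name,
// and reopens a fresh file under the original name.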
private void rollOver() throws IOException {
if (datePattern == null) {
errorHandler.error("Missing DatePattern option in rollOver().");
return;
}
String datedFilename = fileName + sdf.format(now);
if (scheduledFilename.equals(datedFilename)) {
return;
}
this.closeFile();
File target = new File(scheduledFilename);
if (target.exists()) {
Files.delete(target.toPath());
}
File file = new File(fileName);
boolean result = file.renameTo(target);
if (result) {
LogLog.debug(fileName + " -> " + scheduledFilename);
} else {
LogLog.error("Failed to rename [" + fileName + "] to [" + scheduledFilename + "].");
}
try {
this.setFile(fileName, false, this.bufferedIO, this.bufferSize);
} catch (IOException e) {
errorHandler.error("setFile(" + fileName + ", false) call failed.");
}
scheduledFilename = datedFilename;
}
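// Called for every logging event; once the current period has elapsed,
// clean up old backups and roll over before writing the event.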
@Override
protected void subAppend(LoggingEvent event) {
long n = System.currentTimeMillis();
if (n >= nextCheck) {
now.setTime(n);
nextCheck = rc.getNextCheckMillis(now);
try {
cleanupAndRollOver();
} catch (IOException ioe) {
LogLog.error("cleanupAndRollover() failed.", ioe);
}
}
super.subAppend(event);
}
public String getCompressBackups() {
return compressBackups;
}
public void setCompressBackups(String compressBackups) {
this.compressBackups = compressBackups;
}
public String getMaxNumberOfDays() {
return maxNumberOfDays;
}
public void setMaxNumberOfDays(String maxNumberOfDays) {
this.maxNumberOfDays = maxNumberOfDays;
}
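// Deletes backup files older than MaxNumberOfDays, optionally compresses the
// remaining backups, then performs the date-based rollover.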
protected void cleanupAndRollOver() throws IOException {
File file = new File(fileName);
Calendar cal = Calendar.getInstance();
int maxDays = 7;
try {
maxDays = Integer.parseInt(getMaxNumberOfDays());
} catch (Exception e) {
// just leave it at 7.
}
cal.add(Calendar.DATE, -maxDays);
Date cutoffDate = cal.getTime();
if (file.getParentFile().exists()) {
File[] files = file.getParentFile().listFiles(new StartsWithFileFilter(file.getName(), false));
int nameLength = file.getName().length();
for (File value : Optional.ofNullable(files).orElse(new File[0])) {
String datePart;
try {
datePart = value.getName().substring(nameLength);
Date date = sdf.parse(datePart);
if (date.before(cutoffDate)) {
Files.delete(value.toPath());
} else if (getCompressBackups().equalsIgnoreCase("YES") || getCompressBackups().equalsIgnoreCase("TRUE")) {
zipAndDelete(value);
}
} catch (Exception pe) {
// This isn't a file we should touch (it isn't named correctly)
}
}
}
rollOver();
}
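// Compresses the given backup file into a sibling .zip archive and deletes the original.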
private void zipAndDelete(File file) throws IOException {
if (!file.getName().endsWith(".zip")) {
File zipFile = new File(file.getParent(), file.getName() + ".zip");
try (FileInputStream fis = new FileInputStream(file);
FileOutputStream fos = new FileOutputStream(zipFile);
ZipOutputStream zos = new ZipOutputStream(fos)) {
ZipEntry zipEntry = new ZipEntry(file.getName());
zos.putNextEntry(zipEntry);
byte[] buffer = new byte[4096];
int bytesRead;
while ((bytesRead = fis.read(buffer)) != -1) {
zos.write(buffer, 0, bytesRead);
}
zos.closeEntry();
}
Files.delete(file.toPath());
}
}
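// Filter that matches files whose names start with the base log file's name,
// i.e. the active log file plus its dated backups.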
class StartsWithFileFilter implements FileFilter {
private String startsWith;
private boolean inclDirs;
StartsWithFileFilter(String startsWith, boolean includeDirectories) {
super();
this.startsWith = startsWith.toUpperCase();
inclDirs = includeDirectories;
}
public boolean accept(File pathname) {
if (!inclDirs && pathname.isDirectory()) {
return false;
} else {
return pathname.getName().toUpperCase().startsWith(startsWith);
}
}
}
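// Calendar helper that computes the next rollover boundary for the configured period type.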
class RollingCalendar extends GregorianCalendar {
private static final long serialVersionUID = -3560331770601814177L;
int type = CustomLog4jAppender.TOP_OF_TROUBLE;
RollingCalendar() {
super();
}
RollingCalendar(TimeZone tz, Locale locale) {
super(tz, locale);
}
void setType(int type) {
this.type = type;
}
long getNextCheckMillis(Date now) {
return getNextCheckDate(now).getTime();
}
Date getNextCheckDate(Date now) {
this.setTime(now);
switch (type) {
case CustomLog4jAppender.TOP_OF_MINUTE:
this.set(Calendar.SECOND, 0);
this.set(Calendar.MILLISECOND, 0);
this.add(Calendar.MINUTE, 1);
break;
case CustomLog4jAppender.TOP_OF_HOUR:
this.set(Calendar.MINUTE, 0);
this.set(Calendar.SECOND, 0);
this.set(Calendar.MILLISECOND, 0);
this.add(Calendar.HOUR_OF_DAY, 1);
break;
case CustomLog4jAppender.HALF_DAY:
this.set(Calendar.MINUTE, 0);
this.set(Calendar.SECOND, 0);
this.set(Calendar.MILLISECOND, 0);
int hour = get(Calendar.HOUR_OF_DAY);
if (hour < 12) {
this.set(Calendar.HOUR_OF_DAY, 12);
} else {
this.set(Calendar.HOUR_OF_DAY, 0);
this.add(Calendar.DAY_OF_MONTH, 1);
}
break;
case CustomLog4jAppender.TOP_OF_DAY:
this.set(Calendar.HOUR_OF_DAY, 0);
this.set(Calendar.MINUTE, 0);
this.set(Calendar.SECOND, 0);
this.set(Calendar.MILLISECOND, 0);
this.add(Calendar.DATE, 1);
break;
case CustomLog4jAppender.TOP_OF_WEEK:
this.set(Calendar.DAY_OF_WEEK, getFirstDayOfWeek());
this.set(Calendar.HOUR_OF_DAY, 0);
this.set(Calendar.MINUTE, 0);
this.set(Calendar.SECOND, 0);
this.set(Calendar.MILLISECOND, 0);
this.add(Calendar.WEEK_OF_YEAR, 1);
break;
case CustomLog4jAppender.TOP_OF_MONTH:
this.set(Calendar.DATE, 1);
this.set(Calendar.HOUR_OF_DAY, 0);
this.set(Calendar.MINUTE, 0);
this.set(Calendar.SECOND, 0);
this.set(Calendar.MILLISECOND, 0);
this.add(Calendar.MONTH, 1);
break;
default:
throw new IllegalStateException("Unknown periodicity type.");
}
return getTime();
}
}
}
And use these properties in your log4j config file:
log4j.appender.[appenderName]=com.example.package.CustomLog4jAppender
log4j.appender.[appenderName].File=/logs/app-daily.log
log4j.appender.[appenderName].Append=true
log4j.appender.[appenderName].encoding=UTF-8
log4j.appender.[appenderName].layout=org.apache.log4j.EnhancedPatternLayout
log4j.appender.[appenderName].layout.ConversionPattern=%-5.5p %d %C{1.} - %m%n
log4j.appender.[appenderName].DatePattern='.'yyyy-MM-dd
log4j.appender.[appenderName].MaxNumberOfDays=7
log4j.appender.[appenderName].CompressBackups=true