Added testcases for indexing.
Change-Id: I63ceff2056141d8ac576cb78e0a3f4a8e0a02e44
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index e4a6594..3a9349b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -1,12 +1,11 @@
package de.ids_mannheim.korap;
// Krill classes
-import de.ids_mannheim.korap.*;
import de.ids_mannheim.korap.index.*;
import de.ids_mannheim.korap.response.*;
import de.ids_mannheim.korap.query.SpanElementQuery;
+import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;
-import static de.ids_mannheim.korap.util.KrillProperties.*;
// Lucene classes
import org.apache.lucene.search.*;
@@ -143,8 +142,8 @@
// Some initializations ...
{
- Properties prop = loadProperties();
- Properties info = loadInfo();
+ Properties prop = KrillProperties.loadDefaultProperties();
+ Properties info = KrillProperties.loadInfo();
if (info != null) {
this.version = info.getProperty("krill.version");
this.name = info.getProperty("krill.name");
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 96054cc..d17215f 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -16,6 +16,8 @@
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.store.MMapDirectory;
import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.util.KrillProperties;
+
import static de.ids_mannheim.korap.util.KrillProperties.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -27,26 +29,37 @@
* is using the standalone server system,
* this tool may be more suitable for your needs
* (especially as it is way faster).
+ * <br><br>
+ * Input directories should contain files in the json.gz format. Files
+ * of other formats will be skipped or not indexed. The output
+ * directory can be specified in the config file. See
+ * src/main/resources/krill.properties.info to create a config file.
*
- * Usage: java -jar Krill-Indexer.jar [--config propfile]
- * [directories]*
+ * <pre>
+ * Usage:
+ *
+ * java -jar Krill-Indexer.jar -c [propfile] -i [input directories] -o
+ * [output directory]
+ *
+ * java -jar Krill-Indexer.jar --config [propfile] --input [input
+ * directories] --output [output directory]
+ * </pre>
+ *
*
* @author diewald, margaretha
*
*/
public class Indexer {
- KrillIndex index;
- int count;
- int commitCount;
+ private KrillIndex index;
+ private int count;
+ private int commitCount;
- // private static String propFile = "krill.properties";
private static String path = null;
- private static Pattern jsonFilePattern;
+ private Pattern jsonFilePattern;
// Init logger
private final static Logger log = LoggerFactory.getLogger(Indexer.class);
-
/**
* Construct a new indexer object.
*
@@ -55,18 +68,18 @@
* @throws IOException
*/
public Indexer (Properties prop) throws IOException {
- if (this.path == null) {
- this.path = prop.getProperty("krill.indexDir");
+ if (path == null) {
+ path = prop.getProperty("krill.indexDir");
}
- log.info("Output directory: " + this.path);
+ log.info("Output directory: " + path);
// Default to 1000 documents till the next commit
String commitCount = prop.getProperty("krill.index.commit.count",
"1000");
// Create a new index object based on the directory
- this.index = new KrillIndex(new MMapDirectory(Paths.get(this.path)));
+ this.index = new KrillIndex(new MMapDirectory(Paths.get(path)));
this.count = 0;
this.commitCount = Integer.parseInt(commitCount);
@@ -81,16 +94,14 @@
* The {@link File} directory containing
* documents to index.
*/
- public void parse (File dir) {
+ private void parse (File dir) {
Matcher matcher;
for (String file : dir.list()) {
- //log.info("Json file: "+file);
matcher = jsonFilePattern.matcher(file);
if (matcher.find()) {
file = dir.getPath() + '/' + file;
log.info("Adding " + file + " to the index. ");
- // Add file to the index
try {
if (this.index.addDoc(new FileInputStream(file),
true) == null) {
@@ -109,7 +120,8 @@
}
}
else {
- log.warn(file + " does not have json.gz format.");
+ log.warn("Skip " + file
+ + " since it does not have json.gz format.");
}
}
}
@@ -118,16 +130,19 @@
/**
* Commit changes to the index.
*/
- public void commit () {
+ private void commit () {
log.info("Committing index ... ");
try {
this.index.commit();
}
catch (IOException e) {
- log.error("Unable to commit to index " + this.path);
+ log.error("Unable to commit to index " + path);
}
}
+ private void closeIndex() throws IOException{
+ index.close();
+ }
/**
* Main method.
@@ -139,11 +154,11 @@
* @throws IOException
*/
public static void main (String[] argv) throws IOException {
-
+
Options options = new Options();
options.addOption(Option.builder("c").longOpt("config")
.desc("configuration file (defaults to "
- + de.ids_mannheim.korap.util.KrillProperties.propStr
+ + KrillProperties.defaultPropertiesLocation
+ ").")
.hasArg().argName("properties file").required().build());
options.addOption(Option.builder("i").longOpt("inputDir")
@@ -180,7 +195,7 @@
"Krill indexer\n java -jar -c <properties file> -i <input directories> "
+ "[-o <output directory>]",
options);
- System.exit(0);
+ return;
}
catch (ParseException e) {
log.error("Unexpected error: " + e);
@@ -188,23 +203,27 @@
}
// Load properties
- Properties prop = loadProperties(propFile);
+ Properties prop = KrillProperties.loadProperties(propFile);
// Get indexer object
- Indexer ki = new Indexer(prop);
+ Indexer indexer = new Indexer(prop);
// Iterate over list of directories
for (String arg : inputDirectories) {
- log.info("Indexing files in"+arg);
+ log.info("Indexing files in " + arg);
File f = new File(arg);
if (f.isDirectory())
- ki.parse(f);
+ indexer.parse(f);
}
-
+ indexer.closeIndex();
// Final commit
- ki.commit();
log.info("Finished indexing.");
// Finish indexing
- System.out.println("Indexed " + ki.count + " files.");
+ String message = "Indexed " + indexer.count + " file";
+ if (indexer.count > 1) {
+ message += "s";
+ }
+ System.out.print(message + ".");
+
}
}
diff --git a/src/main/java/de/ids_mannheim/korap/server/Node.java b/src/main/java/de/ids_mannheim/korap/server/Node.java
index 058cf33..9291d8b 100644
--- a/src/main/java/de/ids_mannheim/korap/server/Node.java
+++ b/src/main/java/de/ids_mannheim/korap/server/Node.java
@@ -20,6 +20,8 @@
import java.beans.PropertyVetoException;
import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.util.KrillProperties;
+
import org.apache.lucene.store.MMapDirectory;
import static de.ids_mannheim.korap.util.KrillProperties.*;
@@ -96,7 +98,7 @@
};
};
- Properties prop = loadProperties(propFile);
+ Properties prop = KrillProperties.loadProperties(propFile);
// Node properties
if (path != null && path.equals(":memory:")) {
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index 8d2f466..1ba261c 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -2,27 +2,33 @@
import java.util.*;
import java.io.*;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import de.ids_mannheim.korap.Krill;
-// Todo: Properties may be loaded twice - althogh Java may cache automatically
+/**
+ *
+ * Todo: Properties may be loaded twice - although Java may cache automatically
+ *
+ * @author diewald, margaretha
+ *
+ */
public class KrillProperties {
- public static String propStr = "krill.properties";
- private static String infoStr = "krill.info";
+ public static final String defaultPropertiesLocation = "krill.properties";
+ public static final String defaultInfoLocation = "krill.info";
private static Properties prop, info;
// Logger
- private final static Logger log = LoggerFactory.getLogger(Krill.class);
-
+ private final static Logger log = LoggerFactory
+ .getLogger(KrillProperties.class);
// Load properties from file
- public static Properties loadProperties () {
+ public static Properties loadDefaultProperties () {
if (prop != null)
return prop;
- prop = loadProperties(propStr);
+ prop = loadProperties(defaultPropertiesLocation);
return prop;
};
@@ -30,22 +36,23 @@
// Load properties from file
public static Properties loadProperties (String propFile) {
if (propFile == null)
- return loadProperties();
+ return loadDefaultProperties();
InputStream iFile;
try {
iFile = new FileInputStream(propFile);
prop = new Properties();
prop.load(iFile);
+
}
catch (IOException t) {
try {
iFile = KrillProperties.class.getClassLoader()
.getResourceAsStream(propFile);
-
if (iFile == null) {
log.warn(
- "Cannot find {}. Please create it using \"{}.info\" as template.",
+ "Cannot find {}. Please create it using "
+ + "\"src/main/resources/krill.properties.info\" as template.",
propFile, propFile);
return null;
};
@@ -68,10 +75,10 @@
try {
info = new Properties();
InputStream iFile = KrillProperties.class.getClassLoader()
- .getResourceAsStream(infoStr);
+ .getResourceAsStream(defaultInfoLocation);
if (iFile == null) {
- log.error("Cannot find {}.", infoStr);
+ log.error("Cannot find {}.", defaultInfoLocation);
return null;
};
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
new file mode 100644
index 0000000..ea2aea9
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -0,0 +1,112 @@
+package de.ids_mannheim.korap;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.index.Indexer;
+
+/**
+ * @author margaretha
+ *
+ */
+public class TestIndexer {
+ private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
+ private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ private String info = "usage: Krill indexer";
+ private File outputDirectory = new File("test-index");
+
+ @Test
+ public void testArguments () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk" });
+ assertEquals("Indexed 1 file.", outputStream.toString());
+ }
+
+ @Test
+ public void testOutputArgument () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk", "-o", "test-output"});
+ assertEquals("Indexed 1 file.", outputStream.toString());
+ }
+
+ @Test
+ public void testMultipleInputFiles () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/wiki" });
+ assertEquals("Indexed 14 files.", outputStream.toString());
+ }
+
+ @Test
+ public void testMultipleInputDirectories () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i",
+ "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
+ "-o", "test-index" });
+ assertEquals("Indexed 3 files.", outputStream.toString());
+ }
+
+ @Test
+ public void testEmptyArgument () throws IOException {
+ Indexer.main(new String[] {});
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+
+ @Test
+ public void testMissingConfig () throws IOException {
+ Indexer.main(new String[] { "-i", "src/test/resources/bzk",
+ "-o test-index" });
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+ @Test
+ public void testMissingInput () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-o", "test-index" });
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+ @Before
+ public void setOutputStream () {
+ System.setOut(new PrintStream(outputStream));
+ }
+
+ @After
+ public void cleanOutputStream () {
+ System.setOut(null);
+ }
+
+ @Before
+ public void cleanOutputDirectory () {
+
+ if (outputDirectory.exists()) {
+ logger.debug("Output directory exists");
+ deleteFile(outputDirectory);
+ }
+ }
+
+ private void deleteFile (File path) {
+ if (path.isDirectory()) {
+ File file;
+ for (String filename : path.list()) {
+ file = new File(path + "/" + filename);
+ deleteFile(file);
+ logger.debug(file.getAbsolutePath());
+ }
+ }
+ path.delete();
+ }
+}
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestSpanRelationQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestSpanRelationQueryJSON.java
index 8be9a79..3122b4d 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestSpanRelationQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestSpanRelationQueryJSON.java
@@ -70,7 +70,7 @@
@Test
- public void testMatchBothRelationNodeWithAttribute ()
+ public void testMatchBothRelationNodesWithAttribute ()
throws QueryException {
String filepath = getClass()
.getResource(
diff --git a/src/test/resources/krill.properties b/src/test/resources/krill.properties
index f9cbc50..e14926b 100644
--- a/src/test/resources/krill.properties
+++ b/src/test/resources/krill.properties
@@ -1,2 +1,3 @@
krill.version = ${project.version}
krill.name = ${project.name}
+krill.indexDir = test-output
\ No newline at end of file