Renamed and cleaned up KorapIndexer
diff --git a/Changes b/Changes
index b679679..781c546 100644
--- a/Changes
+++ b/Changes
@@ -2,14 +2,16 @@
- [feature] Deserialization of arbitrary elements with attributes (margaretha)
- [cleanup] Extract KrillMeta from Krill,
rename KorapQuery to KrillQuery,
- extract QueryBuilder from KrillQuery (diewald)
+ extract QueryBuilder from KrillQuery,
+ rename KorapIndexer to index/Indexer (diewald)
+ - [documentation] Improved documentation for API classes (diewald)
0.50 2015-02-23
- Project name is now "Krill"
0.49.4 2015-02-20
- - [documentation] Improved documentation for API classes,
- improved test coverage for utility classes (diewald)
+ - [documentation] Improved documentation for API classes (diewald)
+ - [cleanup] Improved test coverage for utility classes (diewald)
- [performance] Updated Lucene dependency from 4.5.1 to 4.10.3,
Updated Jackson dependency from 2.4.0 to 2.4.4,
Updated Jersey dependency from 2.4.1 to 2.15 (diewald)
diff --git a/pom.xml b/pom.xml
index 56b7e06..5d4440a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -235,7 +235,7 @@
<archive>
<manifest>
<addClasspath>true</addClasspath>
- <mainClass>de.ids_mannheim.korap.KorapIndexer</mainClass>
+ <mainClass>de.ids_mannheim.korap.index.Indexer</mainClass>
</manifest>
</archive>
<appendAssemblyId>false</appendAssemblyId>
@@ -245,16 +245,16 @@
</configuration>
<executions>
<execution>
- <id>KorapIndexer</id>
+ <id>Indexer</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
- <finalName>KorapIndexer</finalName>
+ <finalName>Indexer</finalName>
<archive>
<manifest>
- <mainClass>de.ids_mannheim.korap.KorapIndexer</mainClass>
+ <mainClass>de.ids_mannheim.korap.index.Indexer</mainClass>
</manifest>
</archive>
</configuration>
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndexer.java b/src/main/java/de/ids_mannheim/korap/KorapIndexer.java
deleted file mode 100644
index b935db8..0000000
--- a/src/main/java/de/ids_mannheim/korap/KorapIndexer.java
+++ /dev/null
@@ -1,86 +0,0 @@
-package de.ids_mannheim.korap;
-import java.util.*;
-import java.io.*;
-import org.apache.lucene.store.MMapDirectory;
-import de.ids_mannheim.korap.KorapIndex;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class KorapIndexer {
- KorapIndex index;
- String indexDir;
- int count;
- int commitCount;
-
- // Init logger
- private final static Logger log = LoggerFactory.getLogger(KorapIndexer.class);
-
- public KorapIndexer(Properties prop) throws IOException {
- this.indexDir = prop.getProperty("lucene.indexDir");
-
- System.out.println("Index to " + this.indexDir);
-
- String commitCount = prop.getProperty("lucene.index.commit.count", "1000");
-
- this.index = new KorapIndex(new MMapDirectory(new File(indexDir)));
- this.count = 0;
- this.commitCount = Integer.parseInt(commitCount);
- };
-
-
- public void parse (File dir) {
- for (String file : dir.list()) {
- if (file.matches("^[^\\.].+?\\.json\\.gz$")) {
- String found = dir.getPath() + '/' + file;
- System.out.print(" Index " + found + " ... ");
- if (this.index.addDocFile(found, true) == null) {
- System.out.println("fail.");
- continue;
- };
- System.out.println("done (" + count + ").");
- this.count++;
-
- if ((this.count % this.commitCount) == 0)
- this.commit();
- };
- };
- };
-
-
- public void commit () {
- System.out.println("-----");
- System.out.print(" Commit ... ");
- try {
- this.index.commit();
- }
- catch (IOException e) {
- System.err.println("Unable to commit to index " + this.indexDir);
- };
- System.out.println("done.");
- };
-
-
-
- public static void main (String[] argv) throws IOException {
- Properties prop = new Properties();
- InputStream fr = new FileInputStream(argv[0]);
- prop.load(fr);
- KorapIndexer ki = new KorapIndexer(prop);
- System.out.println();
-
- for (String arg : Arrays.copyOfRange(argv, 1, argv.length)) {
- File f = new File(arg);
- if (f.isDirectory())
- ki.parse(f);
- };
-
-
- // Final commit
- ki.commit();
-
- // Finish indexing
- System.out.println("-----");
- System.out.println(" Indexed " + ki.count + " files.");
- System.out.println();
- };
-};
diff --git a/src/main/java/de/ids_mannheim/korap/KrillQuery.java b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
index 01385c7..d7b68dc 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
@@ -29,9 +29,11 @@
* @author diewald
*/
/*
- Todo: Use full-blown jsonld processor
+ TODO: Merge this with SpanQueryWrapper
- Todo: All queries with a final right expansion
+ TODO: Use full-blown jsonld processor
+
+ TODO: All queries with a final right expansion
e.g. der alte []
should be wrapped in a contains(<base/s=t>) to ensure
they are not outside the text.
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
new file mode 100644
index 0000000..bfa15f2
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -0,0 +1,162 @@
+package de.ids_mannheim.korap.index;
+import java.util.*;
+import java.io.*;
+import org.apache.lucene.store.MMapDirectory;
+import de.ids_mannheim.korap.KorapIndex;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a runnable indexer tool for
+ * Krill. Although the preferred index method
+ * is using the standalone server system,
+ * this tool may be more suitable for your needs
+ * (especially as it is way faster).
+ *
+ * Usage: java -jar Krill-X.XX.jar [propfile] [directories]*
+ */
+public class Indexer {
+    KorapIndex index;
+    String indexDir;
+    int count;
+    int commitCount;
+
+    // Init logger for this class
+    private final static Logger log =
+        LoggerFactory.getLogger(Indexer.class);
+
+
+    /**
+     * Construct a new indexer object.
+     *
+     * @param prop A {@link Properties} object with
+     *        at least the following information:
+     *        <tt>lucene.indexDir</tt>.
+     *        The optional <tt>lucene.index.commit.count</tt>
+     *        sets the number of documents per commit
+     *        (defaults to 1000).
+     * @throws IOException
+     */
+    public Indexer (Properties prop) throws IOException {
+        this.indexDir = prop.getProperty("lucene.indexDir");
+
+        System.out.println("Index to " + this.indexDir);
+
+        // Default to 1000 documents till the next commit
+        String commitCount = prop.getProperty("lucene.index.commit.count", "1000");
+
+        // Create a new index object based on the directory
+        this.index = new KorapIndex(new MMapDirectory(new File(indexDir)));
+        this.count = 0;
+        this.commitCount = Integer.parseInt(commitCount);
+    };
+
+    /**
+     * Parse a directory for document files.
+     * Files are skipped unless their name ends in
+     * <tt>.json.gz</tt> and does not start with a dot.
+     *
+     * @param dir The {@link File} directory containing
+     *        documents to index.
+     */
+    public void parse (File dir) {
+
+        // File#list() returns null if dir is not a
+        // directory or an I/O error occurs
+        String[] files = dir.list();
+        if (files == null) {
+            System.err.println("Unable to read directory " + dir.getPath());
+            return;
+        };
+
+        for (String file : files) {
+            if (file.matches("^[^\\.].+?\\.json\\.gz$")) {
+                String found = dir.getPath() + '/' + file;
+                System.out.print(" Index " + found + " ... ");
+
+                // Add file to the index
+                if (this.index.addDocFile(found, true) == null) {
+                    System.out.println("fail.");
+                    continue;
+                };
+                System.out.println("done (" + count + ").");
+                this.count++;
+
+                // Commit in case the commit count is reached
+                if ((this.count % this.commitCount) == 0)
+                    this.commit();
+            };
+        };
+    };
+
+    /**
+     * Commit changes to the index.
+     * Prints <tt>done.</tt> only if the commit succeeded.
+     */
+    public void commit () {
+        System.out.println("-----");
+        System.out.print(" Commit ... ");
+        try {
+            this.index.commit();
+            System.out.println("done.");
+        }
+        catch (IOException e) {
+            System.out.println("fail.");
+            System.err.println("Unable to commit to index " + this.indexDir);
+        };
+    };
+
+    /**
+     * Main method.
+     *
+     * @param argv Argument list,
+     *        expecting the properties file
+     *        and a list of directories
+     * @throws IOException
+     */
+    public static void main (String[] argv) throws IOException {
+        Properties prop = new Properties();
+
+        // Needs at least 2 parameters
+        if (argv.length < 2) {
+
+            String jar = new File(Indexer.class.getProtectionDomain()
+                                  .getCodeSource()
+                                  .getLocation()
+                                  .getPath()).getName();
+            System.out.println("Usage: java -jar " + jar +
+                               " [propfile] [directories]*");
+            return;
+        };
+
+        // Load properties and release the file handle afterwards
+        InputStream fr = new FileInputStream(argv[0]);
+        try {
+            prop.load(fr);
+        }
+        finally {
+            fr.close();
+        };
+
+        // Get indexer object
+        Indexer ki = new Indexer(prop);
+
+        // Empty line
+        System.out.println();
+
+        // Iterate over list of directories
+        for (String arg : Arrays.copyOfRange(argv, 1, argv.length)) {
+            File f = new File(arg);
+            if (f.isDirectory())
+                ki.parse(f);
+        };
+
+        // Final commit
+        ki.commit();
+
+        // Finish indexing
+        System.out.println("-----");
+        System.out.println(" Indexed " + ki.count + " files.");
+        System.out.println();
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
index 1f99fd6..dd6bac6 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
@@ -15,6 +15,10 @@
* This class is meant to be extended by
* wrapper classes.
*
+ * <strong>Warning</strong>: SpanQueryWrapper
+ * will probably be merged with {@link KrillQuery}
+ * in the near future. Use of this API is at your own risk.
+ *
* @author diewald
*/
public class SpanQueryWrapper {