Upsert documents instead of adding them by default in the indexing phase
Change-Id: Ia7fccd59ce1b6a114c8e59da69f3d132a56bc7d2
diff --git a/Changes b/Changes
index b7d9b32..b0aab67 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.58.5 2019-03-04
+0.58.5 2019-03-06
- [bugfix] Fix bug where duplicate keys occured in
field data output (diewald)
- [bugfix] Fix bug where fields already set where lifted
@@ -8,6 +8,9 @@
by adding indexCreationDate and indexLastModified field
(diewald)
- [bugfix] Fixed #50 multiple timeout warnings (margaretha)
+ - [feature] Instead of adding, the Indexer now upserts documents
+ to avoid multiple documents with the same text sigle
+ (diewald)
0.58.4 2019-02-05
- [cleanup] Remove deprecated methods setLicense/getLicense,
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 7c9a19d..540db59 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -485,6 +485,20 @@
return this.addDoc(doc);
};
+
+
+ /**
+ * Update a document in the index as a {@link FieldDocument}
+ * if it already exists (based on the textSigle), otherwise
+ * insert it into the index.
+ *
+ * The stream is converted with {@code _fromFile(json, gzip)} and the
+ * result is handed to the single-argument {@code upsertDoc} overload.
+ *
+ * @param json
+ * The input stream of the JSON document to add or update.
+ * @param gzip
+ * Passed through to {@code _fromFile}; presumably indicates
+ * whether the stream is gzip-compressed (TODO confirm).
+ * @return The {@link FieldDocument} returned by the delegated
+ * {@code upsertDoc} call. NOTE(review): the Indexer checks
+ * this result for {@code null} - confirm when that occurs.
+ */
+ public FieldDocument upsertDoc (InputStream json, boolean gzip) {
+ return this.upsertDoc(_fromFile(json, gzip));
+ };
/**
@@ -622,7 +636,7 @@
public FieldDocument addDoc (InputStream json, boolean gzip) {
return this.addDoc(_fromFile(json, gzip));
};
-
+
/**
* Add a document to the index as a JSON string
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 0967860..005ac2e 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -57,6 +57,7 @@
private int commitCount;
private static String path = null;
+ private static boolean addInsteadOfUpsert = false;
private Pattern jsonFilePattern;
// Init logger
@@ -103,14 +104,24 @@
matcher = jsonFilePattern.matcher(file);
if (matcher.find()) {
file = dir.getPath() + '/' + file;
- log.info("Adding " + file + " to the index. ");
try {
- if (this.index.addDoc(new FileInputStream(file),
- true) == null) {
- log.warn("fail.");
- continue;
+ if (addInsteadOfUpsert) {
+ log.info("Add " + file + " to the index. ");
+ if (this.index.addDoc(new FileInputStream(file),
+ true) == null) {
+ log.warn("fail.");
+ continue;
+ }
}
+ else {
+ log.info("Add or update " + file + " to the index. ");
+ if (this.index.upsertDoc(new FileInputStream(file),
+ true) == null) {
+ log.warn("fail.");
+ continue;
+ };
+ };
this.count++;
if (DEBUG){
log.debug("Finished adding files. (" + count + ").");
@@ -175,14 +186,17 @@
.desc("index output directory (defaults to "
+ "krill.indexDir in the configuration.")
.hasArg().argName("output directory").build());
+ options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
+ .desc("Always add files to the index, never update")
+ .build());
+
CommandLineParser parser = new DefaultParser();
String propFile = null;
String[] inputDirectories = null;
try {
CommandLine cmd = parser.parse(options, argv);
-
log.info("Configuration file: " + cmd.getOptionValue("c"));
propFile = cmd.getOptionValue("c");
log.info("Input directories: "
@@ -193,12 +207,16 @@
log.info("Output directory: " + cmd.getOptionValue("o"));
path = cmd.getOptionValue("o");
}
+
+ if (cmd.hasOption("a")) {
+ addInsteadOfUpsert = true;
+ };
}
catch (MissingOptionException e) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(
"Krill indexer\n java -jar -c <properties file> -i <input directories> "
- + "[-o <output directory>]",
+ + "[-o <output directory> -a]",
options);
return;
}
@@ -224,7 +242,10 @@
// Final commit
log.info("Finished indexing.");
// Finish indexing
- String message = "Indexed " + indexer.count + " file";
+ String message = "Added ";
+ if (!addInsteadOfUpsert)
+ message += "or updated ";
+ message += indexer.count + " file";
if (indexer.count > 1) {
message += "s";
}
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index 52a0094..d0925e4 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -29,30 +29,42 @@
public void testArguments () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/bzk" });
- assertEquals("Indexed 1 file.", outputStream.toString());
+ assertEquals("Added or updated 1 file.", outputStream.toString());
}
@Test
public void testOutputArgument () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/bzk", "-o", "test-output"});
- assertEquals("Indexed 1 file.", outputStream.toString());
+ assertEquals("Added or updated 1 file.", outputStream.toString());
}
@Test
public void testMultipleInputFiles () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/wiki" });
- assertEquals("Indexed 17 files.", outputStream.toString());
+ assertEquals("Added or updated 17 files.", outputStream.toString());
}
+
+ @Test
+ public void testAdding () throws IOException {
+ // With -a the indexer always adds and never updates, so the
+ // summary message must not mention updates
+ Indexer.main(new String[] {
+ "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk",
+ "-a" });
+ logger.info(outputStream.toString());
+ // Expected value first, as in the other tests of this class,
+ // so a failure reports expected and actual the right way round
+ assertEquals("Added 1 file.", outputStream.toString());
+ }
+
+
@Test
public void testMultipleInputDirectories () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i",
"src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
"-o", "test-index" });
- assertEquals("Indexed 5 files.", outputStream.toString());
+ assertEquals("Added or updated 5 files.", outputStream.toString());
}
@Test
@@ -70,7 +82,7 @@
logger.info(outputStream.toString());
assertEquals(true, outputStream.toString().startsWith(info));
}
-
+
@Test
public void testMissingInput () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
index 802a319..923fec1 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
@@ -664,7 +664,11 @@
assertEquals(mfs.getFieldValue("content"), "Example3");
assertEquals(ki.numberOf("documents"), 2);
-
+
+ // Test Inputstream method
+ ki.upsertDoc(getClass().getResourceAsStream("/wiki/WPD17-H81-63495.json.gz"), true);
+ ki.commit();
+ assertEquals(ki.numberOf("documents"), 3);
};