Upsert documents instead of adding them by default in the indexing phase

Change-Id: Ia7fccd59ce1b6a114c8e59da69f3d132a56bc7d2
diff --git a/Changes b/Changes
index b7d9b32..b0aab67 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.58.5 2019-03-04
+0.58.5 2019-03-06
     - [bugfix] Fix bug where duplicate keys occured in
       field data output (diewald)
     - [bugfix] Fix bug where fields already set where lifted
@@ -8,6 +8,9 @@
       by adding indexCreationDate and indexLastModified field
       (diewald)
     - [bugfix] Fixed #50 multiple timeout warnings (margaretha) 
+    - [feature] Instead of adding, the Indexer now upserts documents
+      to avoid multiple documents with the same text sigle
+      (diewald)
 
 0.58.4 2019-02-05
     - [cleanup] Remove deprecated methods setLicense/getLicense,
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 7c9a19d..540db59 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -485,6 +485,20 @@
 
         return this.addDoc(doc);
     };
+
+
+    /**
+     * Update a document in the index as a {@link FieldDocument}
+     * if it already exists (based on the textSigle), otherwise
+     * insert it into the index.
+     * 
+     * @param json The JSON document to add to the index, as a stream.
+     * @param gzip Forwarded to _fromFile; presumably flags gzipped input (TODO confirm).
+     * @return The {@link FieldDocument}.
+     */
+    public FieldDocument upsertDoc (InputStream json, boolean gzip) {
+        return this.upsertDoc(_fromFile(json, gzip));
+    };  
     
 
     /**
@@ -622,7 +636,7 @@
     public FieldDocument addDoc (InputStream json, boolean gzip) {
         return this.addDoc(_fromFile(json, gzip));
     };
-
+    
 
     /**
      * Add a document to the index as a JSON string
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 0967860..005ac2e 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -57,6 +57,7 @@
     private int commitCount;
 
     private static String path = null;
+    private static boolean addInsteadOfUpsert = false;
     private Pattern jsonFilePattern;
 
     // Init logger
@@ -103,14 +104,24 @@
             matcher = jsonFilePattern.matcher(file);
             if (matcher.find()) {
                 file = dir.getPath() + '/' + file;
-                log.info("Adding " + file + " to the index. ");
 
                 try {
-                    if (this.index.addDoc(new FileInputStream(file),
-                            true) == null) {
-                        log.warn("fail.");
-                        continue;
+                    if (addInsteadOfUpsert) {
+                        log.info("Add " + file + " to the index. ");
+                        if (this.index.addDoc(new FileInputStream(file),
+                                              true) == null) {
+                            log.warn("fail.");
+                            continue;
+                        }
                     }
+                    else {
+                        log.info("Add or update " + file + " to the index. ");
+                        if (this.index.upsertDoc(new FileInputStream(file),
+                                                 true) == null) {
+                            log.warn("fail.");
+                            continue;
+                        };
+                    };
                     this.count++;
                     if (DEBUG){
                         log.debug("Finished adding files. (" + count + ").");
@@ -175,14 +186,17 @@
                 .desc("index output directory (defaults to "
                         + "krill.indexDir in the configuration.")
                 .hasArg().argName("output directory").build());
+        options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
+                .desc("Always add files to the index, never update")
+                .build());
 
+        
         CommandLineParser parser = new DefaultParser();
 
         String propFile = null;
         String[] inputDirectories = null;
         try {
             CommandLine cmd = parser.parse(options, argv);
-
             log.info("Configuration file: " + cmd.getOptionValue("c"));
             propFile = cmd.getOptionValue("c");
             log.info("Input directories: "
@@ -193,12 +207,16 @@
                 log.info("Output directory: " + cmd.getOptionValue("o"));
                 path = cmd.getOptionValue("o");
             }
+
+            // The flag is static: assign it on every run so that an
+            // earlier call with -a cannot leak into later calls.
+            addInsteadOfUpsert = cmd.hasOption("a");
         }
         catch (MissingOptionException e) {
             HelpFormatter formatter = new HelpFormatter();
             formatter.printHelp(
                     "Krill indexer\n java -jar -c <properties file> -i <input directories> "
-                            + "[-o <output directory>]",
+                            + "[-o <output directory> -a]",
                     options);
             return;
         }
@@ -224,7 +242,10 @@
         // Final commit
         log.info("Finished indexing.");
         // Finish indexing
-        String message = "Indexed " + indexer.count + " file";
+        String message = "Added ";
+        if (!addInsteadOfUpsert)
+            message += "or updated ";
+        message += indexer.count + " file";
         if (indexer.count > 1) {
             message += "s";
         }
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index 52a0094..d0925e4 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -29,30 +29,42 @@
     public void testArguments () throws IOException {

         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

                 "-i", "src/test/resources/bzk" });

-        assertEquals("Indexed 1 file.", outputStream.toString());

+        assertEquals("Added or updated 1 file.", outputStream.toString());

     }

 

     @Test

     public void testOutputArgument () throws IOException {

         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

                 "-i", "src/test/resources/bzk", "-o", "test-output"});

-        assertEquals("Indexed 1 file.", outputStream.toString());

+        assertEquals("Added or updated 1 file.", outputStream.toString());

     }

 

     @Test

     public void testMultipleInputFiles () throws IOException {

         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

                 "-i", "src/test/resources/wiki" });

-        assertEquals("Indexed 17 files.", outputStream.toString());

+        assertEquals("Added or updated 17 files.", outputStream.toString());

     }

 

+

+    @Test

+    public void testAdding () throws IOException {

+        Indexer.main(new String[] {

+                "-c", "src/test/resources/krill.properties",

+                "-i", "src/test/resources/bzk",

+                "-a" }); // -a: always add, never update

+        logger.info(outputStream.toString());

+        assertEquals("Added 1 file.", outputStream.toString());

+    }

+

+    

     @Test

     public void testMultipleInputDirectories () throws IOException {

         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

                 "-i",

                 "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",

                 "-o", "test-index" });

-        assertEquals("Indexed 5 files.", outputStream.toString());

+        assertEquals("Added or updated 5 files.", outputStream.toString());

     }

 

     @Test

@@ -70,7 +82,7 @@
         logger.info(outputStream.toString());

         assertEquals(true, outputStream.toString().startsWith(info));

     }

-

+    

     @Test

     public void testMissingInput () throws IOException {

         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

diff --git a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
index 802a319..923fec1 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
@@ -664,7 +664,11 @@
         assertEquals(mfs.getFieldValue("content"), "Example3");
 
         assertEquals(ki.numberOf("documents"), 2);
-        
+
+        // Test Inputstream method
+        ki.upsertDoc(getClass().getResourceAsStream("/wiki/WPD17-H81-63495.json.gz"), true);
+        ki.commit();
+        assertEquals(ki.numberOf("documents"), 3);
     };