Add delete option to Krill-Indexer

Change-Id: I5cbc39ce0bdeb6d155043e3ff5a3c856c1a2e75b
diff --git a/Changes b/Changes
index 606d9aa..d7247ac 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
     - [performance] Add leaf cache. (diewald)
     - [bugfix] Fix fingerprinter (wasn't threadsafe; diewald)
+    - [feature] Add --delete option to Krill-Indexer (diewald)
 
 0.64.5 2025-12-03
     - [maintenance] Update to Java 21 (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index c2083c6..44826a5 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -394,12 +394,16 @@
                         + "<filename>.json.gz files, zip files containing .json or .json.gz files, "
                         + "or tar files (including .tar.gz) containing .json or .json.gz files. "
                         + "The indexer will automatically detect the type.")
-                .hasArgs().argName("input paths").required()
+                .hasArgs().argName("input paths")
                 .valueSeparator(Character.valueOf(';')).build());
         options.addOption(Option.builder("o").longOpt("outputDir")
                 .desc("index output directory (defaults to "
                         + "krill.indexDir in the configuration.")
                 .hasArg().argName("output directory").build());
+        options.addOption(Option.builder("D").longOpt("delete")
+                .desc("delete documents from the index by field and value "
+                        + "(example: -D textSigle GOE/AGX/00002).")
+                .numberOfArgs(2).argName("field value").build());
         options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
                 .desc("Always add files to the index, never update")
                 .build());
@@ -411,21 +415,36 @@
 
         String propFile = null;
         String[] inputPaths = null;
+        String deleteField = null;
+        String deleteValue = null;
         boolean showProgress = false;
         try {
             CommandLine cmd = parser.parse(options, argv);
             log.info("Configuration file: " + cmd.getOptionValue("c"));
             propFile = cmd.getOptionValue("c");
-            
-            log.info("Input paths: "
-                    + StringUtils.join(cmd.getOptionValues("i"), ";"));
-            inputPaths = cmd.getOptionValues("i");
+
+            if (cmd.hasOption("i")) {
+                inputPaths = cmd.getOptionValues("i");
+                log.info("Input paths: " + StringUtils.join(inputPaths, ";"));
+            }
 
             if (cmd.hasOption("o")) {
                 log.info("Output directory: " + cmd.getOptionValue("o"));
                 path = cmd.getOptionValue("o");
             }
 
+            if (cmd.hasOption("D")) {
+                String[] deleteArgs = cmd.getOptionValues("D");
+                deleteField = deleteArgs[0];
+                deleteValue = deleteArgs[1];
+                log.info("Delete documents with {}={}", deleteField, deleteValue);
+            }
+
+            if (inputPaths == null && !cmd.hasOption("D")) {
+                throw new MissingOptionException(
+                        "Missing required option: either -i/--input or -D/--delete");
+            }
+
             if (cmd.hasOption("a")) {
                 addInsteadOfUpsert = true;
             };
@@ -435,9 +454,11 @@
         }
         catch (MissingOptionException e) {
             HelpFormatter formatter = new HelpFormatter();
+            String helpSyntax = "Krill indexer\n java -jar Krill-Indexer.jar -c <properties file> "
+                    + "[-i <input paths>] [-D <field> <value>] "
+                    + "[-o <output directory> -a --progress]";
             formatter.printHelp(
-                    "Krill indexer\n java -jar -c <properties file> -i <input paths> "
-                            + "[-o <output directory> -a --progress]",
+                    helpSyntax,
                     options);
             return;
         }
@@ -460,7 +481,7 @@
             }
 
             // Initialize progress if requested
-            if (showProgress) {
+            if (showProgress && inputPaths != null) {
                 long total = countTargetFiles(inputPaths);
                 if (total > 0) {
                     indexer.initProgress(total);
@@ -468,41 +489,54 @@
             }
 
             // Iterate over list of input paths (auto-detect directories vs zip/tar files)
-            for (String arg : inputPaths) {
-                File f = new File(arg);
-                
-                if (f.isDirectory()) {
-                    log.info("Indexing files in directory " + arg);
-                    indexer.parse(f);
-                }
-                else if (f.isFile() && f.getName().toLowerCase().endsWith(".zip")) {
-                    log.info("Indexing files in zip " + arg);
-                    indexer.parseZip(f);
-                }
-                else if (f.isFile() && (f.getName().toLowerCase().endsWith(".tar") || 
-                                       f.getName().toLowerCase().endsWith(".tar.gz") ||
-                                       f.getName().toLowerCase().endsWith(".tgz"))) {
-                    log.info("Indexing files in tar " + arg);
-                    indexer.parseTar(f);
-                }
-                else {
-                    log.warn("Skipping " + arg + " - not a valid directory, zip file, or tar file");
+            if (inputPaths != null) {
+                for (String arg : inputPaths) {
+                    File f = new File(arg);
+                    
+                    if (f.isDirectory()) {
+                        log.info("Indexing files in directory " + arg);
+                        indexer.parse(f);
+                    }
+                    else if (f.isFile() && f.getName().toLowerCase().endsWith(".zip")) {
+                        log.info("Indexing files in zip " + arg);
+                        indexer.parseZip(f);
+                    }
+                    else if (f.isFile() && (f.getName().toLowerCase().endsWith(".tar") || 
+                                           f.getName().toLowerCase().endsWith(".tar.gz") ||
+                                           f.getName().toLowerCase().endsWith(".tgz"))) {
+                        log.info("Indexing files in tar " + arg);
+                        indexer.parseTar(f);
+                    }
+                    else {
+                        log.warn("Skipping " + arg + " - not a valid directory, zip file, or tar file");
+                    }
                 }
             }
+
+            if (deleteField != null && deleteValue != null) {
+                indexer.index.delDocs(deleteField, deleteValue);
+            }
+
             indexer.finishProgress();
             indexer.closeIndex();
 
             // Final commit
             log.info("Finished indexing.");
-            // Finish indexing
-            String message = "Added ";
-            if (!addInsteadOfUpsert)
-                message += "or updated ";
-            message += indexer.count + " file";
-            if (indexer.count > 1) {
-                message += "s";
+            // Finish indexing/deletion
+            if (inputPaths != null) {
+                String message = "Added ";
+                if (!addInsteadOfUpsert)
+                    message += "or updated ";
+                message += indexer.count + " file";
+                if (indexer.count > 1) {
+                    message += "s";
+                }
+                System.out.println(message + ".");
             }
-            System.out.println(message + ".");
+            else {
+                System.out.println("Deleted documents where " + deleteField + "="
+                        + deleteValue + ".");
+            }
         }
 
         catch (IOException e) {
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index 7b568ae..9f0390c 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -7,9 +7,13 @@
 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.io.PrintStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.zip.GZIPOutputStream;
 
 import org.junit.After;
 import org.junit.AfterClass;
@@ -293,6 +297,38 @@
         assertEquals(0L, invalidZip);
     }
 
+    @Test
+    public void testDeleteByTextSigleOption () throws IOException {
+        // Index one gzipped document, then remove it again through -D.
+        Path inputDir = Files.createTempDirectory(tempBaseDirectory.toPath(),
+                "delete-input");
+        Path jsonPath = Paths.get("src/test/resources/goe/AGX-00002.json");
+        Path gzPath = inputDir.resolve("AGX-00002.json.gz");
+        gzipFile(jsonPath, gzPath);
+
+        String outputDir = getTestOutputPath("test-delete-index");
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                "-i", inputDir.toString(), "-o", outputDir });
+        assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+
+        outputStream.reset();
+
+        KrillIndex ki = new KrillIndex(Paths.get(outputDir));
+        assertEquals(1, ki.numberOf("documents"));
+        // Release this handle before Indexer.main runs again on the same directory.
+        ki.close();
+
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                "-o", outputDir, "-D", "textSigle", "GOE_AGX.00002" });
+        assertTrue(outputStream.toString()
+                .startsWith("Deleted documents where textSigle=GOE_AGX.00002."));
+
+        // Reopen the index to check the document count after the deletion run.
+        ki = new KrillIndex(Paths.get(outputDir));
+        assertEquals(0, ki.numberOf("documents"));
+        ki.close();
+    }
+
     @Before
     public void setOutputStream () {
         System.setOut(new PrintStream(outputStream));
@@ -322,4 +358,16 @@
         }
         path.delete();
     }
+
+    /**
+     * Gzip-compresses the file at input into a new file at output.
+     * Both streams are closed by the try-with-resources statement.
+     */
+    private static void gzipFile (Path input, Path output) throws IOException {
+        try (InputStream in = Files.newInputStream(input);
+                OutputStream out = new GZIPOutputStream(
+                        Files.newOutputStream(output))) {
+            in.transferTo(out);
+        }
+    }
 }