Add delete option to Krill-Indexer
Change-Id: I5cbc39ce0bdeb6d155043e3ff5a3c856c1a2e75b
diff --git a/Changes b/Changes
index 606d9aa..d7247ac 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
- [performance] Add leaf cache. (diewald)
- [bugfix] Fix fingerprinter (wasn't threadsafe; diewald)
+ - [feature] Add --delete option to Krill-Indexer (diewald)
0.64.5 2025-12-03
- [maintenance] Update to Java 21 (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index c2083c6..44826a5 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -394,12 +394,16 @@
+ "<filename>.json.gz files, zip files containing .json or .json.gz files, "
+ "or tar files (including .tar.gz) containing .json or .json.gz files. "
+ "The indexer will automatically detect the type.")
- .hasArgs().argName("input paths").required()
+ .hasArgs().argName("input paths")
.valueSeparator(Character.valueOf(';')).build());
options.addOption(Option.builder("o").longOpt("outputDir")
.desc("index output directory (defaults to "
+ "krill.indexDir in the configuration.")
.hasArg().argName("output directory").build());
+ options.addOption(Option.builder("D").longOpt("delete")
+ .desc("delete documents from the index by field and value "
+ + "(example: -D textSigle GOE/AGX/00002).")
+ .numberOfArgs(2).argName("field value").build());
options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
.desc("Always add files to the index, never update")
.build());
@@ -411,21 +415,36 @@
String propFile = null;
String[] inputPaths = null;
+ String deleteField = null;
+ String deleteValue = null;
boolean showProgress = false;
try {
CommandLine cmd = parser.parse(options, argv);
log.info("Configuration file: " + cmd.getOptionValue("c"));
propFile = cmd.getOptionValue("c");
-
- log.info("Input paths: "
- + StringUtils.join(cmd.getOptionValues("i"), ";"));
- inputPaths = cmd.getOptionValues("i");
+
+ if (cmd.hasOption("i")) {
+ inputPaths = cmd.getOptionValues("i");
+ log.info("Input paths: " + StringUtils.join(inputPaths, ";"));
+ }
if (cmd.hasOption("o")) {
log.info("Output directory: " + cmd.getOptionValue("o"));
path = cmd.getOptionValue("o");
}
+ if (cmd.hasOption("D")) {
+ String[] deleteArgs = cmd.getOptionValues("D");
+ deleteField = deleteArgs[0];
+ deleteValue = deleteArgs[1];
+ log.info("Delete documents with {}={}", deleteField, deleteValue);
+ }
+
+ if (inputPaths == null && !cmd.hasOption("D")) {
+ throw new MissingOptionException(
+ "Missing required option: either -i/--input or -D/--delete");
+ }
+
if (cmd.hasOption("a")) {
addInsteadOfUpsert = true;
};
@@ -435,9 +454,11 @@
}
catch (MissingOptionException e) {
HelpFormatter formatter = new HelpFormatter();
+ String helpSyntax = "Krill indexer\n java -jar Krill-Indexer.jar -c <properties file> "
+ + "[-i <input paths>] [-D <field> <value>] "
+ + "[-o <output directory> -a --progress]";
formatter.printHelp(
- "Krill indexer\n java -jar -c <properties file> -i <input paths> "
- + "[-o <output directory> -a --progress]",
+ helpSyntax,
options);
return;
}
@@ -460,7 +481,7 @@
}
// Initialize progress if requested
- if (showProgress) {
+ if (showProgress && inputPaths != null) {
long total = countTargetFiles(inputPaths);
if (total > 0) {
indexer.initProgress(total);
@@ -468,41 +489,54 @@
}
// Iterate over list of input paths (auto-detect directories vs zip/tar files)
- for (String arg : inputPaths) {
- File f = new File(arg);
-
- if (f.isDirectory()) {
- log.info("Indexing files in directory " + arg);
- indexer.parse(f);
- }
- else if (f.isFile() && f.getName().toLowerCase().endsWith(".zip")) {
- log.info("Indexing files in zip " + arg);
- indexer.parseZip(f);
- }
- else if (f.isFile() && (f.getName().toLowerCase().endsWith(".tar") ||
- f.getName().toLowerCase().endsWith(".tar.gz") ||
- f.getName().toLowerCase().endsWith(".tgz"))) {
- log.info("Indexing files in tar " + arg);
- indexer.parseTar(f);
- }
- else {
- log.warn("Skipping " + arg + " - not a valid directory, zip file, or tar file");
+ if (inputPaths != null) {
+ for (String arg : inputPaths) {
+ File f = new File(arg);
+
+ if (f.isDirectory()) {
+ log.info("Indexing files in directory " + arg);
+ indexer.parse(f);
+ }
+ else if (f.isFile() && f.getName().toLowerCase().endsWith(".zip")) {
+ log.info("Indexing files in zip " + arg);
+ indexer.parseZip(f);
+ }
+ else if (f.isFile() && (f.getName().toLowerCase().endsWith(".tar") ||
+ f.getName().toLowerCase().endsWith(".tar.gz") ||
+ f.getName().toLowerCase().endsWith(".tgz"))) {
+ log.info("Indexing files in tar " + arg);
+ indexer.parseTar(f);
+ }
+ else {
+ log.warn("Skipping " + arg + " - not a valid directory, zip file, or tar file");
+ }
}
}
+
+ if (deleteField != null && deleteValue != null) {
+ indexer.index.delDocs(deleteField, deleteValue);
+ }
+
indexer.finishProgress();
indexer.closeIndex();
// Final commit
log.info("Finished indexing.");
- // Finish indexing
- String message = "Added ";
- if (!addInsteadOfUpsert)
- message += "or updated ";
- message += indexer.count + " file";
- if (indexer.count > 1) {
- message += "s";
+ // Finish indexing/deletion
+ if (inputPaths != null) {
+ String message = "Added ";
+ if (!addInsteadOfUpsert)
+ message += "or updated ";
+ message += indexer.count + " file";
+ if (indexer.count > 1) {
+ message += "s";
+ }
+ System.out.println(message + ".");
}
- System.out.println(message + ".");
+ else {
+ System.out.println("Deleted documents where " + deleteField + "="
+ + deleteValue + ".");
+ }
}
catch (IOException e) {
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index 7b568ae..9f0390c 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -7,9 +7,13 @@
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.zip.GZIPOutputStream;
import org.junit.After;
import org.junit.AfterClass;
@@ -293,6 +297,38 @@
assertEquals(0L, invalidZip);
}
+ @Test
+ public void testDeleteByTextSigleOption () throws IOException {
+     // Build a one-document input directory from the bundled GOE sample.
+     Path inputDir = Files.createTempDirectory(tempBaseDirectory.toPath(),
+             "delete-input");
+     Path jsonPath = Paths.get("src/test/resources/goe/AGX-00002.json");
+     Path gzPath = inputDir.resolve("AGX-00002.json.gz");
+     gzipFile(jsonPath, gzPath);
+
+     String outputDir = getTestOutputPath("test-delete-index");
+     Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+             "-i", inputDir.toString(), "-o", outputDir });
+     assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+     // Drop the captured output so the next check only sees the delete run.
+     outputStream.reset();
+     // Close this handle before the delete run; ki is reassigned below.
+     KrillIndex ki = new KrillIndex(Paths.get(outputDir));
+     assertEquals(1, ki.numberOf("documents"));
+     ki.close();
+
+     // Delete-only invocation: no -i, just field and value after -D.
+     Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+             "-o", outputDir, "-D", "textSigle", "GOE_AGX.00002" });
+     assertTrue(outputStream.toString()
+             .startsWith("Deleted documents where textSigle=GOE_AGX.00002."));
+
+     // Reopen the index to count documents after the delete run.
+     ki = new KrillIndex(Paths.get(outputDir));
+     assertEquals(0, ki.numberOf("documents"));
+     ki.close();
+ }
+
@Before
public void setOutputStream () {
System.setOut(new PrintStream(outputStream));
@@ -322,4 +358,16 @@
}
path.delete();
}
+
+ /**
+  * Compresses the file at input into a gzip file at output.
+  */
+ private static void gzipFile (Path input, Path output) throws IOException {
+     try (InputStream source = Files.newInputStream(input);
+             OutputStream sink = new GZIPOutputStream(
+                     Files.newOutputStream(output))) {
+         // Streams every byte across; closing the sink finishes the gzip.
+         source.transferTo(sink);
+     }
+ }
}