Upsert documents instead of adding them by default in the indexing phase
Change-Id: Ia7fccd59ce1b6a114c8e59da69f3d132a56bc7d2
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 7c9a19d..540db59 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -485,6 +485,20 @@
return this.addDoc(doc);
};
+
+
+ /**
+ * Update a document in the index as a {@link FieldDocument}
+ * if it already exists (based on the textSigle), otherwise
+ * insert it into the index.
+ *
+ * @param json
+ * The JSON document to add to or update in the index.
+ * @return The {@link FieldDocument}.
+ */
+ public FieldDocument upsertDoc (InputStream json, boolean gzip) {
+ return this.upsertDoc(_fromFile(json, gzip));
+ };
/**
@@ -622,7 +636,7 @@
public FieldDocument addDoc (InputStream json, boolean gzip) {
return this.addDoc(_fromFile(json, gzip));
};
-
+
/**
* Add a document to the index as a JSON string
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 0967860..005ac2e 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -57,6 +57,7 @@
private int commitCount;
private static String path = null;
+ private static boolean addInsteadOfUpsert = false;
private Pattern jsonFilePattern;
// Init logger
@@ -103,14 +104,24 @@
matcher = jsonFilePattern.matcher(file);
if (matcher.find()) {
file = dir.getPath() + '/' + file;
- log.info("Adding " + file + " to the index. ");
try {
- if (this.index.addDoc(new FileInputStream(file),
- true) == null) {
- log.warn("fail.");
- continue;
+ if (addInsteadOfUpsert) {
+ log.info("Add " + file + " to the index. ");
+ if (this.index.addDoc(new FileInputStream(file),
+ true) == null) {
+ log.warn("fail.");
+ continue;
+ }
}
+ else {
+ log.info("Add or update " + file + " to the index. ");
+ if (this.index.upsertDoc(new FileInputStream(file),
+ true) == null) {
+ log.warn("fail.");
+ continue;
+ };
+ };
this.count++;
if (DEBUG){
log.debug("Finished adding files. (" + count + ").");
@@ -175,14 +186,17 @@
.desc("index output directory (defaults to "
+ "krill.indexDir in the configuration.")
.hasArg().argName("output directory").build());
+ options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
+ .desc("Always add files to the index, never update")
+ .build());
+
CommandLineParser parser = new DefaultParser();
String propFile = null;
String[] inputDirectories = null;
try {
CommandLine cmd = parser.parse(options, argv);
-
log.info("Configuration file: " + cmd.getOptionValue("c"));
propFile = cmd.getOptionValue("c");
log.info("Input directories: "
@@ -193,12 +207,16 @@
log.info("Output directory: " + cmd.getOptionValue("o"));
path = cmd.getOptionValue("o");
}
+
+ if (cmd.hasOption("a")) {
+ addInsteadOfUpsert = true;
+ };
}
catch (MissingOptionException e) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(
"Krill indexer\n java -jar -c <properties file> -i <input directories> "
- + "[-o <output directory>]",
+ + "[-o <output directory> -a]",
options);
return;
}
@@ -224,7 +242,10 @@
// Final commit
log.info("Finished indexing.");
// Finish indexing
- String message = "Indexed " + indexer.count + " file";
+ String message = "Added ";
+ if (!addInsteadOfUpsert)
+ message += "or updated ";
+ message += indexer.count + " file";
if (indexer.count > 1) {
message += "s";
}