Add krill.index.textSize.max configuration option
... to increase maximum text size
Resolves #205
DeLiKo@DNB-XXL requires krill.index.textSize.max = 120000000
Change-Id: I1cd64ffc38179ae1fd965e5ef5f7ec7503fbcd21
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 43041db..28b3e0b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -17,6 +17,8 @@
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.StreamReadConstraints;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
@@ -76,6 +78,8 @@
import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
/**
* <p>KrillIndex implements a simple API for searching in and writing
* to a
@@ -249,6 +253,23 @@
return this.version;
};
+ public void setMaxStringLength(int maxStringLength) { // Raises the JSON string-length limit used by this index's mapper.
+ if (maxStringLength < DEFAULT_MAX_STRING_LEN) { // Values below Jackson's default limit are rejected, never applied.
+ throw new IllegalArgumentException("Maximum string length must not be smaller than the default value: "
+ + DEFAULT_MAX_STRING_LEN);
+ }
+
+ StreamReadConstraints constraints = StreamReadConstraints.builder() // Only maxStringLength is set explicitly on the builder.
+ .maxStringLength(maxStringLength)
+ .build();
+
+ JsonFactory factory = JsonFactory.builder() // Factory carrying the new read constraints.
+ .streamReadConstraints(constraints)
+ .build();
+
+ this.mapper = new ObjectMapper(factory); // NOTE(review): swaps in a brand-new mapper; settings on the previous instance are not copied - confirm none are needed.
+ log.info("Maximum string length set to {}.", maxStringLength);
+ }
/**
* Get the name of the index.
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index b726ffd..6db23cb 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -25,6 +25,8 @@
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.util.KrillProperties;
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
/**
* Standalone indexer tool for Krill.
* Although the preferred index method
@@ -195,7 +197,7 @@
options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
.desc("Always add files to the index, never update")
.build());
-
+
CommandLineParser parser = new DefaultParser();
String propFile = null;
@@ -216,7 +218,6 @@
if (cmd.hasOption("a")) {
addInsteadOfUpsert = true;
};
-
}
catch (MissingOptionException e) {
HelpFormatter formatter = new HelpFormatter();
@@ -237,6 +238,12 @@
try {
// Get indexer object
Indexer indexer = new Indexer(prop);
+
+ // Apply max text size from configuration
+ if (KrillProperties.maxTextSize > DEFAULT_MAX_STRING_LEN) {
+ log.info("Setting max text length to " + KrillProperties.maxTextSize);
+ indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
+ }
// Iterate over list of directories
for (String arg : inputDirectories) {
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index 68c3e7c..b5b009c 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -6,6 +6,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
/**
*
* Todo: Properties may be loaded twice - although Java may cache automatically
@@ -23,6 +25,7 @@
public static int maxTokenContextSize = 60;
public static int maxCharContextSize = 500;
public static int defaultSearchContextLength = 6;
+ public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
public static boolean matchExpansionIncludeContextSize = false;
@@ -89,6 +92,7 @@
// EM: not implemented yet
// String maxCharContextSize = prop.getProperty("krill.context.max.char");
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
+ String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
try {
if (maxTokenMatchSize != null) {
@@ -107,6 +111,18 @@
KrillProperties.defaultSearchContextLength = Integer
.parseInt(defaultSearchContextLength);
}
+ if (maxTextSizeValue != null) {
+ int userMaxTextLength = Integer
+ .parseInt(maxTextSizeValue);
+ if (userMaxTextLength < DEFAULT_MAX_STRING_LEN) {
+ log.warn("Specified krill.index.textSize.max is too small. Using default value: "
+ + DEFAULT_MAX_STRING_LEN);
+ KrillProperties.maxTextSize = DEFAULT_MAX_STRING_LEN;
+ } else {
+ KrillProperties.maxTextSize = userMaxTextLength;
+ }
+
+ }
}
catch (NumberFormatException e) {
log.error("A Krill property expects numerical values: "
diff --git a/src/main/resources/krill.properties.info b/src/main/resources/krill.properties.info
index 2f4871f..45fc56b 100644
--- a/src/main/resources/krill.properties.info
+++ b/src/main/resources/krill.properties.info
@@ -14,3 +14,4 @@
krill.index.commit.log = log/krill.commit.log
krill.index.commit.auto = 500
krill.index.relations.max = 100
+krill.index.textSize.max = 20000000