Add krill.index.textSize.max configuration option
... to increase maximum text size
Resolves #205
DeLiKo@DNB-XXL requires krill.index.textSize.max = 120000000
Change-Id: I1cd64ffc38179ae1fd965e5ef5f7ec7503fbcd21
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 43041db..28b3e0b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -17,6 +17,8 @@
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.StreamReadConstraints;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
@@ -76,6 +78,8 @@
import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
/**
* <p>KrillIndex implements a simple API for searching in and writing
* to a
@@ -249,6 +253,23 @@
return this.version;
};
+    /**
+     * Set the maximum length of a single string value accepted by the
+     * JSON mapper of this index.
+     * The current mapper is replaced by a new {@link ObjectMapper} that is
+     * created from a {@link JsonFactory} carrying the given limit as its
+     * stream read constraint.
+     *
+     * @param maxStringLength
+     *            The maximum string length; must not be smaller than
+     *            {@code StreamReadConstraints.DEFAULT_MAX_STRING_LEN}.
+     * @throws IllegalArgumentException
+     *             if the given length is below the default value.
+     */
+    public void setMaxStringLength(int maxStringLength) {
+        // Reject limits below the library default before touching the mapper
+        if (maxStringLength < DEFAULT_MAX_STRING_LEN) {
+            String message = "Maximum string length must not be smaller than the default value: "
+                    + DEFAULT_MAX_STRING_LEN;
+            throw new IllegalArgumentException(message);
+        }
+
+        // Build a factory whose read constraints carry the requested limit
+        JsonFactory jsonFactory = JsonFactory.builder()
+                .streamReadConstraints(
+                        StreamReadConstraints.builder()
+                                .maxStringLength(maxStringLength)
+                                .build())
+                .build();
+
+        this.mapper = new ObjectMapper(jsonFactory);
+        log.info("Maximum string length set to {}.", maxStringLength);
+    }
/**
* Get the name of the index.
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index b726ffd..6db23cb 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -25,6 +25,8 @@
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.util.KrillProperties;
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
/**
* Standalone indexer tool for Krill.
* Although the preferred index method
@@ -195,7 +197,7 @@
options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
.desc("Always add files to the index, never update")
.build());
-
+
CommandLineParser parser = new DefaultParser();
String propFile = null;
@@ -216,7 +218,6 @@
if (cmd.hasOption("a")) {
addInsteadOfUpsert = true;
};
-
}
catch (MissingOptionException e) {
HelpFormatter formatter = new HelpFormatter();
@@ -237,6 +238,12 @@
try {
// Get indexer object
Indexer indexer = new Indexer(prop);
+
+ // Apply max text size from configuration
+ if (KrillProperties.maxTextSize > DEFAULT_MAX_STRING_LEN) {
+ log.info("Setting max text length to " + KrillProperties.maxTextSize);
+ indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
+ }
// Iterate over list of directories
for (String arg : inputDirectories) {
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index 68c3e7c..b5b009c 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -6,6 +6,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
/**
*
* Todo: Properties may be loaded twice - although Java may cache automatically
@@ -23,6 +25,7 @@
public static int maxTokenContextSize = 60;
public static int maxCharContextSize = 500;
public static int defaultSearchContextLength = 6;
+ public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
public static boolean matchExpansionIncludeContextSize = false;
@@ -89,6 +92,7 @@
// EM: not implemented yet
// String maxCharContextSize = prop.getProperty("krill.context.max.char");
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
+ String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
try {
if (maxTokenMatchSize != null) {
@@ -107,6 +111,18 @@
KrillProperties.defaultSearchContextLength = Integer
.parseInt(defaultSearchContextLength);
}
+ if (maxTextSizeValue != null) {
+ int userMaxTextLength = Integer
+ .parseInt(maxTextSizeValue);
+ if (userMaxTextLength < DEFAULT_MAX_STRING_LEN) {
+ log.warn("Specified krill.index.textSize.max is too small. Using default value: "
+ + DEFAULT_MAX_STRING_LEN);
+ KrillProperties.maxTextSize = DEFAULT_MAX_STRING_LEN;
+ } else {
+ KrillProperties.maxTextSize = userMaxTextLength;
+ }
+
+ }
}
catch (NumberFormatException e) {
log.error("A Krill property expects numerical values: "
diff --git a/src/main/resources/krill.properties.info b/src/main/resources/krill.properties.info
index 2f4871f..45fc56b 100644
--- a/src/main/resources/krill.properties.info
+++ b/src/main/resources/krill.properties.info
@@ -14,3 +14,4 @@
krill.index.commit.log = log/krill.commit.log
krill.index.commit.auto = 500
krill.index.relations.max = 100
+krill.index.textSize.max = 20000000
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index ee4d1dc..bb8a152 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -4,6 +4,7 @@
import java.io.ByteArrayOutputStream;
import java.io.File;
+import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintStream;
@@ -105,6 +106,24 @@
assertEquals(outputStream.toString(), "Added 1 file.\n");
}
+    /**
+     * Run the indexer with a temporary properties file that sets
+     * krill.index.textSize.max to 25000000 and check that the
+     * indexer reports one added or updated file.
+     *
+     * @throws IOException
+     *             if the temporary properties file cannot be created
+     *             or written.
+     */
+    @Test
+    public void testMaxTextSize () throws IOException {
+        // Create a temporary properties file with the max text size setting
+        File tempPropertiesFile = File.createTempFile("krill", ".properties");
+        try {
+            // try-with-resources closes the writer even if a write fails
+            try (FileWriter writer = new FileWriter(tempPropertiesFile)) {
+                writer.write("krill.version = ${project.version}\n");
+                writer.write("krill.name = ${project.name}\n");
+                writer.write("krill.indexDir = test-output\n");
+                writer.write("krill.index.textSize.max = 25000000\n");
+            }
+
+            Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
+                    "-i", "src/test/resources/bzk", "-o", "test-output-1"});
+            assertEquals("Added or updated 1 file.\n", outputStream.toString());
+        }
+        finally {
+            // Remove the temporary file even when indexing or the assertion fails
+            tempPropertiesFile.delete();
+        }
+    }
+
@Before
public void setOutputStream () {
System.setOut(new PrintStream(outputStream));