Add krill.index.textSize.max configuration option

... to increase the maximum text size

Resolves #205

DeLiKo@DNB-XXL requires krill.index.textSize.max = 120000000

Change-Id: I1cd64ffc38179ae1fd965e5ef5f7ec7503fbcd21
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 43041db..28b3e0b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -17,6 +17,8 @@
 import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;
 
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.StreamReadConstraints;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.document.Document;
@@ -76,6 +78,8 @@
 import de.ids_mannheim.korap.util.KrillProperties;
 import de.ids_mannheim.korap.util.QueryException;
 
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
 /**
  * <p>KrillIndex implements a simple API for searching in and writing
  * to a
@@ -249,6 +253,23 @@
         return this.version;
     };
 
+    /**
+     * Replace the JSON mapper by one whose parser accepts longer strings.
+     * @param maxStringLength new limit passed to Jackson's StreamReadConstraints
+     * @throws IllegalArgumentException if below DEFAULT_MAX_STRING_LEN
+     */
+    public void setMaxStringLength(int maxStringLength) {
+        if (maxStringLength < DEFAULT_MAX_STRING_LEN) {
+            throw new IllegalArgumentException("Maximum string length must not be smaller than the default value: "
+                    + DEFAULT_MAX_STRING_LEN);
+        }
+        JsonFactory jsonFactory = JsonFactory.builder()
+                .streamReadConstraints(
+                        StreamReadConstraints.builder().maxStringLength(maxStringLength).build())
+                .build();
+        this.mapper = new ObjectMapper(jsonFactory);
+        log.info("Maximum string length set to {}.", maxStringLength);
+    }
 
     /**
      * Get the name of the index.
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index b726ffd..6db23cb 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -25,6 +25,8 @@
 import de.ids_mannheim.korap.KrillIndex;
 import de.ids_mannheim.korap.util.KrillProperties;
 
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
 /**
  * Standalone indexer tool for Krill.
  * Although the preferred index method
@@ -195,7 +197,7 @@
         options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
                 .desc("Always add files to the index, never update")
                 .build());
-        
+
         CommandLineParser parser = new DefaultParser();
 
         String propFile = null;
@@ -216,7 +218,6 @@
             if (cmd.hasOption("a")) {
                 addInsteadOfUpsert = true;
             };
-
         }
         catch (MissingOptionException e) {
             HelpFormatter formatter = new HelpFormatter();
@@ -237,6 +238,12 @@
         try {
             // Get indexer object
             Indexer indexer = new Indexer(prop);
+            
+            // Apply max text size from configuration
+            if (KrillProperties.maxTextSize > DEFAULT_MAX_STRING_LEN) {
+                log.info("Setting max text length to " + KrillProperties.maxTextSize);
+                indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
+            }
 
             // Iterate over list of directories
             for (String arg : inputDirectories) {
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index 68c3e7c..b5b009c 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -6,6 +6,8 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
+
 /**
  * 
  * Todo: Properties may be loaded twice - although Java may cache automatically
@@ -23,6 +25,7 @@
     public static int maxTokenContextSize = 60;
     public static int maxCharContextSize = 500;
     public static int defaultSearchContextLength = 6;
+    public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
     
     public static boolean matchExpansionIncludeContextSize = false;
     
@@ -89,6 +92,7 @@
         // EM: not implemented yet
 //        String maxCharContextSize = prop.getProperty("krill.context.max.char");
         String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
+        String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
 
         try {
             if (maxTokenMatchSize != null) {
@@ -107,6 +111,18 @@
                 KrillProperties.defaultSearchContextLength = Integer
                         .parseInt(defaultSearchContextLength);
             }
+            if (maxTextSizeValue != null) {
+                int userMaxTextLength = Integer
+                        .parseInt(maxTextSizeValue);
+                if (userMaxTextLength < DEFAULT_MAX_STRING_LEN) {
+                    log.warn("Specified krill.index.textSize.max is too small. Using default value: "
+                            + DEFAULT_MAX_STRING_LEN);
+                    KrillProperties.maxTextSize = DEFAULT_MAX_STRING_LEN;
+                } else {
+                    KrillProperties.maxTextSize = userMaxTextLength;
+                }
+
+            }
         }
         catch (NumberFormatException e) {
             log.error("A Krill property expects numerical values: "
diff --git a/src/main/resources/krill.properties.info b/src/main/resources/krill.properties.info
index 2f4871f..45fc56b 100644
--- a/src/main/resources/krill.properties.info
+++ b/src/main/resources/krill.properties.info
@@ -14,3 +14,4 @@
 krill.index.commit.log = log/krill.commit.log
 krill.index.commit.auto = 500
 krill.index.relations.max = 100
+krill.index.textSize.max = 20000000
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index ee4d1dc..bb8a152 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -4,6 +4,7 @@
 

 import java.io.ByteArrayOutputStream;

 import java.io.File;

+import java.io.FileWriter;

 import java.io.IOException;

 import java.io.PrintStream;

 

@@ -105,6 +106,24 @@
         assertEquals(outputStream.toString(), "Added 1 file.\n");

     }

 

+    @Test

+    public void testMaxTextSize () throws IOException {

+        // Temporary properties file with the max text size setting; always removed in finally

+        File tempPropertiesFile = File.createTempFile("krill", ".properties");

+        try {

+            try (FileWriter writer = new FileWriter(tempPropertiesFile)) { // closed before indexing reads it

+                writer.write("krill.version = ${project.version}\nkrill.name = ${project.name}\n");

+                writer.write("krill.indexDir = test-output\n");

+                writer.write("krill.index.textSize.max = 25000000\n");

+            }

+            Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),

+                    "-i", "src/test/resources/bzk", "-o", "test-output-1"});

+            assertEquals("Added or updated 1 file.\n", outputStream.toString());

+        } finally {

+            tempPropertiesFile.delete();

+        }

+    }

+

     @Before

     public void setOutputStream () {

         System.setOut(new PrintStream(outputStream));