Restrict term length to never exceed Lucene's fixed term length

Change-Id: Icc3be552e95ca15967b544168e0c3be4d533d00f
diff --git a/Changes b/Changes
index 7d63f83..5c618d9 100644
--- a/Changes
+++ b/Changes
@@ -3,6 +3,8 @@
     - [feature] Make VC cache location customizable (margaretha)
     - [bugfix] Improve handling of C2 #IN frames serialization
       (diewald)
+    - [bugfix] Restrict term length to never exceed Lucene
+      boundaries (diewald)
 
 0.62.2 2024-02-20
     - [feature] Support MMap directory parameters directly
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
index 5303876..ae49892 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
@@ -56,6 +56,8 @@
 
     private static short i, l;
 
+    private static final int MAX_TERM_LENGTH = 1024;
+    
     // This advices the java compiler to ignore all loggings
     public static final boolean DEBUG = false;
     private final Logger log = LoggerFactory
@@ -122,6 +124,11 @@
      * @return The {@link MultIterm} object for chaining.
      */
     public MultiTerm setTerm (String term) {
+        if (term != null && term.length() > (MAX_TERM_LENGTH - 2)) { // null passes through unchanged
+            term = term.substring(0, MAX_TERM_LENGTH - 2);
+            log.warn("Term {}... exceeds {} characters - truncated", term, MAX_TERM_LENGTH - 2);
+        }
+
         this.term = term;
         return this;
     };
@@ -468,7 +475,7 @@
                                 + termSurface[0]);
             };
         };
-        this.term = _unescape(stringOffset[0]);
+        this.setTerm(_unescape(stringOffset[0]));
     };
 
 
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index c83d87d..8f931d5 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -24,6 +24,7 @@
     private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

     private String info = "usage: Krill indexer";

     private File outputDirectory = new File("test-index");

+    private File outputDirectory2 = new File("test-index2");

 

     @Test

     public void testArguments () throws IOException {

@@ -91,6 +92,17 @@
         assertEquals(true, outputStream.toString().startsWith(info));

     }

 

+    @Test

+    public void testUnicodeProblem () throws IOException {

+        Indexer.main(new String[] {

+                "-c", "src/test/resources/krill.properties",

+                "-i", "src/test/resources/bug",

+                "-o", outputDirectory2.getPath()

+            });

+        logger.info(outputStream.toString());

+        assertEquals("Added 1 file.\n", outputStream.toString());

+    }

+

     @Before

     public void setOutputStream () {

         System.setOut(new PrintStream(outputStream));

@@ -107,6 +119,11 @@
         if (outputDirectory.exists()) {

             logger.debug("Output directory exists");

             deleteFile(outputDirectory);

+            deleteFile(outputDirectory2);

+        }

+        if (outputDirectory2.exists()) {

+            logger.debug("Output directory 2 exists");

+            deleteFile(outputDirectory2);

         }

     }

 

diff --git a/src/test/resources/bug/BSP-2013-01-32-longatt.json.gz b/src/test/resources/bug/BSP-2013-01-32-longatt.json.gz
new file mode 100644
index 0000000..9982a53
--- /dev/null
+++ b/src/test/resources/bug/BSP-2013-01-32-longatt.json.gz
Binary files differ