Restrict term length to never exceed Lucene's fixed term length
Change-Id: Icc3be552e95ca15967b544168e0c3be4d533d00f
diff --git a/Changes b/Changes
index 7d63f83..5c618d9 100644
--- a/Changes
+++ b/Changes
@@ -3,6 +3,8 @@
- [feature] Make VC cache location customizable (margaretha)
- [bugfix] Improve handling of C2 #IN frames serialization
(diewald)
+ - [bugfix] Restrict term length to never exceed Lucene
+ boundaries (diewald)
0.62.2 2024-02-20
- [feature] Support MMap directory parameters directly
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
index 5303876..ae49892 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
@@ -56,6 +56,8 @@
private static short i, l;
+ private static final int MAX_TERM_LENGTH = 1024;
+
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
private final Logger log = LoggerFactory
@@ -122,6 +124,11 @@
* @return The {@link MultIterm} object for chaining.
*/
public MultiTerm setTerm (String term) {
+ if (term != null && term.length() > (MAX_TERM_LENGTH - 2)) { // null is stored unchanged, as before
+ term = term.substring(0, MAX_TERM_LENGTH - 2);
+ log.warn("Term {}... exceeds {} characters - truncated", term, MAX_TERM_LENGTH - 2); // SLF4J substitutes {} only
+ }
+
this.term = term;
return this;
};
@@ -468,7 +475,7 @@
+ termSurface[0]);
};
};
- this.term = _unescape(stringOffset[0]);
+ this.setTerm(_unescape(stringOffset[0]));
};
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index c83d87d..8f931d5 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -24,6 +24,7 @@
private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
private String info = "usage: Krill indexer";
private File outputDirectory = new File("test-index");
+ private File outputDirectory2 = new File("test-index2");
@Test
public void testArguments () throws IOException {
@@ -91,6 +92,17 @@
assertEquals(true, outputStream.toString().startsWith(info));
}
+ @Test
+ public void testUnicodeProblem () throws IOException {
+ Indexer.main(new String[] {
+ "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bug",
+ "-o", "test-index2"
+ });
+ logger.info(outputStream.toString());
+ assertEquals("Added 1 file.\n", outputStream.toString()); // expected value first, then actual
+ }
+
@Before
public void setOutputStream () {
System.setOut(new PrintStream(outputStream));
@@ -107,6 +119,11 @@
if (outputDirectory.exists()) {
logger.debug("Output directory exists");
deleteFile(outputDirectory);
+ deleteFile(outputDirectory2);
+ }
+ if (outputDirectory2.exists()) {
+ logger.debug("Output directory 2 exists");
+ deleteFile(outputDirectory2);
}
}
diff --git a/src/test/resources/bug/BSP-2013-01-32-longatt.json.gz b/src/test/resources/bug/BSP-2013-01-32-longatt.json.gz
new file mode 100644
index 0000000..9982a53
--- /dev/null
+++ b/src/test/resources/bug/BSP-2013-01-32-longatt.json.gz
Binary files differ