Don't treat soft hyphens (U+00AD) as token boundaries
Resolves #131
Change-Id: Ia62f5ff91d82ef22830d8fd31afb701c9b703c26
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3974aa9..faa935c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
# Changelog
+## 2.3.1-SNAPSHOT
+
+* Fixed soft hyphens (U+00AD) being incorrectly treated as token boundaries (issue #131)
+
## 2.3.0 [2025-12-23]
* Fixed genderstern and omission asterisk breaking after hyphens (issue #115)
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 59e0e81..fdd631a 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -347,7 +347,8 @@
THAI = [\u0E00-\u0E59]
// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
-ALPHANUM = ({LETTER}|{THAI}|[:digit:]|_)+
+// Soft hyphen (U+00AD) counts as a word character, so it stays inside the token instead of splitting it (issue #131)
+ALPHANUM = ({LETTER}|{THAI}|[:digit:]|_|\u00AD)+
// case insensitivity is useful sometimes
a = [aA]
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index fc8850d..82f758b 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -815,4 +815,14 @@
assertEquals(".", tokens[10]);
assertEquals(11, tokens.length);
}
+
+ // Regression test for issue #131 (https://github.com/KorAP/KorAP-Tokenizer/issues/131)
+ @Test
+ public void testSoftHyphensShouldNotSplitWords() {
+ // A word containing soft hyphens (U+00AD) must come back as one unchanged token
+ final String word = "Donau\u00ADdampf\u00ADschiff";
+ String[] result = new DerekoDfaTokenizer_de().tokenize(word);
+ assertEquals(word, result[0]);
+ assertEquals(1, result.length);
+ }
}