Don't treat soft hyphens (U+00AD) as token boundaries

Resolves #131

Change-Id: Ia62f5ff91d82ef22830d8fd31afb701c9b703c26
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3974aa9..faa935c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 
 
+## 2.3.1-SNAPSHOT
+
+* Fixed soft hyphens (U+00AD) being incorrectly treated as token boundaries (issue #131)
+
 ## 2.3.0 [2025-12-23]
 
 * Fixed genderstern and omission asterisk breaking after hyphens (issue #115)
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 59e0e81..fdd631a 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -347,7 +347,8 @@
 THAI       = [\u0E00-\u0E59]
 
 // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
-ALPHANUM   = ({LETTER}|{THAI}|[:digit:]|_)+
+// Soft hyphen (\u00AD) is included to prevent it from acting as a token boundary (issue #131)
+ALPHANUM   = ({LETTER}|{THAI}|[:digit:]|_|\u00AD)+
 
 // case insensitivity is useful sometimes
 a = [aA]
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index fc8850d..82f758b 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -815,4 +815,14 @@
         assertEquals(".", tokens[10]);
         assertEquals(11, tokens.length);
     }
+
+    // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/131
+    @Test
+    public void testSoftHyphensShouldNotSplitWords() {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+        // A soft hyphen (U+00AD) between word parts should not cause a token split
+        String[] tokens = tok.tokenize("Donau\u00ADdampf\u00ADschiff");
+        assertEquals("Donau\u00ADdampf\u00ADschiff", tokens[0]);
+        assertEquals(1, tokens.length);
+    }
 }