Fixed thousands separators not being handled consistently

Numbers grouped with apostrophes (Swiss format, e.g. 1'000'000) or with
thin space (U+2009) / narrow no-break space (U+202F) were split into
several tokens. The NUM macro now accepts separator-grouped digit runs,
including forms with a trailing decimal part such as 1'234'567.89.

Thanks @notesjor
Resolves #135
Change-Id: I2d1be0329af6729bdea51431cfcbf24b6dcbc3db
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 106aadd..60a12df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,9 @@
 - Short forms for determiners, adjectives, pronouns: `eine(n)`, `gute:r`, `ihm/r`, `diese(r)`, `ein(e)`
 * Added `de_old` German tokenizer variant without gender-sensitive rules
   (use `-l de_old` to split forms like `Nutzer:in` into separate tokens)
+* Fixed thousands separators not being handled consistently (issue #135):
+ - Apostrophe `'` or `’` (Swiss format: `1'000'000`)
+ - Thin space U+2009 and narrow no-break space U+202F
 ## 2.3.1 [2026-01-28]
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 1d93e6f..0b9cbb8 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -663,12 +663,20 @@
 // floating point, serial, model numbers, ip addresses, etc.
 // every other segment must have at least one digit
+// THOUSANDS_SEP: straight or typographic apostrophe (Swiss format) and thin space / narrow no-break space (issue #135)
+THOUSANDS_SEP = ("'"|"'"|[\u2009\u202F])
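+// e.g. 1'000'000, 1’234’567 and 1\u2009000\u2009000 are kept as single tokens (see TokenizerTest)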
+
 NUM = ({ALPHANUM} {P} {HAS_DIGIT}
       | {HAS_DIGIT} {P} {ALPHANUM}
       | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
       | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
       | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
-      | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+      | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+      | {HAS_DIGIT} ({THOUSANDS_SEP} {HAS_DIGIT})+
+      | {HAS_DIGIT} ({THOUSANDS_SEP} {HAS_DIGIT})+ {P} {HAS_DIGIT})
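+// The two THOUSANDS_SEP alternatives keep separator-grouped digit runs
+// together, optionally followed by a {P}-joined decimal part (1'234'567.89)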
 /* floating point literals */
@@ -682,6 +690,7 @@
 // punctuation
 P = ("_"|"-"|"."|",")|{SLASH}
+
 Q = [’\'`]
 PUNCT = ({P}|{Q}|[?!@#$%\^&*_:;\]\[\"»«\202\204\206\207\213\221\222\223\224\225\226\227\233])
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index b0ee8c3..11b4938 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -1215,4 +1215,55 @@
assertEquals("!", tokens[2]);
assertEquals(3, tokens.length);
}
+
+ // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/135
+ @Test
+ public void testTokenizerThousandsSeparators() {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+ // Swiss apostrophe format (straight apostrophe)
+ String[] tokens = tok.tokenize("Das kostet 1'000'000 Franken");
+ assertEquals("Das", tokens[0]);
+ assertEquals("kostet", tokens[1]);
+ assertEquals("1'000'000", tokens[2]);
+ assertEquals("Franken", tokens[3]);
+ assertEquals(4, tokens.length);
+
+ // Swiss apostrophe format (curly apostrophe)
+ tokens = tok.tokenize("Der Preis ist 1'234'567 CHF");
+ assertEquals("Der", tokens[0]);
+ assertEquals("Preis", tokens[1]);
+ assertEquals("ist", tokens[2]);
+ assertEquals("1'234'567", tokens[3]);
+ assertEquals("CHF", tokens[4]);
+ assertEquals(5, tokens.length);
+
+ // Swiss format with decimal
+ tokens = tok.tokenize("Betrag: 1'234'567.89");
+ assertEquals("Betrag", tokens[0]);
+ assertEquals(":", tokens[1]);
+ assertEquals("1'234'567.89", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Thin space format (U+2009)
+ tokens = tok.tokenize("Population: 1\u2009000\u2009000");
+ assertEquals("Population", tokens[0]);
+ assertEquals(":", tokens[1]);
+ assertEquals("1\u2009000\u2009000", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Narrow no-break space format (U+202F)
+ tokens = tok.tokenize("Value: 1\u202F234\u202F567");
+ assertEquals("Value", tokens[0]);
+ assertEquals(":", tokens[1]);
+ assertEquals("1\u202F234\u202F567", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Thin space with decimal
+ tokens = tok.tokenize("Result: 1\u2009000\u2009000,50");
+ assertEquals("Result", tokens[0]);
+ assertEquals(":", tokens[1]);
+ assertEquals("1\u2009000\u2009000,50", tokens[2]);
+ assertEquals(3, tokens.length);
+ }
}
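
A minimal sketch for exercising the fix outside the test suite. It uses
only the no-arg constructor and the tokenize(String) call already shown
in the tests above; the demo class itself is illustrative and not part
of this change:

    import de.ids_mannheim.korap.tokenizer.DerekoDfaTokenizer_de;

    public class ThousandsSeparatorDemo {
        public static void main(String[] args) {
            DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
            // With the new NUM alternatives the grouped number stays one
            // token instead of being split at the apostrophes.
            for (String t : tok.tokenize("Das kostet 1'000'000 Franken")) {
                System.out.println(t);
            }
        }
    }

Per testTokenizerThousandsSeparators above, this prints the four tokens
Das, kostet, 1'000'000 and Franken, one per line.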