Introduce hyphenated abbreviations in German tokenizer
Change-Id: I12e01e4931b486d85202107ab82077aa16ee4aad
diff --git a/Changes b/Changes
index 9af4aaa..f01547d 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.3.1 2026-02-03
+ - Introduce hyphenated abbreviations in German tokenizer.
+
0.2.2 2023-09-06
- Fix behaviour for end of text character positions
when no end of sentence occured before.
diff --git a/datok_test.go b/datok_test.go
index 8130690..4e2a44d 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -791,6 +791,22 @@
assert.Equal("kriegste", tokens[8])
assert.Equal(9, len(tokens))
+ // Regression test for hyphenated abbreviations from Wiktionary (2024-12)
+ tokens = ttokenize(dat, w, "Ich wohne in Ba.-Wü. und bin Dipl.-Ing. bei Reg.-Bez. Karlsruhe.")
+ assert.Equal("Ich", tokens[0])
+ assert.Equal("wohne", tokens[1])
+ assert.Equal("in", tokens[2])
+ assert.Equal("Ba.-Wü.", tokens[3])
+ assert.Equal("und", tokens[4])
+ assert.Equal("bin", tokens[5])
+ assert.Equal("Dipl.-Ing.", tokens[6])
+ assert.Equal("bei", tokens[7])
+ assert.Equal("Reg.-Bez.", tokens[8])
+ assert.Equal("Karlsruhe", tokens[9])
+ assert.Equal(".", tokens[10])
+ assert.Equal(11, len(tokens));
+
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/de/tokenizer.xfst b/src/de/tokenizer.xfst
index 9670f9a..ce4bec3 100644
--- a/src/de/tokenizer.xfst
+++ b/src/de/tokenizer.xfst
@@ -56,13 +56,15 @@
! of the IDS.
define Abbr [ @txt"de/abbrv.txt" | Letter ] %.;
+define HypAbbr [ Abbr ( %- Abbr )+ | {Ba.-Wü.}];
+
define Streetname Word {str} %.;
source all/allpost.xfst
echo - Compile Real Token
-define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
+define RealToken [Punct|Emdash|HypAbbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
echo - Introduce Token splitter