Add single-character abbreviations
Change-Id: I8697f92adb3c6ad130b55260b1829bd61673a9f2
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index ea20605..0100d6e 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -108,7 +108,9 @@
ß (->) {SS}
];
-define Abbr @txt"txt/abbrv.txt" %.;
+! Letter: one letter -- either an AsciiLetter or one of the German letters
+! ö, ü, ä, ß -- composed (.o.) with Caseinsensitive.
+! NOTE(review): Caseinsensitive is defined outside this hunk; if it contains a
+! multi-character mapping such as ß (->) {SS}, Letter may also cover "SS",
+! i.e. more than a single character -- confirm this is intended.
+define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];
+
+! Abbr: an abbreviation followed by a literal period (%.). The abbreviation is
+! either an entry of the word list txt/abbrv.txt or a single Letter.
+! The outer brackets are required: concatenation binds tighter than union (|),
+! so without them the period would attach to Letter only.
+define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
+! Plusampersand: entries read from the word list txt/plusampersand.txt.
+! NOTE(review): presumably tokens containing "+" or "&" -- verify against the list.
define Plusampersand @txt"txt/plusampersand.txt";
@@ -125,7 +127,7 @@
! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
-define AcronymDep Char %. [Char %.]+;
+! AcronymDep: a Letter plus period, followed by one or more further
+! Letter-plus-period pairs, so at least two dotted segments are required.
+! Segments are restricted to Letter rather than any Char.
+define AcronymDep Letter %. [Letter %.]+;
+! Dot: a literal "." or the spelled-out word "dot" enclosed in brackets,
+! e.g. [dot] or (dot).
+! At: a literal "@" or the spelled-out word "at" enclosed in brackets,
+! e.g. [at] or (at).
+! The opening bracket ("[" or "(") and the closing bracket (")" or "]") are
+! chosen independently, so mixed pairs such as [dot) are accepted as well.
+! Both definitions are composed (.o.) with Caseinsensitive.
define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
@@ -220,8 +222,8 @@
echo - Introduce Token splitter
define Token [
- RealToken @-> ... NLout,
Abbr @-> ... NLout,
+ RealToken @-> ... NLout,
Plusampersand @-> ... NLout,
Emoji @-> ... NLout,
[Streetname|Omission|Emdash] @-> ... NLout