Add single-character abbreviations
Change-Id: I8697f92adb3c6ad130b55260b1829bd61673a9f2
diff --git a/matrix_test.go b/matrix_test.go
index bc755b3..85dd796 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -729,6 +729,21 @@
assert.Equal(".", tokens[9])
assert.Equal(10, len(tokens))
+ // waste example: single-letter initials ("E.", "T.", "A.") and the ordinal "30." keep their trailing dot
+ tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
+ assert.Equal("Am", tokens[0])
+ assert.Equal("24.1.1806", tokens[1])
+ assert.Equal("feierte", tokens[2])
+ assert.Equal("E.", tokens[3])
+ assert.Equal("T.", tokens[4])
+ assert.Equal("A.", tokens[5])
+ assert.Equal("Hoffmann", tokens[6])
+ assert.Equal("seinen", tokens[7])
+ assert.Equal("30.", tokens[8])
+ assert.Equal("Geburtstag", tokens[9])
+ assert.Equal(".", tokens[10])
+ assert.Equal(11, len(tokens))
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index ea20605..0100d6e 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -108,7 +108,9 @@
ß (->) {SS}
];
-define Abbr @txt"txt/abbrv.txt" %.;
+define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];
+
+define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
define Plusampersand @txt"txt/plusampersand.txt";
@@ -125,7 +127,7 @@
! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
-define AcronymDep Char %. [Char %.]+;
+define AcronymDep Letter %. [Letter %.]+;
define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
@@ -220,8 +222,8 @@
echo - Introduce Token splitter
define Token [
- RealToken @-> ... NLout,
Abbr @-> ... NLout,
+ RealToken @-> ... NLout,
Plusampersand @-> ... NLout,
Emoji @-> ... NLout,
[Streetname|Omission|Emdash] @-> ... NLout