Fix handling of "z.B." (tokenize as "z." and "B.", not as an acronym)
Change-Id: Ie419cae80f16443c4f3f656fe827023d00c6bc12
diff --git a/cmd/datok.go b/cmd/datok.go
index 1c9d99b..1c1a781 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -11,6 +11,10 @@
"github.com/alecthomas/kong"
)
+// TODO:
+// - Support version information based on
+// https://blog.carlmjohnson.net/post/2021/golang-118-minor-features/
+
var cli struct {
Convert struct {
Foma string `kong:"required,short='i',help='The Foma FST file'"`
diff --git a/matrix_test.go b/matrix_test.go
index 7200608..e7aa154 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -787,6 +787,21 @@
assert.Equal(tokens[2], ".")
assert.Equal(3, len(tokens))
+ // "z.B." and "z. B." must both be split into the tokens "z." and "B."
+ tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
+ assert.Equal(tokens[0], "Dies")
+ assert.Equal(tokens[1], "sind")
+ assert.Equal(tokens[2], "z.")
+ assert.Equal(tokens[3], "B.")
+ assert.Equal(tokens[4], "zwei")
+ assert.Equal(tokens[5], "Wörter")
+ assert.Equal(tokens[6], "-")
+ assert.Equal(tokens[7], "z.")
+ assert.Equal(tokens[8], "B.")
+ assert.Equal(tokens[9], "auch")
+ assert.Equal(tokens[10], ".")
+ assert.Equal(11, len(tokens))
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index ccc27c1..fb97c5c 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -129,7 +129,7 @@
! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
-define AcronymDep Letter %. [Letter %.]+;
+define AcronymDep Letter %. Letter %. [Letter %.]+;
define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;