Add time format to transducer
Change-Id: Idfc6a6af71a8e1254e9065bc1382ec46279fd430
diff --git a/cmd/datok.go b/cmd/datok.go
index d9a71f0..66bc7e9 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -103,6 +103,4 @@
dat.TransduceTokenWriter(os.Stdin, tw)
tw.Flush()
}
-
- fmt.Println("\n")
}
diff --git a/matrix_test.go b/matrix_test.go
index f985736..b7655e5 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -707,6 +707,15 @@
assert.Equal("kriegste", tokens[8])
assert.Equal(9, len(tokens))
+ tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
+ assert.Equal("Es", tokens[0])
+ assert.Equal("ist", tokens[1])
+ assert.Equal("gleich", tokens[2])
+ assert.Equal("2:30", tokens[3])
+ assert.Equal("Uhr", tokens[4])
+ assert.Equal(".", tokens[5])
+ assert.Equal(6, len(tokens))
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 7b16fb8..7e4145d 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -117,6 +117,9 @@
define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
+! Clock times such as 20:00 or 2:30 Uhr; the minute tens digit is limited to 0-5 (00-59)
+define Times [ ([%0|1|2]) Digit ( WS ) ":" [%0|1|2|3|4|5] Digit ];
+
source emoji.xfst
define Emoji;
@@ -212,7 +215,7 @@
echo - Compile Real Token
-define RealToken [Punct|Word|XML|Email|URL|SNS|Domain|AcronymDep|File|Ord|Num|Years];
+define RealToken [Punct|Word|XML|Email|URL|SNS|Domain|AcronymDep|File|Ord|Num|Years|Times];
echo - Introduce Token splitter