Improve time parsing: accept an optional seconds group and a fractional part (e.g. 00:00:57,34)
Change-Id: I32d1a2843d3b25e30cb49442c0463f1fa0365b7e
diff --git a/Readme.md b/Readme.md
index 4ac9e94..0e04e83 100644
--- a/Readme.md
+++ b/Readme.md
@@ -28,7 +28,7 @@
The special `END OF TRANSMISSION` character (`\x04`) can be used to mark the end of a text.
-*Caution*: When experimenting with STDIN this way, you may need to disable history expansion.
+> *Caution*: When experimenting with STDIN this way, you may need to disable history expansion.
## Conventions
@@ -73,7 +73,7 @@
read regex Tokenizer;
```
-*Hint*: For development it's easier to replace `@_TOKEN_SYMBOL_@`
+> *Hint*: For development it's easier to replace `@_TOKEN_SYMBOL_@`
with a newline.
## Building
diff --git a/matrix_test.go b/matrix_test.go
index b7655e5..bc755b3 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -716,6 +716,19 @@
assert.Equal(".", tokens[5])
assert.Equal(6, len(tokens))
+ tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
+ assert.Equal("Sie", tokens[0])
+ assert.Equal("schwamm", tokens[1])
+ assert.Equal("die", tokens[2])
+ assert.Equal("Strecke", tokens[3])
+ assert.Equal("in", tokens[4])
+ assert.Equal("00:00:57,34", tokens[5])
+ assert.Equal("00:57,341", tokens[6])
+ assert.Equal("0:57", tokens[7])
+ assert.Equal("Stunden", tokens[8])
+ assert.Equal(".", tokens[9])
+ assert.Equal(10, len(tokens))
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 7e4145d..ea20605 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -117,8 +117,8 @@
define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
-! 20:00 Uhr
-define Times [ ([%0|1|2]) Digit ( WS ) ":" [%0|1|2|3|4|5|6] Digit ];
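+! 20:00 Uhr, 00:12:25,34 Minuten (one or two ":"-separated two-digit groups after the first field, optional "," plus 1-3 fraction digits)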
+define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
source emoji.xfst
define Emoji;
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index 3986ea2..f7bc9cb 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index 671ffe4..76232e6 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index 7e4e3af..d276902 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ