Further improve the speech rule for end-of-sentence (EOS) detection by supporting more quotation marks
Change-Id: Ife5bc78b6e0beafe3a52c4cecb760bff2854cbaa
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 3d63e5b..4193c28 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -44,10 +44,9 @@
[%, %,]];
! Right punctuation - excluding the characters that can be used as apostrophe
-define RPS ["”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]];
-define RP [SP|RPS|","|";"|":"|
+define RP [SP|","|";"|":"|
")"|"]"|"}"|
- ! differs
+ "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
"*"|"/"|"_"]; ! Can be Markdown
define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
@@ -222,9 +221,9 @@
echo - Introduce Sentence splitter
! And compose Whitespace ignorance
read regex Token .o. [
- SP NLout %" @-> ... NLout \/ _ NLout \%,
+ SP NLout ["”"|"›"|"»"|%"|%’|"'"] @-> ... NLout \/ _ NLout \%,
] .o. [
- SP @-> ... NLout \/ NLout _ NLout \%"
+ SP @-> ... NLout \/ NLout _ NLout [? - "”" - "›" - "»" - %" - %’ - "'"]
] .o. [
[WS|NL]+ @-> 0 || [ .#. | NLout ] _
];
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index 81023a6..18a99b4 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index d1632b8..73cbda5 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index f332244..fe2b6f1 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ