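Support quote combinations at the end of sentences

Treat "..." as a single token, and let a sentence end in a single
quotation mark followed by a double quotation mark (e.g. ’” or ›«).

Change-Id: I6ae7015b1b69464238360e7c7205e56e113430bc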
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 4193c28..8a93c21 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -209,7 +209,7 @@
echo - Introduce Token splitter
define Token [
- RealToken @-> ... NLout,
+ [%. %. %. | RealToken] @-> ... NLout,
XML @-> ... NLout,
URL @-> ... NLout,
Email @-> ... NLout,
@@ -220,10 +220,14 @@
echo - Introduce Sentence splitter
! And compose Whitespace ignorance
+
+! Double quotation marks accepted after sentence punctuation, alone or following a single quotation mark
+define DQuotes ["”"|%"|"»"|"«"];
+
read regex Token .o. [
- SP NLout ["”"|"›"|"»"|%"|%’|"'"] @-> ... NLout \/ _ NLout \%,
+ SP NLout [DQuotes | ["›" | %‹ | %’ | "'"] (NLout DQuotes)] @-> ... NLout \/ _ NLout \%,
] .o. [
- SP @-> ... NLout \/ NLout _ NLout [? - "”" - "›" - "»" - %" - %’ - "'"]
+ SP @-> ... NLout \/ NLout _ NLout [? - [DQuotes | "›" | %‹ | %’ | "'" | NLout]]
] .o. [
[WS|NL]+ @-> 0 || [ .#. | NLout ] _
];