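Improve handling of ellipsis

Remove "..." from the Token alternation and handle it in a separate
composed rule instead: NLout is now appended to "..." only in the
context `_ NLout WS+ NotSmallCaps`, where NotSmallCaps is any symbol
other than a-z, ü, ö, ä.
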
Change-Id: I758e096678091f52fd3bc00b2a5f6ad1358881cc
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 8a93c21..cf183b7 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -209,7 +209,7 @@
echo - Introduce Token splitter
define Token [
- [%. %. %. | RealToken] @-> ... NLout,
+ RealToken @-> ... NLout,
XML @-> ... NLout,
URL @-> ... NLout,
Email @-> ... NLout,
@@ -222,12 +222,15 @@
! And compose Whitespace ignorance
define DQuotes ["”"|%"|"»"|"«"];
+define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä]; ! Any single symbol except lowercase a-z, ü, ö, ä; note that ß and other lowercase letters are not excluded here
read regex Token .o. [
SP NLout [DQuotes | "›" (NLout DQuotes)| %‹ (NLout DQuotes)| %’ (NLout DQuotes)| "'" (NLout DQuotes)] @-> ... NLout \/ _ NLout \%,
] .o. [
SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - NLout]
] .o. [
+ [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
[WS|NL]+ @-> 0 || [ .#. | NLout ] _
];