Improve handling of ellipsis

Change-Id: I758e096678091f52fd3bc00b2a5f6ad1358881cc

commit: 4222ac87aa5a497bb71ae0744c1cee342644364c [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Mar 11 01:06:21 2022 +0100
committer: Akron <nils@diewald-online.de> Fri Mar 11 01:06:21 2022 +0100
tree: 482d4099b21e5617ddfedc9ee6fd2f33710d6dae
parent: ece3f019bff4a6f4a1293eb4c0b72a39c41c8873 [diff] [blame]
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 8a93c21..cf183b7 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst

@@ -209,7 +209,7 @@
 echo - Introduce Token splitter
 
 define Token [
-  [%. %. %. | RealToken] @-> ... NLout,
+  RealToken @-> ... NLout,
   XML @-> ... NLout,
   URL @-> ... NLout,
   Email @-> ... NLout,
@@ -222,12 +222,15 @@
 ! And compose Whitespace ignorance
 
 define DQuotes ["”"|%"|"»"|"«"];
+define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
 
 read regex Token .o. [
   SP NLout [DQuotes | "›" (NLout DQuotes)| %‹ (NLout DQuotes)| %’ (NLout DQuotes)| "'" (NLout DQuotes)] @-> ... NLout \/ _ NLout \%,
 ] .o. [
   SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - NLout]
 ] .o. [
+  [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
   [WS|NL]+ @-> 0 || [ .#. | NLout ] _
 ];
commit	4222ac87aa5a497bb71ae0744c1cee342644364c	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Mar 11 01:06:21 2022 +0100
committer	Akron <nils@diewald-online.de>	Fri Mar 11 01:06:21 2022 +0100
tree	482d4099b21e5617ddfedc9ee6fd2f33710d6dae
parent	ece3f019bff4a6f4a1293eb4c0b72a39c41c8873 [diff] [blame]