Split tokenizer rules into language-independent and language-specific parts
Change-Id: I6e5eeabe9a306ce9fc3e62340ba7b948ecc4fa0f
diff --git a/src/all/allsentencesplit.xfst b/src/all/allsentencesplit.xfst
new file mode 100644
index 0000000..bef8cbd
--- /dev/null
+++ b/src/all/allsentencesplit.xfst
@@ -0,0 +1,19 @@
+echo - Introduce Sentence splitter
+! and compose it with whitespace removal
+
+read regex Token .o. [
+ ! Insert a Token boundary after the longest possible
+ ! sequence of sentence-ending punctuation
+ ! that is not followed by a comma
+ SentenceEnd @-> ... NLout \/ _ NLout \%,
+] .o. [
+ ! Insert a Token boundary after a punctuation mark
+ ! that is not the start of a punctuation sequence
+ SP @-> ... NLout \/ NLout _ NLout NotSentenceExtension
+] .o. [
+ ! Insert a Token boundary after ... if it is not followed by a lowercase character
+ [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
+ ! Remove whitespace between Tokens
+ [WS|NL]+ @-> 0 || [ .#. | NLout ] _
+];
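
The shared rules above reference networks (Token, SentenceEnd, SP, WS, NL,
NLout, NotSentenceExtension, NotSmallCaps) that this file does not define, so
each language is expected to provide them before these rules are read. A
minimal sketch of a language-specific wrapper script, assuming hypothetical
file names that are not part of this change, could look like this:

    ! tokeniser-LANG.xfst is assumed to define Token and the symbol classes
    ! used by the shared splitter
    source tokeniser-LANG.xfst
    ! then compose the shared, language-independent sentence splitter on top
    source allsentencesplit.xfst
    ! and write the resulting network to disk (assumed output name)
    save stack tokeniser-LANG.fst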