Split tokenizer rules into language-independent and language-specific parts
Change-Id: I6e5eeabe9a306ce9fc3e62340ba7b948ecc4fa0f
diff --git a/src/all/allsentencesplit.xfst b/src/all/allsentencesplit.xfst
new file mode 100644
index 0000000..bef8cbd
--- /dev/null
+++ b/src/all/allsentencesplit.xfst
@@ -0,0 +1,19 @@
+echo - Introduce Sentence splitter
+! and compose it with whitespace removal
+
+read regex Token .o. [
+ ! Insert a Token boundary after the longest possible
+ ! sequence of sentence-ending punctuation
+ ! that is not followed by a comma
+ SentenceEnd @-> ... NLout \/ _ NLout \%,
+] .o. [
+ ! Insert a Token boundary after a punctuation mark
+ ! that is not the start of a punctuation sequence
+ SP @-> ... NLout \/ NLout _ NLout NotSentenceExtension
+] .o. [
+ ! Insert a Token boundary after ... if it is not followed by a lowercase character
+ [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
+ ! Remove whitespace between Tokens
+ [WS|NL]+ @-> 0 || [ .#. | NLout ] _
+];
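
The shared rules above reference networks (Token, SentenceEnd, SP, WS, NL,
NLout, NotSentenceExtension, NotSmallCaps) that this file does not define, so
each language is expected to provide them before these rules are read. A
minimal sketch of a language-specific wrapper script, assuming hypothetical
file names that are not part of this change, could look like this:

    ! tokeniser-LANG.xfst is assumed to define Token and the symbol classes
    ! used by the shared splitter
    source tokeniser-LANG.xfst
    ! then compose the shared, language-independent sentence splitter on top
    source allsentencesplit.xfst
    ! and write the resulting network to disk (assumed output name)
    save stack tokeniser-LANG.fst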