Split tokenizer rules into language-specific and language-dependent Change-Id: I6e5eeabe9a306ce9fc3e62340ba7b948ecc4fa0f

commit: 78f6714529aea5209c4363edf94570a4f5686d19 [log] [tgz]
author: Akron <nils@diewald-online.de> Sat Apr 09 14:10:44 2022 +0200
committer: Akron <nils@diewald-online.de> Sat Apr 09 14:10:44 2022 +0200
tree: 907711b536979f8a39dd63a8f05364fd9f524331
parent: 61948ef87b5c0e556439fff72a270ac1f5ca9bc7 [diff]
diff --git a/src/all/allpost.xfst b/src/all/allpost.xfst
new file mode 100644
index 0000000..6ea6dcf
--- /dev/null
+++ b/src/all/allpost.xfst

@@ -0,0 +1,51 @@
+! General rules that require certain language specific definitions
+
+! Years: a bracketed number such as "(2018)", "[2018]" or "(2018.)" -
+! one or more digits with an optional trailing dot, enclosed in round or
+! square brackets.
+!
+! A solution to the "(author): problem" may be to add ) at the end of any
+! string as a possible ending
+define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
+
+! Times, e.g. 20:00 (as in "20:00 Uhr") or 00:12:25,34
+!   - first field: one digit, optionally preceded by a digit 0-5
+!   - followed by one or two ":"-separated fields of exactly two digits,
+!     the first of which is 0-5
+!   - optionally followed by "," and one to three digits (fraction)
+! The word "Uhr" itself is not part of this pattern.
+define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
+
+! The following scripts are executed in place. They rely on definitions
+! made before this point (e.g. Digit, AsciiLetter, Char, Sym, WS), and some
+! of them use Caseinsensitive, which is not defined in any of the all/*.xfst
+! files - it has to be defined before this file is sourced.
+
+! Emoticons (defines verticalemoticon and Emoticons)
+source all/emoticons.xfst
+
+! XML tags (defines XMLns and XML)
+source all/xml.xfst
+
+! XML entities (defines XMLentitiesDec, XMLentitiesHex, XMLentitiesStr
+! and XMLEntities)
+source all/entities.xfst
+
+! Technical protocols (defines URLChar, Dot, At, TldEnd, URL, Domain,
+! Email, SNS, FileEnd and File)
+source all/protocols.xfst
+
+! Numbers: groups of digits joined by single separator characters,
+! optionally followed by a run of Char (e.g. a unit or suffix directly
+! attached to the number).
+! Also supports
+!   19.4.2015, 19/4/2015 etc.
+! DigitPunct lists the separators allowed between two digit groups;
+! Slash covers "/" and its Unicode look-alikes.
+define DigitPunct ["_"|"-"|"."|","|Slash];
+define Num Digit+ [DigitPunct Digit+]* (Char+);
+
+! ordinals: one to three digits immediately followed by a dot,
+! e.g. "1.", "23.", "100."
+define Ord Digit ( Digit (Digit) ) %.;
+
+! TODO:
+!   floating point, serial, model numbers, ip addresses, etc.
+!   every other segment must have at least one digit
+
+! Omission words like "fu**ing!"
+! One or more Char, then at least two asterisks, then zero or more Char.
+! A trailing "!" as in the example is punctuation (not a Char) and is
+! therefore not part of the match.
+define Omission Char+ Asterisk Asterisk+ Char*;
+
+! acronyms: U.S.A., I.B.M., etc.
+! Requires at least three "letter + dot" pairs, so two-letter forms
+! like "U.S." are not matched by this definition.
+! Letter is not defined in the all/*.xfst files; presumably it comes from
+! the language-specific rules - confirm.
+! use a post-filter to remove dots
+define AcronymDep Letter %. Letter %. [Letter %.]+;
+
+
+! TODO: Name words with ' and `
+
+! Support ASCII elements, like
+! +---------------+
+! <---->, -->, <--
+!       +---------------+
+! <---> | Worker Node N |
+!       +---------------+
+! |============= Core =============|
+

diff --git a/src/all/allpref.xfst b/src/all/allpref.xfst
new file mode 100644
index 0000000..bb5183f
--- /dev/null
+++ b/src/all/allpref.xfst

@@ -0,0 +1,70 @@
+! This tokenizer is based on work by
+!  - StandardTokenizerImpl by the Lucene project
+!    under the Apache License
+!  - https://github.com/dlwh/epic by David Hall (2014)
+!    under the Apache License
+!  - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
+!    under the Apache License
+!  - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
+!    under the MIT License
+
+! NLout is the symbol that the sentence-splitting rules insert as a
+! token boundary marker.
+define NLout "@_TOKEN_SYMBOL_@";
+! Alternative (disabled): use a plain line feed as boundary marker
+! define NLout "\u000a";
+
+! Decimal digits. The zero is escaped (%0) because a bare 0 denotes the
+! empty string in the regular expression syntax.
+define Digit [%0|1|2|3|4|5|6|7|8|9];
+! Lowercase ASCII letters only; uppercase is not included here.
+define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
+! Hexadecimal digits in both cases
+define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
+! U+0004 (end of transmission); it is treated as a line break in NL.
+define EOT "\u0004";
+
+!!!!!!!!!!!!!!!!!
+! <from tmorph> !
+!!!!!!!!!!!!!!!!!
+! WS: horizontal whitespace - space, tab (U+0009), no-break space (U+00A0)
+! and further Unicode space characters (U+1680, U+2000..U+200A, U+202F,
+! U+205F, U+3000).
+define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
+           "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"| 
+           "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
+           "\u202f"|"\u205f"|"\u3000"];
+
+! NL: line breaking characters (U+000A..U+000D, U+0085, U+2028, U+2029)
+! plus the EOT marker.
+define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
+
+! Punctuation that ends sentences:
+! either a run of one or more ".", "?" and "!" in any combination
+! (e.g. ".", "...", "?!"), or a single ellipsis character.
+! Differs! (presumably from the tokenizer this section was taken from)
+define SP [["."|"?"|"!"]+|"…"];
+
+! Left punctuation: symbols that can open a bracketed or quoted span -
+! opening brackets, opening typographic quotes, the straight quotes
+! ' and ", two apostrophes in a row, Markdown markers and two commas
+! in a row.
+! Note that " * / _ also occur in RP and that ' also occurs in Apos.
+define LP ["("|"["|"{"|
+           "“"|"‘"|"‹"|"«"|
+           "'"|%"|
+           ! differs
+           ["'" "'"] |
+           "*"|"/"|"_"| ! Can be Markdown
+           ! from book
+           [%, %,]];
+
+! Right punctuation - excluding the characters that can be used as apostrophe
+! Contains sentence-ending punctuation (SP), comma, semicolon, colon,
+! closing brackets, closing quotes and Markdown markers. The single
+! characters ’ and ' are left out (see Apos); only their doubled forms
+! are listed here.
+define RP [SP|","|";"|":"|
+              ")"|"]"|"}"|
+              "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
+              "*"|"/"|"_"]; ! Can be Markdown
+
+! Double quotation marks that are considered in sentence endings
+! (see SentenceEnd)
+define DQuotes ["”"|%"|"»"|"«"];
+
+! Symbols that count as punctuation but are neither left nor right
+! punctuation by themselves
+define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
+! Characters that can be used as apostrophe
+define Apos %'|%’|%`;
+! All punctuation; everything in here is excluded from Char
+define Punct [LP|RP|Sym];
+! Disabled: complement of whitespace and punctuation
+!define nonSym \[WS|LP|RP|Sym];
+!!!!!!!!!!!!!!!!!!
+! </from tmorph> !
+!!!!!!!!!!!!!!!!!!
+
+! Emdash: two ASCII hyphens, optionally followed by further hyphens,
+! or a run of one or more Unicode long-dash characters.
+define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
+! Dash: a single ASCII hyphen or one of its short Unicode variants.
+define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
+! Slash: the ASCII slash and Unicode look-alikes.
+define Slash ["⁄"|"∕"|"／"|"/"];
+define Asterisk ["*"];
+
+! Char: any single symbol that is neither whitespace, line break,
+! punctuation nor apostrophe. Only the ASCII members of Dash and Slash
+! ("-" and "/") are part of Punct; their Unicode variants are not and
+! therefore count as Char.
+define Char \[WS|NL|Punct|Apos]; ! |¨;
+
+!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
+
+! SentenceEnd describes the longest sentence-final punctuation sequence
+! in already tokenized text (tokens are separated by NLout):
+!   SP NLout, followed by one of
+!     - a double quote, optionally followed by NLout ")"
+!     - one of › ‹ ’ ', optionally followed by either
+!       NLout double quote (optionally plus NLout ")") or NLout ")"
+!     - ")"
+!   and optionally followed by NLout and further SP.
+define SentenceEnd SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP);
+
+! Any single symbol that cannot continue a sentence ending as described
+! by SentenceEnd, i.e. none of the quotes, ")" or NLout.
+define NotSentenceExtension [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout];
\ No newline at end of file

diff --git a/src/all/allsentencesplit.xfst b/src/all/allsentencesplit.xfst
new file mode 100644
index 0000000..bef8cbd
--- /dev/null
+++ b/src/all/allsentencesplit.xfst

@@ -0,0 +1,19 @@
+! Builds the final network: Token is composed with three rules that insert
+! an additional NLout after sentence endings and with one rule that
+! removes whitespace.
+! Token and NotSmallCaps are not defined in the all/*.xfst files and have
+! to be defined before this script runs.
+! Notation: "X @-> ... NLout" inserts NLout behind the longest leftmost
+! match of X; with "\/" both contexts are presumably matched on the output
+! side of the rule (foma context notation) - confirm.
+echo - Introduce Sentence splitter
+! And compose with the removal of whitespace between tokens
+
+read regex Token .o. [
+  ! Put a Token boundary behind the longest possible
+  ! sentence ending punctuation sequence,
+  ! that isn't followed by a comma
+  SentenceEnd @-> ... NLout \/ _ NLout \%,
+] .o. [
+  ! Put a Token boundary behind a punctuation                      
+  ! that is not a start of a punctuation sequence
+  SP @-> ... NLout \/ NLout _ NLout NotSentenceExtension
+] .o. [
+  ! Put a Token boundary behind ... if not followed by a small character
+  ! (three dots, a token boundary, whitespace and then NotSmallCaps)
+  [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
+  ! Remove whitespace between Tokens
+  ! (runs of WS/NL at the start of the string or directly behind NLout)
+  [WS|NL]+ @-> 0 || [ .#. | NLout ] _
+];

diff --git a/src/all/emoticons.xfst b/src/all/emoticons.xfst
new file mode 100644
index 0000000..9664848
--- /dev/null
+++ b/src/all/emoticons.xfst

@@ -0,0 +1,42 @@
+! Partially by Park, Barash, Fink & Cha (2013)
+
+! Vertical ("upright") emoticons: two eye symbols with a mouth made of
+! "_", "." or "-" in between, e.g. T_T, ^_^, o.O, >.<
+! Variants using "*" as mouth repetition allow the mouth to be empty
+! (@@, ^^, ><); the others require at least one mouth character.
+define verticalemoticon [
+[ "ಠ" "_" "ಠ"]|
+[ "T" ["_"|"."|"-"]+ "T"] |
+[ "♥" ["_"|"."|"-"]+ "♥" ] |
+[ "@" ["_"|"."|"-"]* "@" ] |
+[ "*" ["_"|"."|"-"]+ "*" ] |
+[ "x" ["_"|"."|"-"]+ "x" ] |
+[ "X" ["_"|"."|"-"]+ "X" ] |
+[ "-" ["_"|"."]+ "-" ] |
+[ "." ["_"]+ "." ] |
+[ "^" ["_"|"."|"-"]* "^" ] |
+[ ">" ["_"|"."|"-"]* "<" ] |
+[ ["o"|"O"] ["_"|"."|"-"]+ ["o"|"O"] ] 
+];
+
+! Emoticons: union of
+!   - hearts: <3, </3, <333
+!   - vertical emoticons, optionally followed by ";"s or "^",
+!     or enclosed in round brackets
+!   - horizontal (sideways) emoticons in both reading directions
+!   - a few fixed forms and :word: shortcodes
+! Several alternatives are ambiguous with ordinary text; see the notes
+! inside the definition.
+define Emoticons [
+["<" ("/") "3"+] |
+verticalemoticon (";"+|"^") |
+["(" verticalemoticon ")"] |
+
+! May be end of brackets as well, like
+!   Author (2018):
+[ [")"|"("] ["'"|"-"|"o"]* [":"|"="|"x"] ] |
+! May be end of xml, like
+!   <b class="emp">=</b>
+[ ["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ["'"|"-"|"o"]* ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"]] |
+[ ["D"|">"] ("'") ":"] |
+
+! May be end of square bracket
+!   Author [2018]:
+["]" ":"] |
+[(">") [";"|":"] ["-"|"*"]* [ ")" | "(" | %] | %[ ]+ ] |
+[(">") [";"|":"] ["-"]* ["*"|"P"|"p"|"o"|"O"|"D"]] |
+["x" "("] |
+["^" (".") "^"] |
+[%\ ["{" "o" "}"|"o"|"m"] "/"] |
+[":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":"] |
+[">" "_" "<"] |
+["*" "<" ":" "-" ")"]
+];

diff --git a/src/all/entities.xfst b/src/all/entities.xfst
new file mode 100644
index 0000000..a5b3149
--- /dev/null
+++ b/src/all/entities.xfst

@@ -0,0 +1,5 @@
+! Body of a decimal character reference: "#" and at least two digits
+define XMLentitiesDec ["#" Digit Digit+ ];
+! Body of a hexadecimal character reference: "#", "x" or "X" and at
+! least two hex digits
+define XMLentitiesHex ["#" ["x"|"X"] HexLetter HexLetter+ ];
+! A single character of a named entity: an ASCII letter in the case
+! variants produced by Caseinsensitive (defined elsewhere), a digit,
+! "_" or "-"
+define XMLentitiesStr [[[AsciiLetter .o. Caseinsensitive].l | Digit | "_" | "-" ] ];
+
+! A complete entity: "&", one of the bodies above, ";".
+! Named entities need at least two characters, e.g. "&lt;" or "&amp;".
+define XMLEntities ["&" [XMLentitiesDec|XMLentitiesHex|XMLentitiesStr XMLentitiesStr+] ";"];
\ No newline at end of file

diff --git a/src/all/protocols.xfst b/src/all/protocols.xfst
new file mode 100644
index 0000000..34b7b41
--- /dev/null
+++ b/src/all/protocols.xfst

@@ -0,0 +1,39 @@
+! Characters allowed inside URLs and email addresses: any Char plus the
+! symbols in Sym except "<" and ">". The double quote is subtracted as
+! well, although it is not a member of Sym in the first place.
+define URLChar [Char|[Sym - ["<"|">"|%"]]];
+
+! Dot and At also accept obfuscated spellings such as "[dot]", "(dot)",
+! "[at]" or "(at)"; opening and closing bracket types may be mixed.
+! NOTE(review): the result of the composition with Caseinsensitive
+! (defined elsewhere) is used without taking a projection, unlike
+! XMLentitiesStr in entities.xfst - confirm that this is intended.
+define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
+define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
+
+! The only top level domains recognized by Domain
+define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;
+
+! Very relaxed URL scheme, not based on the strict Lucene implementation
+! A URL starts with "http://", "https://", "ftp://", "file://" or "www"
+! followed by Dot, and continues with at least two URLChar. Sentence
+! punctuation (SP) may occur inside, but not as first or last character
+! of that remainder.
+define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ]
+URLChar [URLChar|SP]* URLChar
+.o. Caseinsensitive;
+
+! Domain names: Char runs joined by single dashes, followed by Dot and
+! one of the top level domains in TldEnd, e.g. "example.org",
+! "my-site.de"
+define Domain Char+ [Dash Char+]* Dot TldEnd;
+
+! Email addresses
+! local part, At, and a domain part with at least one Dot
+define Email URLChar+ At URLChar+ [Dot URLChar+]+;
+
+! Twitter user, hashtag, Google+
+! i.e. "@", "#" or "+" directly followed by one or more Char
+define SNS ["@"|"#"|"+"] Char+;
+
+! Known file name extensions (without the leading dot), composed with
+! Caseinsensitive (defined elsewhere). Optional parts cover variants:
+! htm/html, doc/docx, jpg/jpeg, mp3/mp4, ppt/pptx.
+define FileEnd [
+                [{htm} ("l")]|
+                [{doc} ("x")]|
+                {pdf}|
+                ["j" "p" ("e") "g"]|
+                ["m" "p" ["3"|"4"]]|
+                {ogg}|
+                {png}|
+                [{ppt} ("x")]|
+                {avi}|
+                {txt}|
+                {xls}|
+                {xml}|
+                {aac}|
+                {gif}|
+                {exe}
+                ] .o. Caseinsensitive;
+
+! File names with a known extension, optionally preceded by a path:
+!   - optional path: a drive prefix (lowercase ASCII letter, ":", "\")
+!     or a leading "/", followed by path characters, where "\" and "/"
+!     are only accepted directly behind a Char
+!   - the file name proper: one or more Char, "-" or "_"
+!   - a literal "." and one of the extensions in FileEnd
+define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;
\ No newline at end of file

diff --git a/src/all/xml.xfst b/src/all/xml.xfst
new file mode 100644
index 0000000..06e247d
--- /dev/null
+++ b/src/all/xml.xfst

@@ -0,0 +1,19 @@
+! XML rule
+! XMLns: an element or attribute name with an optional namespace prefix.
+! Each part starts with a letter and continues with letters, digits
+! or "-". Composed with Caseinsensitive (defined elsewhere).
+define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive;
+! XML: a single tag, which is either
+!   - an opening or self-closing tag: a name, any number of attributes
+!     (with an optional quoted value) and an optional "/" before ">"
+!   - a closing tag: "/" and a name
+! Attribute values must be non-empty and may contain neither ">" nor the
+! quote character that encloses them.
+! The upper side (.u) of the resulting network is taken.
+define XML [
+  "<" [
+      [
+        XMLns
+        [WS+ XMLns WS*
+          (%= WS*
+            [[%" [? - %" - %>]+ %"] | [%' [? - %' - %>]+ %']]
+          )
+        ]*
+        (WS* "/")   
+      ]
+      |
+      [
+        "/" XMLns
+      ]
+    ] WS* ">"
+].u;
commit	78f6714529aea5209c4363edf94570a4f5686d19	[log] [tgz]
author	Akron <nils@diewald-online.de>	Sat Apr 09 14:10:44 2022 +0200
committer	Akron <nils@diewald-online.de>	Sat Apr 09 14:10:44 2022 +0200
tree	907711b536979f8a39dd63a8f05364fd9f524331
parent	61948ef87b5c0e556439fff72a270ac1f5ca9bc7 [diff]