Split tokenizer rules into language-specific and language-independent
Change-Id: I6e5eeabe9a306ce9fc3e62340ba7b948ecc4fa0f
diff --git a/src/all/allpost.xfst b/src/all/allpost.xfst
new file mode 100644
index 0000000..6ea6dcf
--- /dev/null
+++ b/src/all/allpost.xfst
@@ -0,0 +1,51 @@
+! General rules that require certain language-specific definitions
+
+! A solution to the "(author): problem" may be to add ) at the end of any
+! string as a possible ending
+define Years [ "(" Digit+ (".") ")" | "[" Digit+ (".") "]" ]; ! digits with optional dot in () or [], e.g. (2018) or [2018.]
+
+! Clock times and durations such as 20:00 (as in "20:00 Uhr") or 00:12:25,34
+define Times [ ( ["0"|"1"|"2"|"3"|"4"|"5"] ) Digit [ ":" ["0"|"1"|"2"|"3"|"4"|"5"] Digit ]^{1,2} ( "," [Digit]^{1,3} ) ];
+
+! Emoticons (defines Emoticons)
+source all/emoticons.xfst
+
+! XML tags (defines XML)
+source all/xml.xfst
+
+! XML character and entity references (defines XMLEntities)
+source all/entities.xfst
+
+! URLs, domains, e-mail addresses, social-network handles and file names
+source all/protocols.xfst
+
+! Numbers whose digit groups are joined by separators, which also covers
+! dates like 19.4.2015 or 19/4/2015; an optional Char+ suffix may follow
+define DigitPunct [Slash|","|"."|"-"|"_"];
+define Num [ Digit+ [DigitPunct Digit+]* (Char+) ];
+
+! Ordinals: one to three digits followed by a dot
+define Ord Digit^{1,3} ".";
+
+! TODO:
+! floating point, serial, model numbers, ip addresses, etc.
+! every other segment must have at least one digit
+
+! Censored words containing two or more consecutive asterisks, like "fu**ing"
+define Omission [ Char+ Asterisk+ Asterisk Char* ];
+
+! Acronyms of three or more dotted letters: U.S.A., I.B.M., etc.
+! The dots are meant to be removed by a post-filter
+define AcronymDep [Letter "."]^2 [Letter "."]+;
+
+
+! TODO: Name words with ' and `
+
+! Support ASCII elements, like
+! +---------------+
+! <---->, -->, <--
+! +---------------+
+! <---> | Worker Node N |
+! +---------------+
+! |============= Core =============|
+
diff --git a/src/all/allpref.xfst b/src/all/allpref.xfst
new file mode 100644
index 0000000..bb5183f
--- /dev/null
+++ b/src/all/allpref.xfst
@@ -0,0 +1,70 @@
+! This tokenizer is based on work by
+! - StandardTokenizerImpl by the Lucene project
+! under the Apache License
+! - https://github.com/dlwh/epic by David Hall (2014)
+!   under the Apache License
+! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
+! under the Apache License
+! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
+! under the MIT License
+
+define NLout "@_TOKEN_SYMBOL_@"; ! marker symbol that the rewrite rules insert as token boundary
+! Disabled alternative that would use a line feed instead: define NLout "\u000a";
+
+define Digit ["0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"];
+define AsciiLetter ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"];
+define HexLetter [Digit|"a"|"b"|"c"|"d"|"e"|"f"|"A"|"B"|"C"|"D"|"E"|"F"];
+define EOT "\u0004"; ! U+0004 END OF TRANSMISSION
+
+!!!!!!!!!!!!!!!!!
+! <from tmorph> !
+!!!!!!!!!!!!!!!!!
+define WS [
+  " "|"\u0009"|"\u00a0"|"\u1680"|"\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|
+  "\u2005"|"\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|"\u202f"|"\u205f"|"\u3000"
+]; ! horizontal whitespace: space, tab, no-break space and further Unicode space characters
+! Line and paragraph breaks; EOT is treated as a break as well
+define NL [EOT|"\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"];
+
+! Sentence-final punctuation: a run of ".", "?" and "!", or a single ellipsis character
+! NOTE: differs (presumably from the TRmorph definition this section is based on)
+define SP [ "…" | ["!"|"?"|"."]+ ];
+
+! Left (opening) punctuation: brackets, opening quotes and doubled quote stand-ins
+define LP ["("|"["|"{"|
+ "“"|"‘"|"‹"|"«"|
+ "'"|%"|
+ ! differs: two consecutive apostrophes (presumably an ASCII stand-in for a double quote)
+ ["'" "'"] |
+ "*"|"/"|"_"| ! can be Markdown markup
+ ! from book: two consecutive commas (presumably a stand-in for a low opening quote)
+ [%, %,]];
+
+! Right (closing) punctuation - single characters that can be used as apostrophe are left out, only their doubled forms are listed
+define RP [SP|","|";"|":"|
+ ")"|"]"|"}"|
+ "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
+ "*"|"/"|"_"]; ! can be Markdown markup
+
+define DQuotes [%"|"”"|"«"|"»"]; ! double quote characters
+
+define Sym ["-"|"+"|"<"|">"|"*"|"/"|"="|"@"|"&"];
+define Apos ["'"|"’"|"`"]; ! characters that can serve as apostrophe
+define Punct [Sym|LP|RP];
+!define nonSym \[WS|LP|RP|Sym];
+!!!!!!!!!!!!!!!!!!
+! </from tmorph> !
+!!!!!!!!!!!!!!!!!!
+
+define Emdash [ "-" "-" "-"* | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+ ]; ! two or more ASCII hyphens, or a run of long dash characters
+define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"]; ! a single hyphen-like character
+define Slash ["⁄"|"∕"|"/"|"\uff0f"]; ! fraction slash, division slash, ASCII solidus, fullwidth solidus (U+FF0F)
+define Asterisk "*";
+
+define Char \[Punct|Apos|WS|NL]; ! any single symbol that is not whitespace, a line break, punctuation or an apostrophe (a former |¨ alternative is disabled)
+
+!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
+! Sentence end: SP, a token boundary, then quote characters and/or ")" (consecutive closers are separated by NLout), optionally followed by a boundary and further SP
+define SentenceEnd SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP);
+! Any symbol other than the quote characters, ")" and the token boundary - presumably whatever cannot continue a SentenceEnd
+define NotSentenceExtension [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout];
\ No newline at end of file
diff --git a/src/all/allsentencesplit.xfst b/src/all/allsentencesplit.xfst
new file mode 100644
index 0000000..bef8cbd
--- /dev/null
+++ b/src/all/allsentencesplit.xfst
@@ -0,0 +1,19 @@
+echo - Introduce Sentence splitter
+! The rules below are composed onto Token in order. The last one removes whitespace
+
+read regex Token .o. [
+ ! Put a token boundary (NLout) behind the longest possible
+ ! sentence-ending punctuation sequence
+ ! that is followed by a boundary and any symbol other than a comma
+ SentenceEnd @-> ... NLout \/ _ NLout \%,
+] .o. [
+ ! Put a token boundary behind sentence punctuation that stands between two
+ ! boundaries and is followed by a NotSentenceExtension symbol
+ SP @-> ... NLout \/ NLout _ NLout NotSentenceExtension
+] .o. [
+ ! Put a token boundary behind "..." if a boundary, whitespace and NotSmallCaps follow (NotSmallCaps is defined elsewhere - presumably anything but a lowercase letter)
+ [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
+ ! Remove whitespace and line breaks at the start of the input and directly after a token boundary
+ [WS|NL]+ @-> 0 || [ .#. | NLout ] _
+];
diff --git a/src/all/emoticons.xfst b/src/all/emoticons.xfst
new file mode 100644
index 0000000..9664848
--- /dev/null
+++ b/src/all/emoticons.xfst
@@ -0,0 +1,42 @@
+! Partially by Park, Barash, Fink & Cha (2013)
+! Vertical (upright) emoticons: two eye symbols around a mouth made of "_", "." or "-", e.g. T_T, ^_^ or o.O
+define verticalemoticon [
+[ "ಠ" "_" "ಠ"]|
+[ "T" ["_"|"."|"-"]+ "T"] |
+[ "♥" ["_"|"."|"-"]+ "♥" ] |
+[ "@" ["_"|"."|"-"]* "@" ] |
+[ "*" ["_"|"."|"-"]+ "*" ] |
+[ "x" ["_"|"."|"-"]+ "x" ] |
+[ "X" ["_"|"."|"-"]+ "X" ] |
+[ "-" ["_"|"."]+ "-" ] |
+[ "." ["_"]+ "." ] |
+[ "^" ["_"|"."|"-"]* "^" ] |
+[ ">" ["_"|"."|"-"]* "<" ] |
+[ ["o"|"O"] ["_"|"."|"-"]+ ["o"|"O"] ]
+];
+! Emoticons: hearts like <3, vertical emoticons (optionally in round brackets), sideways smileys, :shortcodes: and a few fixed forms
+define Emoticons [
+["<" ("/") "3"+] |
+verticalemoticon (";"+|"^") |
+["(" verticalemoticon ")"] |
+
+! Reversed smileys such as (-: - these may be the end of brackets as well, like
+! Author (2018):
+[ [")"|"("] ["'"|"-"|"o"]* [":"|"="|"x"] ] |
+! Eyes, optional nose and mouth, such as :-) or 8-D - these may be the end of xml as well, like
+! <b class="emp">=</b>
+[ ["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ["'"|"-"|"o"]* ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"]] |
+[ ["D"|">"] ("'") ":"] |
+
+! May be the end of a square bracket as well, like
+! Author [2018]:
+["]" ":"] |
+[(">") [";"|":"] ["-"|"*"]* [ ")" | "(" | %] | %[ ]+ ] |
+[(">") [";"|":"] ["-"]* ["*"|"P"|"p"|"o"|"O"|"D"]] |
+["x" "("] |
+["^" (".") "^"] |
+[%\ ["{" "o" "}"|"o"|"m"] "/"] |
+[":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":"] |
+[">" "_" "<"] |
+["*" "<" ":" "-" ")"]
+];
diff --git a/src/all/entities.xfst b/src/all/entities.xfst
new file mode 100644
index 0000000..a5b3149
--- /dev/null
+++ b/src/all/entities.xfst
@@ -0,0 +1,5 @@
+define XMLentitiesDec "#" Digit+ Digit;
+define XMLentitiesHex "#" ["X"|"x"] HexLetter+ HexLetter;
+define XMLentitiesStr [ "-" | "_" | Digit | [AsciiLetter .o. Caseinsensitive].l ];
+! A reference is "&", then a decimal code, a hexadecimal code or a name of two or more name characters, then ";"
+define XMLEntities [ "&" [ XMLentitiesStr+ XMLentitiesStr | XMLentitiesHex | XMLentitiesDec ] ";" ];
\ No newline at end of file
diff --git a/src/all/protocols.xfst b/src/all/protocols.xfst
new file mode 100644
index 0000000..34b7b41
--- /dev/null
+++ b/src/all/protocols.xfst
@@ -0,0 +1,39 @@
+define URLChar [Char|[Sym - ["<"|">"|%"]]]; ! a Char, or a symbol from Sym other than "<" and ">"
+! Dot and At also accept the spelled-out forms in round or square brackets, like [dot] and (at). NOTE(review): .o. binds weaker than |, so the whole union is composed with Caseinsensitive (defined elsewhere) - confirm this is intended
+define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
+define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
+! Top-level domain endings accepted by Domain
+define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;
+
+! Very relaxed URL scheme, not based on the strict Lucene implementation
+define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ] ! prefix: http, https, ftp or file plus "://", or "www" plus Dot
+URLChar [URLChar|SP]* URLChar ! body: starts and ends with a URLChar, may contain SP in between
+.o. Caseinsensitive;
+! Domain names: Char+ parts joined by Dash, followed by Dot and one of the TldEnd endings
+define Domain Char+ [Dash Char+]* Dot TldEnd;
+
+! Email addresses: URLChar+, At, URLChar+ and then one or more Dot-prefixed URLChar+ parts
+define Email URLChar+ At URLChar+ [Dot URLChar+]+;
+
+! Twitter user, hashtag, Google+: "@", "#" or "+" followed by Char+
+define SNS ["@"|"#"|"+"] Char+;
+
+define FileEnd [
+  {aac}|
+  {avi}|
+  [{doc} ("x")]|
+  {exe}|
+  {gif}|
+  [{htm} ("l")]|
+  [{jp} ("e") "g"]|
+  [{mp} ["3"|"4"]]|
+  {ogg}|
+  {pdf}|
+  {png}|
+  [{ppt} ("x")]|
+  {txt}|
+  {xls}|
+  {xml}
+  ] .o. Caseinsensitive; ! known file extensions (without the dot), in alphabetical order
+! File names: optional path prefix (a drive letter with ":\" or a leading "/", then path segments), a base name of Char, "-" and "_", then "." and a FileEnd extension
+define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;
\ No newline at end of file
diff --git a/src/all/xml.xfst b/src/all/xml.xfst
new file mode 100644
index 0000000..06e247d
--- /dev/null
+++ b/src/all/xml.xfst
@@ -0,0 +1,19 @@
+! XML tags: opening tags with optional attributes, self-closing tags and closing tags
+define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive; ! a name, optionally followed by ":" and a second name
+define XML [
+ "<" [
+ [
+ XMLns
+ [WS+ XMLns WS* ! attribute name
+ (%= WS* ! optional "=" and value
+ [[%" [? - %" - %>]+ %"] | [%' [? - %' - %>]+ %']] ! quoted value of at least one symbol that contains neither its quote character nor ">"
+ )
+ ]*
+ (WS* "/") ! optional slash of a self-closing tag
+ ]
+ |
+ [
+ "/" XMLns ! closing tag
+ ]
+ ] WS* ">"
+].u; ! .u keeps the upper side of the relation. NOTE(review): empty attribute values like a="" are not matched - confirm this is intended