| ! Foma complains when this reuses the above definition |
| define CapitalCaseinsensitive [ |
| ! Letter: AsciiLetter or one of the listed accented letters / ß, composed with Caseinsensitive. |
| ! NOTE(review): Caseinsensitive is defined outside this excerpt; presumably it adds the other-case variants - confirm. |
| define Letter [ [ AsciiLetter | ö | ü | ä | è | é | ú | á | â | ê | î | ô | û | ß ] .o. Caseinsensitive ]; |
| ! NotSmallCaps: any single symbol (?) except lower-case a-z and the listed lower-case accented letters. |
| ! NOTE(review): ß appears in Letter but is not subtracted here - confirm whether that is intentional. |
| define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û]; |
| ! Irrelevant because of the more general rule that follows |
| ! define Clitics [ Apos [{ll}|d|{ve}|s|{re}|m|n|{em}] .o. Caseinsensitive ] | ["n" Apos "t"] .o. Caseinsensitive ]; |
| ! Basic word: one or more Char, optionally continued by groups of (Apos or Asterisk) followed by one or more Char. |
| define Word Char+ ([Apos|Asterisk] Char+)*; |
| ! Plusampersand: entries read from the word list de/plusampersand.txt. |
| ! NOTE(review): this list is loaded from de/ while the abbreviation list used by Abbr comes from en/ - confirm the intended language directory. |
| define Plusampersand @txt"de/plusampersand.txt"; |
| ! Redefines Word using its previous value: a Plusampersand entry or basic word, |
| ! optionally followed by further Dash-joined Plusampersand entries or basic words. |
| define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*; |
| ! Abbreviations and Initials |
| ! Months: abbreviated month names; the (t) makes both "Sep" and "Sept" match. |
| ! "May" is not listed - presumably because it has no shortened form that takes a period (Abbr appends one); confirm. |
| define Months [{Jan}|{Feb}|{Mar}|{Apr}|{Jun}|{Jul}|{Aug}|{Sep}(t)|{Oct}|{Nov}|{Dec}]; |
| ! Abbr: an entry from en/abbrv.txt, a single Letter (an initial), or a Months abbreviation, |
| ! composed with CapitalCaseinsensitive and followed by a literal period (%.). |
| define Abbr [ [ @txt"en/abbrv.txt" | Letter | Months ] .o. CapitalCaseinsensitive ] %.; |
| ! Print a progress message, then define RealToken as the union of all token classes. |
| ! Abbr and Word are defined in this excerpt; the remaining classes are defined outside it. |
| echo - Compile Real Token |
| define RealToken [Punct|Emdash|Abbr|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission]; |
| echo - Introduce Token splitter |
| [Emoticons|Arrows] @-> ... NLout |
| ! Read and execute the commands in all/allsentencesplit.xfst (presumably the shared sentence-splitting rules - confirm). |
| source all/allsentencesplit.xfst |
| ! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b |