| ! Letter: a single alphabetic symbol. The base set is AsciiLetter plus the |
| ! umlauts, ß and the accented vowels listed below; it is then composed (.o.) |
| ! with Caseinsensitive (defined elsewhere -- presumably adds the other-case |
| ! variants; confirm against its definition). |
| define Letter [ [ AsciiLetter | ä | ö | ü | ß | á | é | ú | è | â | ê | î | ô | û ] .o. Caseinsensitive ]; |
| ! NotSmallCaps: any single symbol (?) that is not one of the lowercase letters |
| ! known to this tokenizer: a-z, the lowercase umlauts/accented vowels, and ß. |
| ! ß is subtracted too so that this set agrees with Letter, which lists ß as a |
| ! letter; ß is a lowercase letter and must not count as "not small". |
| define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û - ß]; |
| ! Plusampersand: word list loaded from de/plusampersand.txt via @txt. |
| define Plusampersand @txt"de/plusampersand.txt"; |
| ! Simple word: one or more Char, optionally continued by groups of |
| ! (apostrophe or asterisk) + Char+, optionally ending in s/S followed by |
| ! a ’ or ` mark. |
| define Word [ Char+ [ [Asterisk|Apos] Char+ ]* ( [S|s] [%`|%’] ) ]; |
| ! Redefine Word: a simple word or a Plusampersand entry, followed by any |
| ! number of Dash-joined parts of the same two kinds. |
| define Word [Word | Plusampersand] [ Dash [Word | Plusampersand] ]*; |
| ! Abbreviations and initials. |
| ! Abbr is either a single Letter (an initial) or an entry of the abbreviation |
| ! list de/abbrv.txt, in both cases followed by a literal period. |
| ! The abbreviation list itself belongs to the sentence splitter tool. |
| define Abbr [ Letter | @txt"de/abbrv.txt" ] %.; |
| ! Streetname: a Word directly followed by the letters "str" and a literal |
| ! period, kept together as one unit. |
| define Streetname [ Word s t r %. ]; |
| echo - Compile Real Token |
| ! RealToken: union of all token classes defined for this tokenizer |
| ! (word-like, numeric/date-like, punctuation-like, markup entities). |
| define RealToken [ Word | Abbr | Streetname | SNS | AcronymDep | Num | Ord | Years | Times | Punct | Emdash | Omission | XMLEntities ]; |
| echo - Introduce Token splitter |
| [Emoticons|Arrows] @-> ... NLout |
| ! Read and execute the shared script all/allsentencesplit.xfst at this point |
| ! (judging by its name it holds the sentence-splitting rules -- its contents |
| ! are not visible here). |
| source all/allsentencesplit.xfst |
| ! Example invocation (run this script with foma, then feed text through flookup): |
| ! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b |