| source all/allpref.xfst |
| |
| ! Optional-replacement rules mapping each listed lowercase letter to its |
| ! uppercase counterpart (and ß to the two-letter string SS). |
| ! The (->) operator makes every replacement optional, so composing a lowercase |
| ! pattern with this relation also admits its upper- and mixed-case spellings. |
| define Caseinsensitive [ |
| a (->) A, |
| b (->) B, |
| c (->) C, |
| d (->) D, |
| e (->) E, |
| f (->) F, |
| g (->) G, |
| h (->) H, |
| i (->) I, |
| j (->) J, |
| k (->) K, |
| l (->) L, |
| m (->) M, |
| n (->) N, |
| o (->) O, |
| p (->) P, |
| q (->) Q, |
| r (->) R, |
| s (->) S, |
| t (->) T, |
| u (->) U, |
| v (->) V, |
| w (->) W, |
| x (->) X, |
| y (->) Y, |
| z (->) Z, |
| ö (->) Ö, |
| ü (->) Ü, |
| ä (->) Ä, |
| è (->) È, |
| é (->) É, |
| ú (->) Ú, |
| á (->) Á, |
| â (->) Â, |
| ê (->) Ê, |
| î (->) Î, |
| ô (->) Ô, |
| û (->) Û, |
| ß (->) {SS} |
| ]; |
| |
| ! Same optional lowercase-to-uppercase rules as Caseinsensitive, but restricted |
| ! by the context "|| .#. _" to the position right after the string boundary, |
| ! i.e. only the first letter of the string may be capitalised. |
| ! The rule list is repeated in full because Foma complains when this reuses |
| ! the above definition. Keep the two lists in sync when adding letters. |
| define CapitalCaseinsensitive [ |
| a (->) A, |
| b (->) B, |
| c (->) C, |
| d (->) D, |
| e (->) E, |
| f (->) F, |
| g (->) G, |
| h (->) H, |
| i (->) I, |
| j (->) J, |
| k (->) K, |
| l (->) L, |
| m (->) M, |
| n (->) N, |
| o (->) O, |
| p (->) P, |
| q (->) Q, |
| r (->) R, |
| s (->) S, |
| t (->) T, |
| u (->) U, |
| v (->) V, |
| w (->) W, |
| x (->) X, |
| y (->) Y, |
| z (->) Z, |
| ö (->) Ö, |
| ü (->) Ü, |
| ä (->) Ä, |
| è (->) È, |
| é (->) É, |
| ú (->) Ú, |
| á (->) Á, |
| â (->) Â, |
| ê (->) Ê, |
| î (->) Î, |
| ô (->) Ô, |
| û (->) Û, |
| ß (->) {SS} |
| || .#. _ ]; |
| |
| ! A single letter in either case: AsciiLetter (defined outside this section) or |
| ! one of the listed umlaut/accented letters or ß, composed with Caseinsensitive |
| ! so that the uppercase forms are admitted as well. |
| define Letter [ [ AsciiLetter | ö | ü | ä | è | é | ú | á | â | ê | î | ô | û | ß ] .o. Caseinsensitive ]; |
| |
| ! Any single symbol except the lowercase letters listed here (a-z plus the |
| ! umlaut and accented lowercase vowels). |
| ! NOTE(review): ß is part of Letter but is not excluded here, so it counts as |
| ! "not small caps" -- confirm that this is intended. |
| define NotSmallCaps [ ? - [ a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | q | r | s | t | u | v | w | x | y | z | ü | ö | ä | è | é | ú | á | â | ê | î | ô | û ] ]; |
| |
| ! Irrelevant because of the more general rule that follows |
| ! define Clitics [ Apos [{ll}|d|{ve}|s|{re}|m|n|{em}] .o. Caseinsensitive ] | ["n" Apos "t"] .o. Caseinsensitive ]; |
| |
| ! Basic word: one or more Char, followed by any number of further Char runs |
| ! that are each introduced by a single Apos or Asterisk. |
| ! Char, Apos and Asterisk are defined outside this section. |
| define Word Char+ ([Apos|Asterisk] Char+)*; |
| |
| ! Entries read from an external word-list file via @txt. |
| ! NOTE(review): this list is loaded from de/ while the abbreviation list used |
| ! for Abbr is loaded from en/ -- confirm the path is intentional. |
| define Plusampersand @txt"de/plusampersand.txt"; |
| ! Redefinition of Word: the Word on the right-hand side is the basic word |
| ! defined above. A word is now a Plusampersand entry or a basic word, |
| ! optionally continued by further such parts, each preceded by a Dash. |
| define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*; |
| |
| ! Abbreviations and Initials |
| ! Month-name abbreviations, written without the trailing period; "Sep" may be |
| ! followed by an optional "t". |
| define Months [{Jan}|{Feb}|{Mar}|{Apr}|{Jun}|{Jul}|{Aug}|{Sep}(t)|{Oct}|{Nov}|{Dec}]; |
| ! An abbreviation is an entry from the abbreviation list, a single Letter, or a |
| ! Months entry -- composed with CapitalCaseinsensitive so the first letter may |
| ! be capitalised -- followed by a literal period (%.). |
| define Abbr [ [ @txt"en/abbrv.txt" | Letter | Months ] .o. CapitalCaseinsensitive ] %.; |
| |
| source all/allpost.xfst |
| |
| echo - Compile Real Token |
| |
| ! Union of the token classes that the first rule in Token marks. |
| ! Abbr and Word are defined above; the remaining operands are defined outside |
| ! this section. |
| define RealToken [Punct|Emdash|Abbr|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission]; |
| |
| ! Treatment for I as a word in "M. I. Baxter was killed in World War I. So was I." |
| ! Words after which a following "I" is marked as a token of its own; used as |
| ! the left context of the "I" rule at the end of Token. |
| define NonAbbrI [ {am}|{was}|{will}|{have}|{had}|{would}|{do}|{did}|{and}|{War}|{than}|{not}|[P|p]{art} ]; |
| |
| |
| echo - Introduce Token splitter |
| |
| ! Token splitter. The first bracket is a set of comma-separated (parallel) |
| ! left-to-right longest-match markup rules (@-> ...): each match of one of the |
| ! listed classes is kept and NLout is inserted directly after it. |
| ! XML, URL, Email, File, Domain, Emoticons, Arrows and NLout are defined |
| ! outside this section. |
| ! The result is composed with a second rule that inserts NLout after "I" when |
| ! it is preceded by a NonAbbrI word and one or more WS/NLout symbols. The \/ |
| ! operator has that context checked on the output side of the rule. |
| define Token [ |
| RealToken @-> ... NLout, |
| XML @-> ... NLout, |
| URL @-> ... NLout, |
| Email @-> ... NLout, |
| File @-> ... NLout, |
| Domain @-> ... NLout, |
| [Emoticons|Arrows] @-> ... NLout |
| ] .o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ]; |
| |
| source all/allsentencesplit.xfst |
| |
| ! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b |