Akron | 78f6714 | 2022-04-09 14:10:44 +0200 | [diff] [blame] | 1 | source all/allpref.xfst |
| 2 | |
! Caseinsensitive: parallel optional replace rules, one per letter.
! Each rule may rewrite a lowercase letter as its uppercase form; because
! the arrow is optional, the unchanged lowercase letter stays a valid
! output as well. The letter ß is optionally expanded to the two-letter
! string SS.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  è (->) È, é (->) É, ú (->) Ú, á (->) Á,
  â (->) Â, ê (->) Ê, î (->) Î, ô (->) Ô, û (->) Û,
  ß (->) {SS}
];
| 44 | |
! Letter: one symbol from AsciiLetter or from the listed accented
! letters and ß, together with the case variants that Caseinsensitive
! produces for it (SS for ß).
! The composition with Caseinsensitive yields a transducer; the lower
! projection (.l) reduces it to the plain language of its output strings,
! so that Letter is a language wherever it is used as a token pattern
! (for example inside Abbr).
define Letter [ [ AsciiLetter | ö | ü | ä | è | é | ú | á | â | ê | î | ô | û | ß ] .o. Caseinsensitive ].l;
| 46 | |
! NotSmallCaps: any single symbol except the lowercase letters listed
! below.
! NOTE(review): ß is part of Letter but is not excluded here - confirm
! that this is intended.
define NotSmallCaps [ ? - [
    a | b | c | d | e | f | g | h | i | j | k | l | m
  | n | o | p | q | r | s | t | u | v | w | x | y | z
  | ü | ö | ä | è | é | ú | á | â | ê | î | ô | û
] ];
| 48 | |
! Word, step 1: one or more Char, optionally continued by groups of an
! Apos or Asterisk followed by further Char, with an optional trailing
! s or S plus a typographic apostrophe or backtick.
define Word [
  Char+
  [ [ Apos | Asterisk ] Char+ ]*
  ( [ s | S ] [ %’ | %` ] )
];

! Plusampersand: entries read from the text file de/plusampersand.txt.
define Plusampersand @txt"de/plusampersand.txt";

! Word, step 2 (intentional redefinition): the right-hand side still
! refers to the step 1 Word. A final Word is a Plusampersand entry or a
! step 1 Word, optionally followed by further such parts, each introduced
! by a Dash.
define Word [
  [ Plusampersand | Word ]
  [ Dash [ Plusampersand | Word ] ]*
];
| 53 | |
! Abbreviations and initials: a single Letter or an entry of the
! abbreviation list, followed by a period.
! The abbreviation list is part of the sentence splitter tool of the IDS.
define Abbr [ Letter | @txt"de/abbrv.txt" ] %.;
| 58 | |
! Streetname: a Word immediately followed by the letters s, t, r and a
! period (presumably abbreviated street names - verify against the
! tokenizer tests).
define Streetname [ Word s t r %. ];
| 60 | |
| 61 | source all/allpost.xfst |
| 62 | |
echo - Compile Real Token

! RealToken: union of the token classes that are handled by a single
! rule in Token. Several operands (e.g. Punct, Num, Years) are not
! defined in this file; they are expected from the sourced scripts.
define RealToken [
    Punct | Emdash | Abbr | Streetname | Word | SNS | AcronymDep
  | Ord | Num | Years | Times | XMLEntities | Omission
];
| 66 | |
echo - Introduce Token splitter

! Token: parallel left-to-right longest-match markup rules. Each rule
! keeps the matched string unchanged (the ... placeholder) and inserts
! NLout directly after it.
define Token [
  RealToken            @-> ... NLout,
  XML                  @-> ... NLout,
  URL                  @-> ... NLout,
  Email                @-> ... NLout,
  File                 @-> ... NLout,
  Domain               @-> ... NLout,
  [ Emoticons|Arrows ] @-> ... NLout
];
| 78 | |
| 79 | source all/allsentencesplit.xfst |
| 80 | |
| 81 | ! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b |