Akron | be3d366 | 2023-04-26 13:22:38 +0200 | [diff] [blame] | 1 | source all/allpref.xfst |
| 2 | |
! Optional upcasing: each listed lowercase letter may stay as it is or be
! replaced by its uppercase form. ß may optionally be rewritten as SS.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  è (->) È, é (->) É, ú (->) Ú, á (->) Á,
  â (->) Â, ê (->) Ê, î (->) Î, ô (->) Ô, û (->) Û,
  ß (->) {SS}
];
| 44 | |
! Same optional upcasing as Caseinsensitive, but restricted to the first
! symbol of the string (left context is the string boundary).
! The rule list is spelled out again on purpose: foma complains when this
! definition reuses Caseinsensitive.
define CapitalCaseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  è (->) È, é (->) É, ú (->) Ú, á (->) Á,
  â (->) Â, ê (->) Ê, î (->) Î, ô (->) Ô, û (->) Û,
  ß (->) {SS}
  || .#. _ ];
| 87 | |
! A letter: AsciiLetter or one of the listed accented letters or ß,
! composed with Caseinsensitive, which optionally upcases it.
define Letter [
  [ AsciiLetter
  | ö | ü | ä
  | è | é | ú | á
  | â | ê | î | ô | û
  | ß
  ] .o. Caseinsensitive
];
| 89 | |
! Any single symbol that is not one of the listed lowercase letters.
! NOTE(review): ß is not excluded here - confirm that this is intended.
define NotSmallCaps [ ? - [
    a | b | c | d | e | f | g | h | i | j | k | l | m
  | n | o | p | q | r | s | t | u | v | w | x | y | z
  | ü | ö | ä
  | è | é | ú | á
  | â | ê | î | ô | û
] ];
| 91 | |
Akron | 72a6422 | 2023-04-26 17:00:45 +0200 | [diff] [blame] | 92 | source en/clitics.xfst |
Akron | be3d366 | 2023-04-26 13:22:38 +0200 | [diff] [blame] | 93 | |
! Earlier, simpler variant kept for reference:
!   define Word Char+ (Apos Char+)*;
! A word is a run of Char or a Clitics entry that is not a CliticsNTPart,
! or else a CliticsNT entry.
define Word [
  [ [ Char+ | Clitics ] - CliticsNTPart ]
  | CliticsNT
];
Akron | be3d366 | 2023-04-26 13:22:38 +0200 | [diff] [blame] | 96 | |
! Plus/ampersand expressions are read from a word list.
! NOTE(review): the list is loaded from de/ although this file sources
! en/clitics.xfst - confirm that sharing the German list is intended.
define Plusampersand @txt"de/plusampersand.txt";

! Extend Word: words and plus/ampersand expressions may be chained with Dash.
define Word [ Plusampersand | Word ] [ Dash [ Plusampersand | Word ] ]*;
| 99 | |
! Abbreviations and Initials
! Abbreviated English month names, without the trailing period.
define Months [
    {Jan} | {Feb} | {Mar} | {Apr} | {Jun} | {Jul}
  | {Aug} | {Sep} | {Sept} | {Oct} | {Nov} | {Dec}
];
! An abbreviation is an entry of en/abbrv.txt, a Letter or a Months entry,
! composed with CapitalCaseinsensitive and followed by a literal period.
define Abbr [
  [ @txt"en/abbrv.txt" | Letter | Months ]
  .o. CapitalCaseinsensitive
] %.;
| 103 | |
| 104 | source all/allpost.xfst |
| 105 | |
echo - Compile Real Token

! Union of the token classes that are treated as regular tokens.
define RealToken [
    Punct | Emdash | Abbr | Word | SNS | AcronymDep
  | Ord | Num | Years | Times | XMLEntities | Omission
];
| 109 | |
! Treatment of I as a word in "M. I. Baxter was killed in World War I. So was I."
! The words listed here serve as left context when I is split off as a token.
define NonAbbrI [
    {am} | {was} | {will} | {have} | {had} | {would}
  | {do} | {did} | {and} | {War} | {than} | {not}
  | {Part} | {part}
];
| 112 | |
| 113 | |
echo - Introduce Token splitter

! Append NLout after every left-to-right longest match of a token class.
! NLout is defined elsewhere (presumably the token boundary marker - confirm).
define Token [
    RealToken              @-> ... NLout,
    XML                    @-> ... NLout,
    URL                    @-> ... NLout,
    Email                  @-> ... NLout,
    File                   @-> ... NLout,
    Domain                 @-> ... NLout,
    [ Emoticons | Arrows ] @-> ... NLout
  ]
  ! I as a separate token when it follows a NonAbbrI word and WS or NLout
  .o. [ "I" @-> ... NLout \/ NonAbbrI [ WS | NLout ]+ _ ]
  ! Negative clitics are tokens
  .o. [ {n't} ->@ NLout ... \/ NLout ];
Akron | be3d366 | 2023-04-26 13:22:38 +0200 | [diff] [blame] | 129 | |
| 130 | source all/allsentencesplit.xfst |
| 131 | |
| 132 | ! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b |