blob: 9670f9ac8c793ed54dbf5b7b3b227185675efcb7 [file] [log] [blame]
Akron78f67142022-04-09 14:10:44 +02001source all/allpref.xfst
2
! Caseinsensitive: a set of parallel optional-replacement rules ("(->)",
! separated by commas). Each listed lowercase letter may, but need not, be
! rewritten to the uppercase letter on the right; the last rule optionally
! rewrites ß to the two-symbol string "SS". Only the letters enumerated here
! are covered.
3define Caseinsensitive [
4a (->) A,
5b (->) B,
6c (->) C,
7d (->) D,
8e (->) E,
9f (->) F,
10g (->) G,
11h (->) H,
12i (->) I,
13j (->) J,
14k (->) K,
15l (->) L,
16m (->) M,
17n (->) N,
18o (->) O,
19p (->) P,
20q (->) Q,
21r (->) R,
22s (->) S,
23t (->) T,
24u (->) U,
25v (->) V,
26w (->) W,
27x (->) X,
28y (->) Y,
29z (->) Z,
30ö (->) Ö,
31ü (->) Ü,
32ä (->) Ä,
33è (->) È,
34é (->) É,
35ú (->) Ú,
36á (->) Á,
37â (->) Â,
38ê (->) Ê,
39î (->) Î,
40ô (->) Ô,
41û (->) Û,
42ß (->) {SS}
43];
44
! Letter: a single letter - AsciiLetter or one of the enumerated umlauted /
! accented letters or ß - composed (.o.) with Caseinsensitive, so that the
! optional upper-case variants produced by those rules are included as well.
! AsciiLetter is not defined in this file; presumably it comes from a sourced
! script such as all/allpref.xfst - TODO confirm.
45define Letter [ [ AsciiLetter | ö | ü | ä | è | é | ú | á | â | ê | î | ô | û | ß ] .o. Caseinsensitive ];
46
! NotSmallCaps: any single symbol (?) except the lowercase letters subtracted
! below (a-z plus the listed umlauted and accented lowercase letters).
! It is not referenced in the visible part of this file; presumably it is used
! by a script sourced later - TODO confirm.
! NOTE(review): ß is handled by Caseinsensitive and Letter but is not
! subtracted here - confirm whether that is intended.
47define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
48
! Word (base form): one or more Char, followed by zero or more groups of
! [Apos or Asterisk, then one or more Char], optionally ending in s or S
! followed by ’ or `. Char, Apos and Asterisk are defined outside this file
! chunk (presumably in all/allpref.xfst - TODO confirm).
49define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
50
! Plusampersand: entries read from the word list de/plusampersand.txt (@txt).
51define Plusampersand @txt"de/plusampersand.txt";
! Word (final form): redefines Word in terms of the base form above. The result
! is a Plusampersand entry or a base Word, followed by zero or more further
! such items, each introduced by Dash. The order of these definitions matters:
! the Word on the right-hand side refers to the earlier definition.
52define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
53
54! Abbreviations and Initials
55! The abbreviation list is part of the sentence splitter tool
56! of the IDS.
! Abbr: either an entry from the word list de/abbrv.txt or a single Letter
! (an initial), in both cases followed by a literal period (%.).
57define Abbr [ @txt"de/abbrv.txt" | Letter ] %.;
58
! Streetname: a Word immediately followed by the literal string "str" and a
! literal period, i.e. an abbreviated street name ending in "str.".
59define Streetname Word {str} %.;
60
61source all/allpost.xfst
62
63echo - Compile Real Token
64
! RealToken: union of the individual token classes. Abbr, Streetname and Word
! are defined above in this file; the remaining classes (Punct, Emdash, SNS,
! AcronymDep, Ord, Num, Years, Times, XMLEntities, Omission) are not defined
! in the visible part of this file and presumably come from the sourced
! all/*.xfst scripts - TODO confirm.
65define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
66
67echo - Introduce Token splitter
68
! Token: the token splitter. It consists of parallel (comma-separated)
! left-to-right longest-match replacement rules (@->) in markup form: the
! "..." stands for the matched material itself, so each rule keeps its match
! and inserts NLout directly after it. Rules exist for RealToken, XML, URL,
! Email, File, Domain and the union of Emoticons and Arrows.
! NLout and the classes other than RealToken are not defined in the visible
! part of this file; presumably they come from the sourced scripts - TODO
! confirm.
69define Token [
70 RealToken @-> ... NLout,
71 XML @-> ... NLout,
72 URL @-> ... NLout,
73 Email @-> ... NLout,
74 File @-> ... NLout,
75 Domain @-> ... NLout,
Akron6dcb6ce2022-04-09 16:09:51 +020076 [Emoticons|Arrows] @-> ... NLout
Akron78f67142022-04-09 14:10:44 +020077];
78
79source all/allsentencesplit.xfst
80
81! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b