blob: a6bff242576ef76c4419334ce9e179ffe46e8975 [file] [log] [blame]
Akron78f67142022-04-09 14:10:44 +02001source all/allpref.xfst
2
! Caseinsensitive: comma-separated (parallel) optional replacement rules.
! Each listed lowercase letter may optionally be rewritten as its
! uppercase form; "ß" may optionally be rewritten as the string "SS".
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  è (->) È, é (->) É, ú (->) Ú, á (->) Á,
  â (->) Â, ê (->) Ê, î (->) Î, ô (->) Ô, û (->) Û,
  ß (->) {SS}
];
44
! Letter: AsciiLetter or one of the listed non-ASCII lowercase letters,
! composed with Caseinsensitive so the optional case rules apply.
! AsciiLetter is not defined in this file (presumably it comes from a
! sourced file -- confirm).
define Letter [
  [ AsciiLetter | ä | ö | ü | ß | á | é | ú | è | â | ê | î | ô | û ]
  .o. Caseinsensitive
];
46
! NotSmallCaps: any single symbol except the lowercase letters listed
! below (a-z plus the listed umlaut/accented letters). "ß" is not part
! of the excluded set.
define NotSmallCaps [
  ? - [ a | b | c | d | e | f | g | h | i | j | k | l | m
      | n | o | p | q | r | s | t | u | v | w | x | y | z
      | ü | ö | ä | è | é | ú | á | â | ê | î | ô | û ]
];
48
! BaseWord: one or more Char, then any number of groups consisting of
! an Apos or Asterisk followed by one or more Char, then an optional
! suffix of "s"/"S" followed by "’" or "`".
! Char, Apos and Asterisk are not defined in this file.
define BaseWord [
  Char+
  [ [ Apos | Asterisk ] Char+ ]*
  ( [ s | S ] [ %’ | %` ] )
];
Akron78f67142022-04-09 14:10:44 +020050
! Plusampersand: the entries read from the word list file
! de/plusampersand.txt.
define Plusampersand [ @txt"de/plusampersand.txt" ];
Akron3dd560e2026-02-04 11:23:08 +010052
53source de/gender.xfst
54
! CoreWord: a BaseWord or Plusampersand entry, followed by any number of
! further such parts, each introduced by a Dash.
! Dash is not defined in this file.
define CoreWord [ BaseWord | Plusampersand ] [ Dash [ BaseWord | Plusampersand ] ]*;

! Word: a CoreWord, optionally followed by GenderEndings.
! GenderEndings is not defined in this file (presumably provided by
! de/gender.xfst, which is sourced just before -- confirm).
define Word CoreWord ( GenderEndings );
57
! Disabled rule, kept for reference:
! [ Word GenderEndings || WS] @-> ... NLout,
59
Akron78f67142022-04-09 14:10:44 +020060
! Abbreviations and initials: either a single Letter or an entry from
! de/abbrv.txt, followed by a literal period.
! The abbreviation list is part of the sentence splitter tool of the IDS.
define Abbr [ Letter | @txt"de/abbrv.txt" ] %.;
65
! HypAbbr: the literal string "Ba.-Wü.", or an Abbr followed by any
! number of "-" Abbr continuations.
! NOTE(review): the previous spelling "( %- Abbr )+" is an optional group
! under "+", i.e. zero or more repetitions, so a bare Abbr without any
! hyphen is accepted as well. If at least one hyphen is required, this
! should read "[ %- Abbr ]+" -- confirm the intent before changing.
define HypAbbr [ {Ba.-Wü.} | Abbr [ %- Abbr ]* ];
67
! Streetname: a Word immediately followed by the string "str" and a
! literal period.
define Streetname [ Word {str} %. ];
69
70source all/allpost.xfst
71
echo - Compile Real Token

! RealToken: union of the token classes listed below. Apart from
! HypAbbr, Streetname and Word, these classes are not defined in this
! file.
define RealToken [
    Punct
  | Emdash
  | HypAbbr
  | Streetname
  | Word
  | SNS
  | AcronymDep
  | Ord
  | Num
  | Years
  | Times
  | XMLEntities
  | Omission
];
Akron78f67142022-04-09 14:10:44 +020075
echo - Introduce Token splitter

! Token: comma-separated (parallel) left-to-right longest-match markup
! rules. Each rule keeps the matched span ("...") and inserts NLout
! directly after it. NLout is not defined in this file.
define Token [
  RealToken                       @-> ... NLout,
  XML                             @-> ... NLout,
  URL                             @-> ... NLout,
  Email                           @-> ... NLout,
  File                            @-> ... NLout,
  Domain                          @-> ... NLout,
  [Emoticons|Arrows|Wikitemplate] @-> ... NLout
];
87
88source all/allsentencesplit.xfst
89
! Example invocation:
! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b