blob: 20b07f9b32c2cb2087d10f352484e06e09b37cb1 [file] [log] [blame]
! This tokenizer is based on work by
! - StandardTokenizerImpl by the Lucene project
!   under the Apache License
! - https://github.com/dlwh/epic by David Hall (2014)
!   under the Apache License
! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
!   under the Apache License
! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
!   under the MIT License
!
! The abbreviation list is part of the sentence splitter tool
! of the IDS.
13
! Output boundary marker inserted after every token/sentence: a special
! multi-character symbol that downstream tools replace with a newline ("\u000a").
define NLout "@_TOKEN_SYMBOL_@";
! Input newline: LF optionally preceded by CR, i.e. Unix "\n" or Windows "\r\n".
define NLin ("\u000d") "\u000a";
16
17define Digit [%0|1|2|3|4|5|6|7|8|9];
18
!!!!!!!!!!!!!!!!!!
! <from TRmorph> !
!!!!!!!!!!!!!!!!!!
! Whitespace: ASCII space/tab/LF/CR, no-break space (U+00A0), Ogham space
! mark (U+1680), the fixed-width spaces U+2000..U+200A, line and paragraph
! separators (U+2028/U+2029), narrow no-break space (U+202F), medium
! mathematical space (U+205F), ideographic space (U+3000), and NLin (CRLF-aware).
define WS [" "|"\u0009"|"\u000a"|"\u000d"|
           "\u00a0"|"\u1680"|
           "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
           "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
           "\u2028"|"\u2029"|"\u202f"|"\u205f"|"\u3000"| NLin];
27
! Punctuation that ends sentences: one or more of . ? ! (so runs like "..."
! or "?!" match as one unit), or the horizontal ellipsis character.
! Differs from TRmorph!
define SP [["."|"?"|"!"]+|"…"]; ! Warning! This results in '...' being a MCS!
! Left (opening) punctuation: brackets, opening typographic quotes, straight
! apostrophe and quote, plus Markdown emphasis characters.
define LP ["("|"["|"{"|
           "“"|"‘"|"‹"|"«"|
           "'"|%"|
           ! differs from TRmorph: doubled straight apostrophe as an opening quote
           ["'" "'"] |
           "*"|"/"|"_"| ! Can be Markdown
           ! from book: doubled comma used as an opening (low) quotation mark
           [%, %,]];
! Right (closing) punctuation - excluding the characters that can be used as
! apostrophe. Includes sentence-final punctuation (SP) and clause separators.
define RP [SP|","|";"|":"|
           ")"|"]"|"}"|
           "”"|"›"|"»"|
           %"|
           ! differs from TRmorph: doubled straight apostrophe as a closing quote
           ["'" "'"]|
           "*"|"/"|"_"| ! Can be Markdown
           ! from book: doubled curly single quotes as closing quotation marks
           [%‘ %‘]|[%’ %’]];
50
! Operator-like symbols that separate tokens.
define Sym ["-"|"+"|"<"|">"|"*"|"/"|"="|"@"];
! Characters that can serve as an apostrophe: straight, curly, and backtick.
define Apos ["'"|"’"|"`"];
! All punctuation classes combined.
define Punct [LP|RP|Sym];
54!define nonSym \[WS|LP|RP|Sym];
!!!!!!!!!!!!!!!!!!!
! </from TRmorph> !
!!!!!!!!!!!!!!!!!!!
58
! Em dash: two or more ASCII hyphens (note (%-) is optional, so (%-)+ also
! matches the empty string and "--" alone is accepted), or one or more
! dash characters (em dash, horizontal bar, two-em/three-em dash, small em dash).
define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
! Single-character hyphen/dash variants that may join word parts:
! ASCII hyphen-minus plus several Unicode hyphen/dash code points.
define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
! Slash variants: fraction slash, division slash, ASCII solidus, and a fourth
! slash-like code point - presumably the fullwidth solidus; verify the encoding.
define Slash ["⁄"|"∕"|"/"|"/"];
! A literal asterisk (reused for Markdown emphasis and omission words).
define Asterisk ["*"];
63
64define Char \[WS|Punct|Apos]; ! |¨;
65
! source lexicon.xfst
! define Word;
! A word: one or more word characters, optionally chained by a single dash,
! apostrophe, or asterisk between character runs (e.g. "don't", "e-mail").
define Word Char+ ([Dash|Apos|Asterisk] Char+)*;
69
70define URLChar [Char|[Sym - ["<"|">"|%"]]];
71!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
72
! Parallel optional replacement rules mapping each lowercase letter to its
! uppercase form ((->) is optional replace). Composing a lowercase pattern
! with this relation makes it accept any mixed-case variant; German ß maps
! to the digraph "SS".
define Caseinsensitive [
a (->) A,
b (->) B,
c (->) C,
d (->) D,
e (->) E,
f (->) F,
g (->) G,
h (->) H,
i (->) I,
j (->) J,
k (->) K,
l (->) L,
m (->) M,
n (->) N,
o (->) O,
p (->) P,
q (->) Q,
r (->) R,
s (->) S,
t (->) T,
u (->) U,
v (->) V,
w (->) W,
x (->) X,
y (->) Y,
z (->) Z,
ö (->) Ö,
ü (->) Ü,
ä (->) Ä,
ß (->) {SS}
];
105
106define Abbr @txt"txt/abbrv.txt" .o. Caseinsensitive;
107
! A solution to the "(author): problem" may be to add ) at the end of any
! string as a possible ending

! Digit runs in round or square brackets, with an optional trailing dot,
! e.g. "(1999)", "[200]", "(12.)" - typically years or reference numbers.
define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
112
! Compile the emoji patterns from a separate script; the bodiless define
! then binds the network left on the stack to the name Emoji
! (standard foma behavior for "define Name;" - verify emoji.xfst pushes one network).
source emoji.xfst
define Emoji;
115
! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
! Two or more single word characters, each followed by a dot.
define AcronymDep Char %. [Char %.]+;
119
! A dot, also in obfuscated form: "." or the word "dot" wrapped in a bracket
! or parenthesis, case-insensitive. Note the pair need not match, so
! mixed forms like "(dot]" are also accepted - presumably a deliberate relaxation.
define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
! An at-sign, also in obfuscated form: "@" or bracketed "at", case-insensitive.
define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
122
123define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;
124
! Very relaxed URL scheme, not based on the strict Lucene implementation:
! either an explicit scheme (http/https/ftp/file) followed by "://", or a
! leading "www." (dot possibly obfuscated); then URL characters, allowing
! sentence punctuation inside but requiring a URL character at the end.
define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ]
URLChar [URLChar|SP]* URLChar
.o. Caseinsensitive;
129
130define Domain Char+ [Dash Char+]* Dot TldEnd;
131
!define XML "<" Alpha URLChar* (">");
! XML/HTML tag fragment: "<" plus tag characters, with the closing ">"
! optional (unclosed tags at a line break are combined in a second pass).
define XML "<" URLChar+ (">");
134
!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
! Email address: local part, at-sign (possibly obfuscated as "(at)" etc.),
! and a domain containing at least one (possibly obfuscated) dot.
define Email URLChar+ At URLChar+ [Dot URLChar+]+;
137
! Twitter user, hashtag, Google+
! Social-network handle: @, #, or + sigil followed by word characters.
define SNS ["@"|"#"|"+"] Char+;
140
! File extensions recognized for filename tokens, matched case-insensitively.
define FileEnd [
    [{htm} ("l")]|
    [{doc} ("x")]|
    {pdf}|
    ["j" "p" ("e") "g"]|
    ["m" "p" ["3"|"4"]]|
    {ogg}|
    {png}|
    {avi}|
    {txt}|
    {xls}|
    {xml}|
    {aac}|
    {gif}
    ] .o. Caseinsensitive;
! Filename: word characters or dashes, a literal dot, and a known extension.
define File [Char|"-"]+ "." FileEnd;
157
! Also supports
! 19.4.2015, 19/4/2015 etc.
! Separators allowed between digit groups inside a number.
define DigitPunct ["_"|"-"|"."|","|Slash];
! Number: digit groups joined by separators, with an optional trailing run
! of word characters (e.g. a unit or ordinal suffix attached to the number).
define Num Digit+ [DigitPunct Digit+]* (Char+);
162
163! TODO:
164! floating point, serial, model numbers, ip addresses, etc.
165! every other segment must have at least one digit
166
! Omission words like "fu**ing!": word characters containing a run of two or
! more asterisks (trailing word characters optional).
define Omission Char+ Asterisk Asterisk+ Char*;
169
170
171! TODO: Name words with ' and `
172
173! TODO:
174! FNAME = (({LETTER}:[\\/])?|\/)?({LETTER}+|[\\_/-])+\.{EXTENSION}
175
176
177! Support ASCII elements, like
178! +---------------+
179! <---->, -->, <--
180! +---------------+
181! <---> | Worker Node N |
182! +---------------+
183! |============= Core =============|
184
185
186
187define RealToken [XML|Email|URL|SNS|[Abbr %.]|Omission|Domain|AcronymDep|File|Emdash|Punct|Num|Years|Emoji|Word];
188
echo - Introduce Token splitter
! Insert the boundary symbol after every token (leftmost-longest match via
! @-> ...), then delete all whitespace runs from the output.
define Token [RealToken @-> ... NLout]
.o. [WS+ @-> 0]
;
193
echo - Introduce Sentence splitter
! Compose the tokenizer with a sentence splitter: insert an additional
! boundary after sentence-final punctuation, but only when the punctuation
! is already preceded by a token boundary (i.e. it is a token of its own).
read regex Token .o. [[["."|"!"|"?"]+] @-> ... NLout \/ NLout _];
196
197! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
198
! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
! and anything with > with ~xmle.
! In case this is part of an emoticon ( >:-P ), this needs to be split again.
! The same is true for ( and ).