blob: f7a089ff29f7fce09debe197e4ea41183747d042 [file] [log] [blame]
Akron310905f2021-08-11 13:49:50 +02001! This tokenizer is based on work by
2! - StandardTokenizerImpl by the Lucene project
3! under the Apache License
4! - https://github.com/dlwh/epic by David Hall (2014)
!   under the Apache License
6! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
7! under the Apache License
8! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
9! under the MIT License
10!
11! The abbreviation list is part of the sentence splitter tool
12! of the IDS.
13
! Marker symbol that the token and sentence rules insert after each match.
! Disabled alternative that would emit a line feed instead:
! define NLout "\u000a";
define NLout ["@_TOKEN_SYMBOL_@"];
Akron310905f2021-08-11 13:49:50 +020016
! ASCII decimal digits. The zero stays escaped so that it is read as the
! literal digit.
define Digit [
  %0 | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
];

! Lowercase ASCII letters a-z only; uppercase letters are not included here.
define AsciiLetter [
  "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
  "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
];
Akron310905f2021-08-11 13:49:50 +020019
20!!!!!!!!!!!!!!!!!
21! <from tmorph> !
22!!!!!!!!!!!!!!!!!
! Horizontal whitespace: a single blank-like character.
define WS [
    " "
  | "\u0009"
  | "\u00a0"
  | "\u1680"
  | "\u2000" | "\u2001" | "\u2002" | "\u2003" | "\u2004" | "\u2005"
  | "\u2006" | "\u2007" | "\u2008" | "\u2009" | "\u200a"
  | "\u202f"
  | "\u205f"
  | "\u3000"
];

! Line-breaking characters: a single vertical separator.
define NL [
    "\u000a"
  | "\u000b"
  | "\u000c"
  | "\u000d"
  | "\u0085"
  | "\u2028"
  | "\u2029"
];
Akron310905f2021-08-11 13:49:50 +020029
! Punctuation that ends sentences: either one horizontal ellipsis character
! or a run of one or more full stops, question marks and exclamation marks
! in any mix.
! Differs from the TRmorph original.
define SP [ "…" | ["!"|"?"|"."]+ ];
33
! Left (opening) punctuation.
define LP [
  ! opening brackets
  %( | %[ | %{ |
  ! opening quotation marks
  "“" | "‘" | "‹" | "«" |
  ! straight single and double quote
  "'" | %" |
  ! two straight apostrophes in a row (differs from TRmorph)
  [%' %'] |
  ! can be Markdown
  %* | %/ | %_ |
  ! two commas in a row (from book)
  ["," ","]
];
Akron3de361e2021-08-17 09:56:42 +020043
! Right (closing) punctuation - excluding the characters that can be used
! as apostrophe.
! The closing counterparts of the quotation marks listed in LP are written
! as escapes so that they survive re-encoding of this file:
!   \u201d  right double quotation mark
!   \u203a  single right-pointing angle quotation mark
define RP [SP|","|";"|":"|
  ")"|"]"|"}"|
  "\u201d"|"\u203a"|"»"|
  %"|
  ! differs
  ["'" "'"]|
  "*"|"/"|"_"| ! Can be Markdown
  ! from book
  [%‘ %‘]|[%’ %’]];
54
! Symbol characters. They count as punctuation, so Char excludes them.
define Sym [%- | %+ | %< | %> | %* | %/ | "=" | "@"];

! Characters that can serve as an apostrophe: straight quote, right single
! quotation mark, backtick.
define Apos ["'" | "’" | "`"];

! Any punctuation: symbol, closing or opening.
define Punct [Sym | RP | LP];
Akron310905f2021-08-11 13:49:50 +020058!define nonSym \[WS|LP|RP|Sym];
59!!!!!!!!!!!!!!!!!!
60! </from tmorph> !
61!!!!!!!!!!!!!!!!!!
62
! Long dashes: two or more ASCII hyphens in a row, or a run of one or more
! of U+2014, U+2015, U+2E3A, U+2E3B, U+FE58.
define Emdash [
    "-" "-" "-"*
  | [ "\u2014" | "\u2015" | "\u2e3a" | "\u2e3b" | "\ufe58" ]+
];

! Short dashes: a single ASCII hyphen or one of U+2011, U+2012, U+2013,
! U+2E1A, U+FE63, U+FF0D.
define Dash [
  %- | "\u2011" | "\u2012" | "\u2013" | "\u2e1a" | "\ufe63" | "\uff0d"
];
! Slash-like characters: fraction slash, division slash, fullwidth solidus
! and the ASCII slash.
! The fullwidth solidus (U+FF0F) is written as an escape so that it cannot
! be folded into a second ASCII slash when the file is normalised.
define Slash ["⁄"|"∕"|"\uff0f"|"/"];
! The ASCII asterisk as a named symbol.
define Asterisk %*;
67
! A word character: any single symbol that is not an apostrophe,
! punctuation, a line break or whitespace.
! Disabled further exclusion: |¨
define Char \[Apos | Punct | NL | WS];
Akron310905f2021-08-11 13:49:50 +020069
! Disabled lexicon-based definition:
!   source lexicon.xfst
!   define Word;
! A word is a run of word characters, optionally continued by further runs
! that are each attached with a single asterisk, apostrophe or dash.
define Word [Char+] [ [Asterisk | Apos | Dash] Char+ ]*;
73
! Characters allowed inside URLs and e-mail addresses: every word character
! plus the symbols from Sym, except angle brackets and the double quote.
define URLChar [ [Sym - [%" | %> | %<]] | Char ];
75!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
76
! Optional upper-casing, written as parallel optional replacements:
! each listed lowercase letter may be rewritten to its uppercase form,
! and the sharp s to the two-letter sequence SS.
! Other definitions compose their lowercase patterns with this relation.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  ß (->) {SS}
];
109
! Abbreviations: an entry read from txt/abbrv.txt followed by a full stop.
define Abbr [@txt"txt/abbrv.txt"] ".";

! Entries read from txt/plusampersand.txt, used as they are.
define Plusampersand [@txt"txt/plusampersand.txt"];
113
Akron310905f2021-08-11 13:49:50 +0200114! A solution to the "(author): problem" may be to add ) at the end of any
115! string as a possible ending
116
! One or more digits, optionally followed by a full stop, enclosed in a
! matching pair of square or round brackets.
define Years [ %[ Digit+ (%.) %] ] | [ %( Digit+ (%.) %) ];
118
! Run the separate script emoji.xfst.
source emoji.xfst
! Bind the name Emoji without giving a regular expression.
! NOTE(review): presumably this names the network that emoji.xfst leaves
! on the stack - confirm against emoji.xfst.
define Emoji;
121
! Acronyms such as U.S.A. or I.B.M.: at least two single word characters,
! each one followed by a full stop.
! Use a post-filter to remove the dots.
define AcronymDep [Char "."] [Char "."]+;
125
! A full stop, or the letters d o t between an opening and a closing
! round or square bracket; composed with Caseinsensitive.
define Dot "." | [ ["("|"["] {dot} ["]"|")"] ] .o. Caseinsensitive;

! An at sign, or the letters a t between an opening and a closing round or
! square bracket; composed with Caseinsensitive.
define At "@" | [ ["("|"["] {at} ["]"|")"] ] .o. Caseinsensitive;
128
! The top-level domains accepted at the end of a Domain: com, de, org;
! composed with Caseinsensitive.
define TldEnd [ {com} | {de} | {org} ] .o. Caseinsensitive;
130
! Very relaxed URL scheme, not based on the strict Lucene implementation.
! Prefix: www plus Dot, or one of file, ftp, http, https followed by ://
! Rest:   at least two URLChar; sentence punctuation may occur between the
!         first and the last one but never at the very end.
define URL [ [{www} Dot] | [ [{file} | {ftp} | {http} (s)] %: %/ %/ ] ]
  URLChar [SP | URLChar]* URLChar
  .o. Caseinsensitive;
135
! A bare domain name: runs of word characters joined by single dashes,
! then Dot and one of the endings in TldEnd.
define Domain [Char+ [Dash Char+]*] Dot TldEnd;
137
! XML rule
! An XML name with an optional prefix: an ASCII letter followed by letters,
! digits or hyphens, optionally followed by a colon and a second part of
! the same shape; composed with Caseinsensitive.
define XMLns [
  AsciiLetter [Digit | AsciiLetter | "-"]*
  ( ":" AsciiLetter [Digit | AsciiLetter | "-"]* )
] .o. Caseinsensitive;
! A single XML tag, taken as the upper side (.u) of the pattern.
!   opening tag: name, then any number of attributes; an attribute is
!                whitespace, a name, and optionally = with a non-empty
!                value in single or double quotes that contains neither
!                its own quote character nor >
!   closing tag: slash followed by a name
! Whitespace is allowed before the final >.
define XML [
  %< [
    [ XMLns
      [ WS+ XMLns WS*
        ( "=" WS* [ [%' [? - %' - %>]+ %'] | [%" [? - %" - %>]+ %"] ] )
      ]*
    ]
    |
    [ %/ XMLns ]
  ] WS* %>
].u;
Akron310905f2021-08-11 13:49:50 +0200156
157!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
! E-mail address: local part, At, then a host made of a first label and at
! least one further label introduced by Dot. Every part is one or more
! URLChar.
define Email [URLChar+ At URLChar+] [Dot URLChar+]+;
159
! Social network handles (Twitter user, hashtag, Google+): one of the
! sigils + # @ followed by one or more word characters.
define SNS ["+" | "#" | "@"] Char+;
162
! Known file name extensions, without the dot; composed with
! Caseinsensitive. The optional letters cover htm/html, doc/docx and
! jpg/jpeg; mp is followed by 3 or 4.
define FileEnd [
    {aac} | {avi} | [{doc} (x)] | {exe} | {gif}
  | [{htm} (l)] | [{jp} (e) g] | [{mp} ["3"|"4"]]
  | {ogg} | {pdf} | {png} | {txt} | {xls} | {xml}
] .o. Caseinsensitive;
Akrone8837b52021-08-11 17:29:58 +0200179
! A file name with an optional path.
!   optional path: an optional root (AsciiLetter, colon, backslash - or a
!                  single slash) followed by any number of word characters,
!                  hyphens, underscores, or word characters with a trailing
!                  slash or backslash
!   base name:     one or more word characters, hyphens or underscores
!   ending:        a full stop and one of the extensions in FileEnd
define File ( ( [AsciiLetter ":" %\ ] | "/" )
              [ Char ["/" | %\ ] | "-" | "_" | Char ]* )
            ["_" | "-" | Char]+ "." FileEnd;
Akron310905f2021-08-11 13:49:50 +0200181
! Abbreviated street names: a Word directly followed by the letters str
! and a full stop.
define Streetname [Word {str}] ".";
Akron4af79f12021-08-11 14:48:17 +0200183
! Separators allowed between digit groups.
! Also supports 19.4.2015, 19/4/2015 etc.
define DigitPunct [Slash | "," | "." | "-" | "_"];

! Numbers: digit groups joined by single separators, optionally followed
! directly by a run of word characters.
define Num [Digit+ [DigitPunct Digit+]*] (Char+);
188
! Ordinals: one to three digits followed by a full stop.
define Ord Digit (Digit) (Digit) ".";
191
Akron310905f2021-08-11 13:49:50 +0200192! TODO:
193! floating point, serial, model numbers, ip addresses, etc.
194! every other segment must have at least one digit
195
! Omission words like "fu**ing!": one or more word characters, at least
! two asterisks, then any number of word characters.
define Omission Char+ [Asterisk Asterisk+] Char*;
198
199
200! TODO: Name words with ' and `
201
Akron310905f2021-08-11 13:49:50 +0200202! Support ASCII elements, like
203! +---------------+
204! <---->, -->, <--
205! +---------------+
206! <---> | Worker Node N |
207! +---------------+
208! |============= Core =============|
209
210
echo - Compile Real Token

! Union of the basic token patterns handled by the first rule of Token.
define RealToken [
    Punct | Word
  | XML | Email | URL | SNS | Domain | File
  | AcronymDep | Ord | Num | Years
];
Akron310905f2021-08-11 13:49:50 +0200214
echo - Introduce Token splitter

! Step 1: parallel rules that append NLout to every match of one of the
!         token patterns.
! Step 2: delete runs of whitespace that directly follow an NLout.
define Token
  [
    RealToken                    @-> ... NLout,
    Abbr                         @-> ... NLout,
    Plusampersand                @-> ... NLout,
    Emoji                        @-> ... NLout,
    [Emdash|Omission|Streetname] @-> ... NLout
  ]
  .o.
  [ WS+ @-> 0 || NLout _ ]
;
226
echo - Introduce Sentence splitter

! Compose the token splitter with a rule that appends a second NLout to
! sentence-ending punctuation (SP) standing directly after an NLout.
read regex Token .o. [ SP @-> ... NLout \/ NLout _ ];
Akron310905f2021-08-11 13:49:50 +0200229
230! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
231
232! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
233! and anything with > with ~xmle.
234! In case this is part of an emoticon ( >:-P ), this needs to be split again .
235! The same is true for ( and )