blob: 30d06e8e92a0b37a1104002cc27518a4ff1c4353 [file] [log] [blame]
Akron310905f2021-08-11 13:49:50 +02001! This tokenizer is based on work by
2! - StandardTokenizerImpl by the Lucene project
3! under the Apache License
4! - https://github.com/dlwh/epic by David Hall (2014)
!   under the Apache License
6! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
7! under the Apache License
8! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
9! under the MIT License
10!
11! The abbreviation list is part of the sentence splitter tool
12! of the IDS.
13
! Marker written after every recognised token; the plain-newline
! variant below is kept for reference but disabled.
define NLout "@_TOKEN_SYMBOL_@";
! define NLout "\u000a";

! Basic character classes. The digit zero is quoted because a bare 0
! denotes the empty string in foma regular expressions.
define Digit ["0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"];

! Lower-case ASCII letters only
define AsciiLetter ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|
                    "n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"];

! Hexadecimal digits in either case
define HexLetter [Digit | a|b|c|d|e|f | A|B|C|D|E|F];

! End-of-transmission control character (U+0004)
define EOT "\u0004";
Akron310905f2021-08-11 13:49:50 +020021
22!!!!!!!!!!!!!!!!!
23! <from tmorph> !
24!!!!!!!!!!!!!!!!!
! Horizontal whitespace: ASCII space and tab, no-break space and the
! remaining Unicode space characters listed below.
define WS [
  " "      | "\u0009" | "\u00a0" | "\u1680" |
  "\u2000" | "\u2001" | "\u2002" | "\u2003" | "\u2004" | "\u2005" |
  "\u2006" | "\u2007" | "\u2008" | "\u2009" | "\u200a" |
  "\u202f" | "\u205f" | "\u3000"
];
29
! Line and paragraph breaks; EOT counts as a line break as well.
define NL [
  "\u000a" | "\u000b" | "\u000c" | "\u000d" |
  "\u0085" | "\u2028" | "\u2029" |
  EOT
];
Akron310905f2021-08-11 13:49:50 +020031
! Punctuation that ends sentences: any non-empty run of ".", "?" and "!",
! or a single horizontal ellipsis character.
! Differs! (presumably from the TRmorph rule this section was taken from)
define SP [ "…" | ["."|"?"|"!"]+ ];
35
! Left (opening) punctuation
define LP [
    "(" | "[" | "{"
  | "“" | "‘" | "‹" | "«"
  | %' | %"
  | [%' %']              ! differs: doubled apostrophe
  | "*" | "/" | "_"      ! Can be Markdown
  | ["," ","]            ! from book
];
Akron3de361e2021-08-17 09:56:42 +020045
! Right (closing) punctuation - excluding the characters that can be
! used as apostrophe.
define RP [
    SP | "," | ";" | ":"
  | ")" | "]" | "}"
  ! Closing counterparts of the quotes in LP. The right double quote
  ! (U+201D) and the right single guillemet (U+203A) are written as
  ! escapes so that they cannot be lost to a re-encoding of this file;
  ! an empty "" in their place would not match any quote character.
  | "\u201d" | "\u203a" | "»"
  | %"
  | ["'" "'"]            ! differs: doubled apostrophe
  | "*" | "/" | "_"      ! Can be Markdown
  | [%‘ %‘] | [%’ %’]    ! from book
];
56
! Symbol characters
define Sym ["+"|"-"|"*"|"/"|"<"|">"|"="|"@"|"&"];

! Characters that can serve as an apostrophe
define Apos ["'" | "’" | "`"];
! Any punctuation: symbols plus left and right punctuation
define Punct [Sym | LP | RP];
!define nonSym \[WS|LP|RP|Sym];
61!!!!!!!!!!!!!!!!!!
62! </from tmorph> !
63!!!!!!!!!!!!!!!!!!
64
! Em dash: two or more ASCII hyphens, or a non-empty run of the
! long-dash characters listed below.
define Emdash [
    "-" "-" "-"*
  | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+
];

! A single hyphen-like character
define Dash [
  "-" | "\u2011" | "\u2012" | "\u2013" | "\u2e1a" | "\ufe63" | "\uff0d"
];
! Slash-like characters: fraction slash (U+2044), division slash
! (U+2215), fullwidth solidus (U+FF0F) and the ASCII slash.
! The fullwidth form is written as an escape so that Unicode
! normalisation of this file cannot fold it into a second ASCII "/";
! it parallels the fullwidth hyphen "\uff0d" in Dash.
define Slash ["⁄"|"∕"|"\uff0f"|"/"];

! The asterisk, used as a word-internal joiner and in omissions
define Asterisk "*";
69
! A word character: any single symbol that is not whitespace, a line
! break, punctuation or an apostrophe.
define Char \[Punct|Apos|NL|WS]; ! |¨;
Akron310905f2021-08-11 13:49:50 +020071
! source lexicon.xfst
! define Word;
! A word: one or more runs of word characters, joined by a single
! dash, apostrophe or asterisk.
define Word Char+ [[Dash|Apos|Asterisk] Char+]*;
75
! Characters allowed inside URLs and e-mail addresses: word characters
! plus the symbols, except angle brackets and the double quote.
define URLChar [ [Sym - ["<"|">"|%"]] | Char ];
!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
78
! Optional upper-casing as parallel replace rules: each listed
! lower-case letter may be rewritten as its capital, and ß may be
! rewritten as the two-letter string SS.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  ß (->) {SS}
];
111
! A single ASCII or German letter, composed with the optional
! upper-casing relation.
define Letter [ [ä | ö | ü | ß | AsciiLetter] .o. Caseinsensitive ];
113
! Abbreviations and Initials: an entry of the abbreviation word list,
! or a single letter, followed by a full stop.
define Abbr [ Letter | @txt"txt/abbrv.txt" ] ".";

! Word list read from txt/plusampersand.txt
define Plusampersand @txt"txt/plusampersand.txt";
118
! A solution to the "(author): problem" may be to add ) at the end of any
! string as a possible ending

! Digits, optionally followed by a full stop, enclosed in round or
! square brackets.
define Years [
    %( Digit+ (%.) %)
  | %[ Digit+ (%.) %]
];
123
! 20:00 Uhr, 00:12:25,34 Minuten
! One or two leading digits (a first digit, if present, is 0-5), then
! one or two ":dd" groups whose first digit is 0-5, and optionally a
! comma with one to three digits.
define Times [
  ( ["0"|"1"|"2"|"3"|"4"|"5"] ) Digit
  [ ":" ["0"|"1"|"2"|"3"|"4"|"5"] Digit ]^{1,2}
  ( "," Digit^{1,3} )
];
Akron78dba062021-10-28 19:30:46 +0200126
! Emoji are compiled by a separate script; the argument-less define
! names the network that script leaves behind.
source emoji.xfst
define Emoji;

! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
! At least two letter-plus-dot groups.
define AcronymDep [Letter "."] [Letter "."]+;
Akron310905f2021-08-11 13:49:50 +0200133
! A dot, either literal or spelled out as [dot] / (dot)
define Dot "." | [ ["["|"("] {dot} [")"|"]"] ] .o. Caseinsensitive;

! An at sign, either literal or spelled out as [at] / (at)
define At "@" | [ ["["|"("] {at} [")"|"]"] ] .o. Caseinsensitive;

! Top-level domains recognised at the end of a bare domain name
define TldEnd [ {com} | {de} | {org} ] .o. Caseinsensitive;
138
! Very relaxed URL scheme, not based on the strict Lucene implementation.
! Starts with a scheme plus "://" or with "www" and a dot; the rest is
! at least two URL characters, may contain sentence punctuation, but
! must not end in it.
define URL [
    [ [ {http} ("s") | {ftp} | {file} ] ":" "/" "/" ]
  | [ {www} Dot ]
]
URLChar [SP | URLChar]* URLChar
.o. Caseinsensitive;

! Bare domain name: dash-separated word characters, a dot and a known
! top-level domain.
define Domain Char+ [Dash Char+]* Dot TldEnd;
145
! XML rule

! Element or attribute name with an optional prefix separated by a
! colon; composed with the optional upper-casing relation.
define XMLns [
  AsciiLetter [AsciiLetter | Digit | "-"]*
  ( ":" AsciiLetter [AsciiLetter | Digit | "-"]* )
] .o. Caseinsensitive;

! A single tag. Either an opening tag with any number of attributes
! (each optionally given a quoted, non-empty value that contains
! neither its own quote character nor ">") and an optional closing
! slash, or an end tag. The upper side of the relation is used.
define XML [
  "<" [
    [ XMLns
      [ WS+ XMLns WS*
        ( "=" WS*
          [ [%" [? - %" - %>]+ %"] | [%' [? - %' - %>]+ %'] ]
        )
      ]*
      ( WS* "/" )
    ]
    |
    [ "/" XMLns ]
  ] WS* ">"
].u;
Akron310905f2021-08-11 13:49:50 +0200165
! XML entities are compiled by a separate script; the argument-less
! define names the network that script leaves behind.
source entities.xfst
define XMLEntities;
169
170
! Email addresses: local part, at sign, and a host made of at least
! two dot-separated parts.
define Email
  URLChar+ At
  URLChar+ [Dot URLChar+]+;

! Twitter user, hashtag, Google+
define SNS [ "#" | "@" | "+" ] Char+;
176
! Known file name extensions, composed with the optional upper-casing
! relation.
define FileEnd [
    {aac}
  | {avi}
  | [ {doc} ("x") ]
  | {exe}
  | {gif}
  | [ {htm} ("l") ]
  | [ {jp} ("e") "g" ]
  | [ {mp} ["3"|"4"] ]
  | {ogg}
  | {pdf}
  | {png}
  | [ {ppt} ("x") ]
  | {txt}
  | {xls}
  | {xml}
] .o. Caseinsensitive;
Akrone8837b52021-08-11 17:29:58 +0200194
! File name with a known extension. The optional path part may start
! with a drive letter ("c:\") or a slash, followed by path characters
! and separators; the base name is word characters, "-" and "_".
define File
  ( ( AsciiLetter ":" %\ | "/" )
    [ Char | "_" | "-" | Char [ %\ | "/" ] ]* )
  [ Char | "-" | "_" ]+
  "." FileEnd;
Akron310905f2021-08-11 13:49:50 +0200196
! Abbreviated street names: a word directly followed by "str."
define Streetname Word s t r ".";
Akron4af79f12021-08-11 14:48:17 +0200198
! Separators allowed between digit groups
define DigitPunct [ Slash | "," | "." | "-" | "_" ];

! Numbers: digit groups joined by single separators, optionally
! followed directly by word characters.
! Also supports
! 19.4.2015, 19/4/2015 etc.
define Num Digit+ [DigitPunct Digit+]* Char*;

! ordinals: one to three digits followed by a full stop
define Ord Digit^{1,3} ".";
206
Akron310905f2021-08-11 13:49:50 +0200207! TODO:
208! floating point, serial, model numbers, ip addresses, etc.
209! every other segment must have at least one digit
210
! Omission words like "fu**ing!": word characters, at least two
! asterisks, and optionally more word characters.
define Omission Char+ Asterisk+ Asterisk Char*;
213
214
215! TODO: Name words with ' and `
216
Akron310905f2021-08-11 13:49:50 +0200217! Support ASCII elements, like
218! +---------------+
219! <---->, -->, <--
220! +---------------+
221! <---> | Worker Node N |
222! +---------------+
223! |============= Core =============|
224
225
echo - Compile Real Token

! The core token classes, joined into one network
define RealToken [
    Word
  | Punct
  | Num
  | Ord
  | Years
  | Times
  | SNS
  | AcronymDep
];
Akron310905f2021-08-11 13:49:50 +0200229
echo - Introduce Token splitter

! Parallel longest-match rules append NLout to every recognised unit.
! The two compositions that follow delete whitespace and line breaks
! that come directly after a marker or stand at the very beginning of
! the input.
define Token [
    XMLEntities                  @-> ... NLout,
    Abbr                         @-> ... NLout,
    RealToken                    @-> ... NLout,
    XML                          @-> ... NLout,
    URL                          @-> ... NLout,
    Email                        @-> ... NLout,
    File                         @-> ... NLout,
    Plusampersand                @-> ... NLout,
    Domain                       @-> ... NLout,
    Emoji                        @-> ... NLout,
    [Streetname|Omission|Emdash] @-> ... NLout
]
.o. [ [WS|NL]+ @-> 0 || NLout _ ]
.o. [ [WS|NL]+ @-> 0 || .#. _ ]
;
248
echo - Introduce Sentence splitter

! Final network: the token splitter, followed by a rule that appends a
! further NLout to sentence-ending punctuation (SP) standing directly
! after a token marker.
read regex Token .o. [ SP @-> ... NLout \/ NLout _ ];

! Usage:
! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b