blob: 0100d6e9d2f94d64b99307b38f2dfd5bad5644fd [file] [log] [blame]
Akron310905f2021-08-11 13:49:50 +02001! This tokenizer is based on work by
2! - StandardTokenizerImpl by the Lucene project
3! under the Apache License
4! - https://github.com/dlwh/epic by David Hall (2014)
5! under the Apache License
6! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
7! under the Apache License
8! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
9! under the MIT License
10!
11! The abbreviation list is part of the sentence splitter tool
12! of the IDS.
13
! Boundary marker symbol. The Token rules below append it after every matched
! token, and the sentence splitter appends it once more after sentence-final
! punctuation.
Akron4af79f12021-08-11 14:48:17 +020014define NLout "@_TOKEN_SYMBOL_@";
Akron3de361e2021-08-17 09:56:42 +020015! define NLout "\u000a";
Akron310905f2021-08-11 13:49:50 +020016
! Decimal digits. The zero is escaped as %0 because a bare 0 denotes epsilon
! in xfst/foma regular expressions.
17define Digit [%0|1|2|3|4|5|6|7|8|9];
! Lowercase ASCII letters only; case variants are added via Caseinsensitive.
Akrone8837b52021-08-11 17:29:58 +020018define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
! U+0004 (END OF TRANSMISSION); included in NL below.
Akrona854faa2021-10-22 19:31:08 +020019define EOT "\u0004";
Akron310905f2021-08-11 13:49:50 +020020
21!!!!!!!!!!!!!!!!!
22! <from tmorph> !
23!!!!!!!!!!!!!!!!!
! Horizontal whitespace: ASCII space, tab (U+0009), no-break space (U+00A0)
! and the further Unicode space characters listed by code point.
Akron3de361e2021-08-17 09:56:42 +020024define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
Akron310905f2021-08-11 13:49:50 +020025 "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
26 "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
Akron3de361e2021-08-17 09:56:42 +020027 "\u202f"|"\u205f"|"\u3000"];
28
! Line-break characters: U+000A..U+000D, U+0085, U+2028, U+2029, plus EOT so
! that the end-of-transmission marker is treated like a line break.
Akrona854faa2021-10-22 19:31:08 +020029define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
Akron310905f2021-08-11 13:49:50 +020030
31! Punctuation that ends sentences
32! Differs!
! Either a run of one or more ".", "?" or "!" (in any mix), or a single
! horizontal ellipsis character.
Akron3de361e2021-08-17 09:56:42 +020033define SP [["."|"?"|"!"]+|"…"];
34
Akron310905f2021-08-11 13:49:50 +020035! Left punctuation
! Opening punctuation: brackets, opening quotation marks, straight quotes,
! two-character quote substitutes ('' and ,,) and characters that may open
! Markdown emphasis.
36define LP ["("|"["|"{"|
37 "“"|"‘"|"‹"|"«"|
38 "'"|%"|
39 ! differs
40 ["'" "'"] |
41 "*"|"/"|"_"| ! Can be Markdown
42 ! from book
43 [%, %,]];
Akron3de361e2021-08-17 09:56:42 +020044
Akron310905f2021-08-11 13:49:50 +020045! Right punctuation - excluding the characters that can be used as apostrophe
! Closing punctuation: sentence-final punctuation (SP), clause punctuation,
! closing brackets, closing quotation marks, the straight double quote,
! two-character quote substitutes and characters that may close Markdown
! emphasis. A single right quotation mark (U+2019) is not listed on its own
! because it can act as an apostrophe (see Apos).
46define RP [SP|","|";"|":"|
47 ")"|"]"|"}"|
 ! Closing counterparts of the LP quotes: U+201D (right double quotation
 ! mark) and U+203A (single right-pointing angle quotation mark), written
 ! as \u escapes so they survive encoding round-trips.
 ! NOTE(review): these replace two empty "" alternatives whose characters
 ! had been lost; confirm the exact code points against the upstream file.
48 "\u201d"|"\u203a"|"»"|
49 %"|
50 ! differs
51 ["'" "'"]|
52 "*"|"/"|"_"| ! Can be Markdown
53 ! from book
54 [%‘ %‘]|[%’ %’]];
55
! Symbol characters that are tokenized like punctuation.
56define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@];
! Characters that can serve as an apostrophe: ', U+2019 and the backtick.
57define Apos %'|%’|%`;
! Every punctuation-like token: opening, closing and symbol characters.
Akron4c2a1ad2021-08-31 00:35:53 +020058define Punct [LP|RP|Sym];
Akron310905f2021-08-11 13:49:50 +020059!define nonSym \[WS|LP|RP|Sym];
60!!!!!!!!!!!!!!!!!!
61! </from tmorph> !
62!!!!!!!!!!!!!!!!!!
63
! Long dash: two or more ASCII hyphens, or a run of one or more of the listed
! long-dash code points.
64define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
! Single hyphen-like character: ASCII hyphen-minus or one of the listed
! Unicode hyphen/dash code points. Used as a word-internal joiner in Word.
65define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
! Slash-like characters: fraction slash (U+2044), division slash (U+2215),
! fullwidth solidus (U+FF0F) and the ASCII solidus.
! NOTE(review): the list previously contained "/" twice; the duplicate is
! assumed to be a fullwidth solidus lost to normalisation (Dash likewise
! lists the fullwidth hyphen-minus \uff0d). Confirm against upstream.
66define Slash ["⁄"|"∕"|"\uff0f"|"/"];
! The asterisk, named so it can be used as a word-internal joiner (Word) and
! in Omission.
67define Asterisk ["*"];
68
! Any single symbol that is not whitespace, a line break, punctuation or an
! apostrophe (term complement of those classes).
Akron3de361e2021-08-17 09:56:42 +020069define Char \[WS|NL|Punct|Apos]; ! |¨;
Akron310905f2021-08-11 13:49:50 +020070
71! source lexicon.xfst
72! define Word;
! A run of Char, optionally continued by further Char runs that are each
! joined by one dash, apostrophe or asterisk (e.g. hyphenated or elided forms).
73define Word Char+ ([Dash|Apos|Asterisk] Char+)*;
74
! Characters allowed inside URLs and e-mail addresses: any Char, plus every
! Sym member except "<", ">" and the double quote.
75define URLChar [Char|[Sym - ["<"|">"|%"]]];
76!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
77
! Transducer made of parallel optional replacements: each lowercase letter
! (a-z, ö, ü, ä) may optionally be rewritten to its uppercase form, and ß may
! optionally become the two-symbol sequence S S. Composing a lowercase
! pattern with this transducer yields its case variants on the lower side.
78define Caseinsensitive [
79a (->) A,
80b (->) B,
81c (->) C,
82d (->) D,
83e (->) E,
84f (->) F,
85g (->) G,
86h (->) H,
87i (->) I,
88j (->) J,
89k (->) K,
90l (->) L,
91m (->) M,
92n (->) N,
93o (->) O,
94p (->) P,
95q (->) Q,
96r (->) R,
97s (->) S,
98t (->) T,
99u (->) U,
100v (->) V,
101w (->) W,
102x (->) X,
103y (->) Y,
104z (->) Z,
105ö (->) Ö,
106ü (->) Ü,
107ä (->) Ä,
108ß (->) {SS}
109];
110
! A single ASCII letter or German umlaut/ß, composed with Caseinsensitive so
! that the uppercase variants are covered as well.
Akronf1106ec2021-11-05 13:04:44 +0100111define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];
112
! Abbreviation: an entry of the word list read from txt/abbrv.txt, or a
! single Letter, followed by a literal period.
113define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
Akron310905f2021-08-11 13:49:50 +0200114
! Word list read from txt/plusampersand.txt.
! NOTE(review): presumably words containing "+" or "&" that must stay one
! token - confirm against the list file.
Akron57d01612021-08-11 17:53:19 +0200115define Plusampersand @txt"txt/plusampersand.txt";
116
Akron310905f2021-08-11 13:49:50 +0200117! A solution to the "(author): problem" may be to add ) at the end of any
118! string as a possible ending
119
! Digits with an optional trailing period, enclosed in matching round or
! square brackets, e.g. (1999) or [2001.].
120define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
121
Akron17984c82021-10-30 11:44:37 +0200122! 20:00 Uhr, 00:12:25,34 Minuten
! Optional leading digit 0-5, one digit, then one or two ":"-separated
! two-digit groups whose first digit is 0-5, optionally followed by a comma
! and one to three fractional digits.
123define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
Akron78dba062021-10-28 19:30:46 +0200124
! Run the separate emoji script and bind its result to the name Emoji.
! NOTE(review): "define Emoji;" without a regex takes the network on top of
! the stack, so emoji.xfst is presumably expected to leave it there - confirm.
Akron310905f2021-08-11 13:49:50 +0200125source emoji.xfst
126define Emoji;
127
128! acronyms: U.S.A., I.B.M., etc.
129! use a post-filter to remove dots
! Dotted acronym: a Letter plus period, followed by one or more further
! Letter-plus-period pairs (so at least two dotted letters).
Akronf1106ec2021-11-05 13:04:44 +0100130define AcronymDep Letter %. [Letter %.]+;
Akron310905f2021-08-11 13:49:50 +0200131
! A literal "." or the obfuscated spelling "dot" in round or square brackets,
! case-insensitively, e.g. [dot] or (DOT).
132define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
! A literal "@" or the obfuscated spelling "at" in round or square brackets,
! case-insensitively, e.g. [at] or (AT).
133define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
134
! Top-level domains recognised by Domain: org, de and com, in any case.
135define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;
136
137! Very relaxed URL scheme, not based on the strict Lucene implementation
! URL: a scheme prefix (http, https, ftp or file followed by "://") or "www"
! plus Dot, then a body that starts and ends with a URLChar and may contain
! URLChar or sentence punctuation in between. Matched case-insensitively.
138define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ]
139URLChar [URLChar|SP]* URLChar
140.o. Caseinsensitive;
141
! Bare domain name: Char runs optionally joined by dashes, then Dot and one of
! the TldEnd suffixes.
142define Domain Char+ [Dash Char+]* Dot TldEnd;
143
Akron4c2a1ad2021-08-31 00:35:53 +0200144! XML rule
! XML name with optional namespace prefix: an ASCII letter followed by
! letters, digits or hyphens, optionally ":" and a second such name.
! Composed with Caseinsensitive for case variants.
145define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive;
! A single XML tag. After "<" it is either
!  - an opening/empty tag: a name, zero or more attributes (whitespace, a
!    name, and optionally "=" plus a value in double or single quotes that
!    contains neither its quote character nor ">"), and an optional "/"; or
!  - a closing tag: "/" and a name;
! followed by optional whitespace and ">". The trailing .u takes the upper
! side of the resulting transducer.
146define XML [
147 "<" [
148 [
149 XMLns
150 [WS+ XMLns WS*
151 (%= WS*
152 [[%" [? - %" - %>]+ %"] | [%' [? - %' - %>]+ %']]
153 )
154 ]*
Akron066d99c2021-10-28 19:04:59 +0200155 (WS* "/")
Akron4c2a1ad2021-08-31 00:35:53 +0200156 ]
157 |
158 [
159 "/" XMLns
160 ]
161 ] WS* ">"
162].u;
Akron310905f2021-08-11 13:49:50 +0200163
164!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
! E-mail address: a URLChar run, At, a URLChar run, and one or more
! Dot-plus-URLChar-run segments. At and Dot also accept the bracketed
! "[at]" / "[dot]" spellings.
165define Email URLChar+ At URLChar+ [Dot URLChar+]+;
166
167! Twitter user, hashtag, Google+
! Social-network handle or tag: "@", "#" or "+" followed by a Char run.
168define SNS ["@"|"#"|"+"] Char+;
169
! File name extensions recognised by File (htm/html, doc/docx, jpg/jpeg,
! mp3/mp4 and the fixed ones listed), matched case-insensitively.
170define FileEnd [
171 [{htm} ("l")]|
172 [{doc} ("x")]|
173 {pdf}|
174 ["j" "p" ("e") "g"]|
175 ["m" "p" ["3"|"4"]]|
176 {ogg}|
177 {png}|
178 {avi}|
179 {txt}|
180 {xls}|
181 {xml}|
182 {aac}|
Akrone8837b52021-08-11 17:29:58 +0200183 {gif}|
184 {exe}
Akron310905f2021-08-11 13:49:50 +0200185 ] .o. Caseinsensitive;
Akrone8837b52021-08-11 17:29:58 +0200186
! File name with optional path: an optional prefix (drive letter plus ":\",
! or a leading "/", followed by path characters and separators), then a base
! name of Char, "-" or "_", a ".", and a FileEnd extension.
187define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;
Akron310905f2021-08-11 13:49:50 +0200188
! A Word immediately followed by "str." (abbreviated street-name suffix), so
! that the trailing period stays inside the token.
Akrona0bded52021-08-11 15:48:02 +0200189define Streetname Word {str} %.;
Akron4af79f12021-08-11 14:48:17 +0200190
Akron310905f2021-08-11 13:49:50 +0200191! Also supports
192! 19.4.2015, 19/4/2015 etc.
! Separators allowed between digit groups inside a number.
193define DigitPunct ["_"|"-"|"."|","|Slash];
! Number: a digit run, any number of further digit runs each preceded by one
! DigitPunct separator, and an optional directly attached Char run.
194define Num Digit+ [DigitPunct Digit+]* (Char+);
195
Akrona0bded52021-08-11 15:48:02 +0200196! ordinals
! Ordinal number: one to three digits followed by a period.
197define Ord Digit ( Digit (Digit) ) %.;
198
Akron310905f2021-08-11 13:49:50 +0200199! TODO:
200! floating point, serial, model numbers, ip addresses, etc.
201! every other segment must have at least one digit
202
203! Omission words like "fu**ing!"
! A Char run, then two or more asterisks, then an optional Char run.
204define Omission Char+ Asterisk Asterisk+ Char*;
205
206
207! TODO: Name words with ' and `
208
Akron310905f2021-08-11 13:49:50 +0200209! Support ASCII elements, like
210! +---------------+
211! <---->, -->, <--
212! +---------------+
213! <---> | Worker Node N |
214! +---------------+
215! |============= Core =============|
216
217
Akron4c2a1ad2021-08-31 00:35:53 +0200218echo - Compile Real Token
Akron310905f2021-08-11 13:49:50 +0200219
! Union of the general token patterns. Abbr, Plusampersand, Emoji,
! Streetname, Omission and Emdash are handled by separate rules in Token.
Akron78dba062021-10-28 19:30:46 +0200220define RealToken [Punct|Word|XML|Email|URL|SNS|Domain|AcronymDep|File|Ord|Num|Years|Times];
Akron310905f2021-08-11 13:49:50 +0200221
222echo - Introduce Token splitter
Akron4c2a1ad2021-08-31 00:35:53 +0200223
! Token splitter. The comma-separated rules are parallel left-to-right
! longest-match replacements ("@->"); "... NLout" keeps the matched text and
! appends the NLout marker after it. The two composed steps then delete
! whitespace/line-break runs that directly follow an NLout marker, and any
! such run at the very start of the input (.#. is the string boundary).
224define Token [
Akron4c2a1ad2021-08-31 00:35:53 +0200225 Abbr @-> ... NLout,
Akronf1106ec2021-11-05 13:04:44 +0100226 RealToken @-> ... NLout,
Akron4c2a1ad2021-08-31 00:35:53 +0200227 Plusampersand @-> ... NLout,
228 Emoji @-> ... NLout,
229 [Streetname|Omission|Emdash] @-> ... NLout
230 ]
Akrona854faa2021-10-22 19:31:08 +0200231.o. [[WS|NL]+ @-> 0 || NLout _ ]
Akronf6bdfdb2021-10-23 15:56:53 +0200232.o. [[WS|NL]+ @-> 0 || .#. _ ]
Akron310905f2021-08-11 13:49:50 +0200233;
234
235echo - Introduce Sentence splitter
! Compile the final network: Token composed with a rule that appends one more
! NLout after a run of ".", "!" or "?" (or an ellipsis) that directly follows
! an NLout marker. The resulting doubled marker signals a sentence boundary.
! NOTE(review): the "\/" context operator is understood to match the context
! on the lower (output) side - confirm against the foma documentation.
Akron4c2a1ad2021-08-31 00:35:53 +0200236read regex Token .o. [[["."|"!"|"?"]+|"…"] @-> ... NLout \/ NLout _ ];
Akron310905f2021-08-11 13:49:50 +0200237
Akrona854faa2021-10-22 19:31:08 +0200238! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b