blob: 4193c289edc793818eef20a186435992668d1e30 [file] [log] [blame]
Akron310905f2021-08-11 13:49:50 +02001! This tokenizer is based on work by
2! - StandardTokenizerImpl by the Lucene project
3! under the Apache License
4! - https://github.com/dlwh/epic by David Hall (2014)
5! under the Apache License
6! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
7! under the Apache License
8! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
9! under the MIT License
10!
11! The abbreviation list is part of the sentence splitter tool
12! of the IDS.
13
! NLout is the marker symbol that the Token rules further down insert after
! every recognized token, and that the sentence rules insert once more after
! sentence-final punctuation. The commented-out alternative would emit a line
! feed (U+000A) instead of the symbolic marker.
Akron4af79f12021-08-11 14:48:17 +020014define NLout "@_TOKEN_SYMBOL_@";
Akron3de361e2021-08-17 09:56:42 +020015! define NLout "\u000a";
Akron310905f2021-08-11 13:49:50 +020016
! Basic character classes.

! Decimal digits. The zero stays %-escaped so that it is read as the literal
! digit; the remaining digits are written as quoted symbols.
define Digit [ %0 | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ];

! Lowercase ASCII letters a-z only; upper case is added where needed by
! composing with Caseinsensitive.
define AsciiLetter [
  "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
  "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
];

! Hexadecimal digits: decimal digits plus a-f in both cases.
define HexLetter [
  Digit |
  "a" | "b" | "c" | "d" | "e" | "f" |
  "A" | "B" | "C" | "D" | "E" | "F"
];

! U+0004, used as an end-of-text marker (it is also a member of NL).
define EOT "\u0004";
Akron310905f2021-08-11 13:49:50 +020021
22!!!!!!!!!!!!!!!!!
23! <from tmorph> !
24!!!!!!!!!!!!!!!!!
! WS: horizontal whitespace - the ASCII space, tab (U+0009), no-break space
! (U+00A0), U+1680, the U+2000..U+200A spaces, U+202F, U+205F and U+3000.
Akron3de361e2021-08-17 09:56:42 +020025define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
Akron310905f2021-08-11 13:49:50 +020026 "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
27 "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
Akron3de361e2021-08-17 09:56:42 +020028 "\u202f"|"\u205f"|"\u3000"];
29
! NL: line-breaking characters U+000A..U+000D, U+0085, U+2028 and U+2029.
! EOT is included as well, so the end-of-text marker is handled like a line break.
Akrona854faa2021-10-22 19:31:08 +020030define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
Akron310905f2021-08-11 13:49:50 +020031
32! Punctuation that ends sentences
33! Differs!
! SP: a run of one or more of . ? ! in any combination (e.g. "?!", "..."),
! or a single horizontal ellipsis character.
Akron3de361e2021-08-17 09:56:42 +020034define SP [["."|"?"|"!"]+|"…"];
35
Akron310905f2021-08-11 13:49:50 +020036! Left punctuation
! LP: punctuation that may open a unit - opening brackets, opening quotation
! marks, the straight single and double quote, two consecutive straight single
! quotes, the markup characters * / _ and two consecutive commas.
37define LP ["("|"["|"{"|
38 "“"|"‘"|"‹"|"«"|
39 "'"|%"|
40 ! differs
41 ["'" "'"] |
42 "*"|"/"|"_"| ! Can be Markdown
43 ! from book
44 [%, %,]];
Akron3de361e2021-08-17 09:56:42 +020045
Akron310905f2021-08-11 13:49:50 +020046! Right punctuation - excluding the characters that can be used as apostrophe
! RP: punctuation that may close a unit - sentence-final punctuation (SP),
! comma, semicolon, colon, closing brackets, closing quotation marks, the
! straight double quote, doubled apostrophe-like quotes, and the markup
! characters * / _. The single apostrophe-like characters are not listed
! here; they are collected in Apos.
Akrone2008412022-03-09 10:10:13 +010047define RP [SP|","|";"|":"|
Akron310905f2021-08-11 13:49:50 +020048 ")"|"]"|"}"|
Akrone2008412022-03-09 10:10:13 +010049 "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
Akrone96895f2022-03-08 19:58:37 +010050 "*"|"/"|"_"]; ! Can be Markdown
Akron310905f2021-08-11 13:49:50 +020051
! Sym: single symbol characters (- + < > * / = @ &).
Akron6742b962021-11-09 01:17:20 +010052define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
! Apos: characters that can act as an apostrophe (straight quote, right single
! quotation mark, backtick). They are excluded from Char and handled inside Word.
Akron310905f2021-08-11 13:49:50 +020053define Apos %'|%’|%`;
! Punct: everything in LP, RP or Sym; each such unit is a token of its own
! because Punct is a member of RealToken.
Akron4c2a1ad2021-08-31 00:35:53 +020054define Punct [LP|RP|Sym];
Akron310905f2021-08-11 13:49:50 +020055!define nonSym \[WS|LP|RP|Sym];
56!!!!!!!!!!!!!!!!!!
57! </from tmorph> !
58!!!!!!!!!!!!!!!!!!
59
! Emdash: two or more ASCII hyphens, or a run of one or more of the long-dash
! characters U+2014, U+2015, U+2E3A, U+2E3B, U+FE58.
60define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
! Dash: a single hyphen-like character (ASCII hyphen-minus, U+2011..U+2013,
! U+2E1A, U+FE63, U+FF0D). Used to join the parts of compound words.
61define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
! Slash: slash-like characters - FRACTION SLASH (U+2044), DIVISION SLASH
! (U+2215), FULLWIDTH SOLIDUS (U+FF0F) and the ASCII solidus.
! The ASCII "/" used to be listed twice in this union; the redundant
! alternative now covers the fullwidth form, in line with Dash, which accepts
! the fullwidth hyphen-minus ("\uff0d").
define Slash ["⁄"|"∕"|"\uff0f"|"/"];
! Asterisk: the ASCII asterisk; used inside Word and Omission.
63define Asterisk ["*"];
64
! Char: any single symbol that is not whitespace, a line break, punctuation
! or an apostrophe character.
Akron3de361e2021-08-17 09:56:42 +020065define Char \[WS|NL|Punct|Apos]; ! |¨;
Akron310905f2021-08-11 13:49:50 +020066
! Word (first stage): a run of Char, optionally continued by further runs
! that are each introduced by an apostrophe or an asterisk, optionally ending
! in s/S followed by a right single quotation mark or backtick.
Akronb02ad072022-01-19 12:41:44 +010067define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
Akron936c0f52021-12-07 11:30:53 +010068
! Plusampersand: word list read from txt/plusampersand.txt.
69define Plusampersand @txt"txt/plusampersand.txt";
! Word (second stage): redefines Word in terms of the first-stage Word, so
! that list entries and plain words can be chained with Dash into one token.
! The order of the two Word definitions matters.
70define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
71
Akron310905f2021-08-11 13:49:50 +020072
! URLChar: a Char, or any Sym except < and >. The double quote in the
! subtracted set is not a member of Sym, so removing it has no effect there.
73define URLChar [Char|[Sym - ["<"|">"|%"]]];
74!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
75
! Caseinsensitive: parallel optional replacements. Composing a lowercase
! pattern with this transducer lets every listed letter surface either
! unchanged or as its capital; the sharp s may optionally surface as the two
! letters SS.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F, g (->) G,
  h (->) H, i (->) I, j (->) J, k (->) K, l (->) L, m (->) M, n (->) N,
  o (->) O, p (->) P, q (->) Q, r (->) R, s (->) S, t (->) T, u (->) U,
  v (->) V, w (->) W, x (->) X, y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  ß (->) {SS}
];
108
! Letter: one ASCII letter, umlaut or sharp s, composed with Caseinsensitive
! so that the capital forms are covered too.
Akronf1106ec2021-11-05 13:04:44 +0100109define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];
110
Akron11a05d92021-11-06 13:17:11 +0100111! Abbreviations and Initials
! Abbr: an entry of txt/abbrv.txt or a single Letter, followed by a dot that
! stays part of the token.
Akronf1106ec2021-11-05 13:04:44 +0100112define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
Akron310905f2021-08-11 13:49:50 +0200113
114! A solution to the "(author): problem" may be to add ) at the end of any
115! string as a possible ending
116
! Years: digits with an optional trailing dot, enclosed in round or square
! brackets, e.g. "(1999)" or "[2001.]".
117define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
118
Akron17984c82021-10-30 11:44:37 +0200119! 20:00 Uhr, 00:12:25,34 Minuten
! Times: one digit, optionally preceded by a digit 0-5; then one or two groups
! of ":" plus two digits whose first is 0-5; then optionally a comma and one
! to three digits.
120define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
Akron78dba062021-10-28 19:30:46 +0200121
! Run the external script emoji.xfst and bind its result to the name Emoji.
! NOTE(review): the argument-less define presumably names the network that
! the sourced script leaves on the stack - confirm against emoji.xfst.
Akron310905f2021-08-11 13:49:50 +0200122source emoji.xfst
123define Emoji;
124
125! acronyms: U.S.A., I.B.M., etc.
126! use a post-filter to remove dots
! AcronymDep: at least three letter-plus-dot pairs in a row (e.g. "U.S.A.").
Akron54ed7e72022-01-04 12:05:00 +0100127define AcronymDep Letter %. Letter %. [Letter %.]+;
Akron310905f2021-08-11 13:49:50 +0200128

! Dot / At: the literal character, or the spelled-out word "dot" / "at" inside
! round or square brackets (opening and closing bracket need not be of the
! same kind), matched case-insensitively.
129define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
130define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
131
! TldEnd: the top-level domains accepted by Domain (org, de, com), matched
! case-insensitively.
132define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;
133
134! Very relaxed URL scheme, not based on the strict Lucene implementation
! URL: a scheme prefix (http, https, ftp or file followed by "://") or "www"
! plus Dot, then at least two URL characters; sentence punctuation (SP) is
! allowed in the middle but not as the first or last character. The whole
! pattern is matched case-insensitively.
135define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ]
136URLChar [URLChar|SP]* URLChar
137.o. Caseinsensitive;
138
! Domain: a name made of Char runs optionally joined by Dash, followed by Dot
! and one of the TldEnd suffixes. Char excludes ".", so only the label
! directly before the suffix is part of this pattern.
139define Domain Char+ [Dash Char+]* Dot TldEnd;
140
Akronc8406362021-11-09 20:17:50 +0100141! XML sources
! Run xml.xfst and bind its result to the name XML.
! NOTE(review): as with the other sourced scripts, the argument-less define
! presumably names the network left on the stack - confirm against xml.xfst.
142source xml.xfst
143define XML;
Akron310905f2021-08-11 13:49:50 +0200144

Akron6742b962021-11-09 01:17:20 +0100145! XML entities
! Run entities.xfst and bind its result to the name XMLEntities, which is a
! member of RealToken.
146source entities.xfst
147define XMLEntities;
148
149
150! Email addresses
! Email: local part, At (literal "@" or bracketed "at"), then a host made of
! at least two URLChar runs separated by Dot.
Akron310905f2021-08-11 13:49:50 +0200151define Email URLChar+ At URLChar+ [Dot URLChar+]+;
152

153! Twitter user, hashtag, Google+
! SNS: one of @ # + immediately followed by one or more Char.
154define SNS ["@"|"#"|"+"] Char+;
155
! FileEnd: the file name extensions accepted by File, matched
! case-insensitively through composition with Caseinsensitive.
define FileEnd [
    [ {htm} (l) ] | [ {doc} (x) ] | [ {ppt} (x) ] |
    {xls} | {pdf} | {txt} | {xml} |
    [ {jp} (e) g ] | {png} | {gif} |
    [ {mp} [ 3 | 4 ] ] | {ogg} | {avi} | {aac} |
    {exe}
] .o. Caseinsensitive;
Akrone8837b52021-08-11 17:29:58 +0200173
! File: an optional path prefix (a drive letter followed by ":\" or a leading
! "/", then any mix of Char, "_", "-" and Char-plus-separator), a base name of
! Char, "-" or "_", a literal ".", and one of the FileEnd extensions.
! NOTE(review): AsciiLetter is lowercase only, so an uppercase drive letter
! such as "C:\" is not covered by the prefix - confirm whether that is intended.
174define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;
Akron310905f2021-08-11 13:49:50 +0200175

! Streetname: a Word immediately followed by the abbreviation "str." so that
! the dot stays part of the token.
Akrona0bded52021-08-11 15:48:02 +0200176define Streetname Word {str} %.;
Akron4af79f12021-08-11 14:48:17 +0200177
Akron310905f2021-08-11 13:49:50 +0200178! Also supports
179! 19.4.2015, 19/4/2015 etc.
! DigitPunct: separators allowed between digit groups ( _ - . , and Slash).
180define DigitPunct ["_"|"-"|"."|","|Slash];
! Num: digit groups joined by single DigitPunct separators, optionally
! followed directly by a run of Char (e.g. a unit or suffix).
181define Num Digit+ [DigitPunct Digit+]* (Char+);
182

Akrona0bded52021-08-11 15:48:02 +0200183! ordinals
! Ord: one to three digits followed by a dot.
184define Ord Digit ( Digit (Digit) ) %.;
185
Akron310905f2021-08-11 13:49:50 +0200186! TODO:
187! floating point, serial, model numbers, ip addresses, etc.
188! every other segment must have at least one digit
189
190! Omission words like "fu**ing!"
! Omission: at least one Char, then two or more asterisks, then any number of
! trailing Char.
191define Omission Char+ Asterisk Asterisk+ Char*;
192
193
194! TODO: Name words with ' and `
195
Akron310905f2021-08-11 13:49:50 +0200196! Support ASCII elements, like
197! +---------------+
198! <---->, -->, <--
199! +---------------+
200! <---> | Worker Node N |
201! +---------------+
202! |============= Core =============|
203
204
! RealToken: union of all token patterns defined in this file, plus
! XMLEntities. XML, URL, Email, File, Domain and Emoji are not part of this
! union; they get their own rules in Token.
Akron4c2a1ad2021-08-31 00:35:53 +0200205echo - Compile Real Token
Akron310905f2021-08-11 13:49:50 +0200206

Akrone87906b2021-11-24 10:39:14 +0100207define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
Akron310905f2021-08-11 13:49:50 +0200208
! Token: parallel longest-match replacement rules. Each rule keeps the matched
! span (the "...") and appends NLout after it, so every recognized token is
! followed by the token marker.
209echo - Introduce Token splitter
Akron4c2a1ad2021-08-31 00:35:53 +0200210

211define Token [
Akronf1106ec2021-11-05 13:04:44 +0100212 RealToken @-> ... NLout,
Akron71986452021-11-09 01:36:30 +0100213 XML @-> ... NLout,
214 URL @-> ... NLout,
215 Email @-> ... NLout,
216 File @-> ... NLout,
Akron71986452021-11-09 01:36:30 +0100217 Domain @-> ... NLout,
Akronfac8abc2021-11-10 07:19:59 +0100218 Emoji @-> ... NLout
Akrone96895f2022-03-08 19:58:37 +0100219];
Akron310905f2021-08-11 13:49:50 +0200220
! Final network: Token composed with three further stages, in this order.
!  1. Sentence punctuation followed by the token marker and a closing quote
!     gets one more NLout appended, provided the quote is followed by NLout
!     and then by something other than a comma.
!  2. Sentence punctuation that stands between two NLout markers gets one more
!     NLout appended, provided the symbol after the following marker is not a
!     closing quote (that case is left to stage 1).
!  3. Runs of whitespace and line breaks at the start of the input or directly
!     after an NLout marker are deleted.
! NOTE(review): the doubled NLout presumably serves as the sentence boundary
! for the consumer of this transducer - confirm against the calling code.
221echo - Introduce Sentence splitter
Akrone96895f2022-03-08 19:58:37 +0100222! And compose Whitespace ignorance
223read regex Token .o. [
Akrone2008412022-03-09 10:10:13 +0100224 SP NLout ["”"|"›"|"»"|%"|%’|"'"] @-> ... NLout \/ _ NLout \%,
Akrone96895f2022-03-08 19:58:37 +0100225] .o. [
Akrone2008412022-03-09 10:10:13 +0100226 SP @-> ... NLout \/ NLout _ NLout [? - "”" - "›" - "»" - %" - %’ - "'"]
Akrone96895f2022-03-08 19:58:37 +0100227] .o. [
228 [WS|NL]+ @-> 0 || [ .#. | NLout ] _
229];
Akron310905f2021-08-11 13:49:50 +0200230
Akrona854faa2021-10-22 19:31:08 +0200231! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b