blob: f20d7e2bed6baf788dc8b9304bf9abe24fd14467 [file] [log] [blame]
Akron310905f2021-08-11 13:49:50 +02001! This tokenizer is based on work by
2! - StandardTokenizerImpl by the Lucene project
3! under the Apache License
4! - https://github.com/dlwh/epic by David Hall (2014)
5! under the Apache License
6! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
7! under the Apache License
8! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
9! under the MIT License
10!
11! The abbreviation list is part of the sentence splitter tool
12! of the IDS.
13
! NLout is the boundary marker that the Token and sentence rules further down
! insert after each match; the commented-out variant below would use a plain
! line feed instead.
Akron4af79f12021-08-11 14:48:17 +020014define NLout "@_TOKEN_SYMBOL_@";
Akron3de361e2021-08-17 09:56:42 +020015! define NLout "\u000a";
Akron310905f2021-08-11 13:49:50 +020016
! Digit: one ASCII decimal digit. The zero is written %0 because an unescaped
! 0 denotes the empty string in xfst regular expressions (it is used that way
! in the final whitespace-deletion rule of this file).
17define Digit [%0|1|2|3|4|5|6|7|8|9];
! AsciiLetter: one lowercase ASCII letter; upper case is not included here.
Akrone8837b52021-08-11 17:29:58 +020018define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
! HexLetter: one hexadecimal digit in either case. It is not referenced again
! in this file; presumably one of the sourced scripts uses it (confirm).
Akron11a05d92021-11-06 13:17:11 +010019define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
! EOT: the U+0004 control character; NL below treats it like a line break.
Akrona854faa2021-10-22 19:31:08 +020020define EOT "\u0004";
Akron310905f2021-08-11 13:49:50 +020021
22!!!!!!!!!!!!!!!!!
23! <from tmorph> !
24!!!!!!!!!!!!!!!!!
! WS: a single horizontal whitespace character - space, tab (U+0009),
! no-break space (U+00A0), U+1680, U+2000 through U+200A, U+202F, U+205F
! and U+3000.
Akron3de361e2021-08-17 09:56:42 +020025define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
Akron310905f2021-08-11 13:49:50 +020026 "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
27 "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
Akron3de361e2021-08-17 09:56:42 +020028 "\u202f"|"\u205f"|"\u3000"];
29
! NL: a single line-breaking character (U+000A, U+000B, U+000C, U+000D,
! U+0085, U+2028, U+2029) or the EOT symbol defined above.
Akrona854faa2021-10-22 19:31:08 +020030define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
Akron310905f2021-08-11 13:49:50 +020031
32! Punctuation that ends sentences
33! Differs!
! SP: either a run of one or more of . ? ! in any mix (so "?!" and "..." are
! each one SP) or a single horizontal-ellipsis character.
Akron3de361e2021-08-17 09:56:42 +020034define SP [["."|"?"|"!"]+|"…"];
35
Akron310905f2021-08-11 13:49:50 +020036! Left punctuation
! LP: one opening punctuation token. Alternatives, in order: opening
! brackets; opening curly/angle quotes; the straight single and double quote;
! two straight single quotes in sequence; the characters * / _ ; and two
! commas in sequence (presumably an ASCII stand-in for a low opening quote -
! confirm).
37define LP ["("|"["|"{"|
38 "“"|"‘"|"‹"|"«"|
39 "'"|%"|
40 ! differs
41 ["'" "'"] |
42 "*"|"/"|"_"| ! Can be Markdown
43 ! from book
44 [%, %,]];
Akron3de361e2021-08-17 09:56:42 +020045
Akron310905f2021-08-11 13:49:50 +020046! Right punctuation - excluding the characters that can be used as apostrophe
! RP: one closing punctuation token. Alternatives, in order: sentence
! punctuation (SP); comma, semicolon, colon; closing brackets; the closing
! quotes ” › » and the straight double quote; a doubled ’, a doubled straight
! single quote or a doubled ‘; and the characters * / _ .
! A single ’ or ' is deliberately absent because Apos claims those.
! The first two closing quotes were empty string literals before; they are
! restored as the counterparts of the “ and ‹ listed in LP, matching the
! "”" and "›" that the sentence-splitting rules at the end of the file expect.
Akrone2008412022-03-09 10:10:13 +010047define RP [SP|","|";"|":"|
Akron310905f2021-08-11 13:49:50 +020048 ")"|"]"|"}"|
Akrone2008412022-03-09 10:10:13 +010049 "”"|"›"|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
Akrone96895f2022-03-08 19:58:37 +010050 "*"|"/"|"_"]; ! Can be Markdown
Akron310905f2021-08-11 13:49:50 +020051
! Sym: one of the listed symbol characters; = @ and & are %-escaped.
Akron6742b962021-11-09 01:17:20 +010052define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
! Apos: one apostrophe-like character - straight quote, ’ or backtick.
Akron310905f2021-08-11 13:49:50 +020053define Apos %'|%’|%`;
! Punct: any left punctuation, right punctuation or symbol.
Akron4c2a1ad2021-08-31 00:35:53 +020054define Punct [LP|RP|Sym];
Akron310905f2021-08-11 13:49:50 +020055!define nonSym \[WS|LP|RP|Sym];
56!!!!!!!!!!!!!!!!!!
57! </from tmorph> !
58!!!!!!!!!!!!!!!!!!
59
! Emdash: either two ASCII hyphens followed by any number of further hyphens
! (the repeated element is itself optional), or a run of one or more of
! U+2014, U+2015, U+2E3A, U+2E3B, U+FE58.
60define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
! Dash: a single hyphen-like character - ASCII hyphen-minus, U+2011, U+2012,
! U+2013, U+2E1A, U+FE63 or U+FF0D. Used below to join the parts of a Word.
61define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
! Slash: a single slash-like character - FRACTION SLASH (U+2044), DIVISION
! SLASH (U+2215), FULLWIDTH SOLIDUS (U+FF0F) or the ASCII slash. Used by
! DigitPunct for dates such as 19/4/2015.
! The list previously named the ASCII slash twice; the redundant entry is
! replaced by the fullwidth form, written as an escape in the same way that
! Dash writes the fullwidth hyphen-minus.
62define Slash ["⁄"|"∕"|"\uff0f"|"/"];
! Asterisk: the literal * character.
63define Asterisk ["*"];
64
! Char: any single symbol that is not whitespace, a line break, punctuation
! or an apostrophe (\ is the term complement).
Akron3de361e2021-08-17 09:56:42 +020065define Char \[WS|NL|Punct|Apos]; ! |¨;
Akron310905f2021-08-11 13:49:50 +020066
! Word (first pass): one or more Char, then any number of groups made of an
! apostrophe or asterisk followed by one or more Char, then optionally an
! s or S followed by ’ or backtick.
Akronb02ad072022-01-19 12:41:44 +010067define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
Akron936c0f52021-12-07 11:30:53 +010068
! Plusampersand: word list loaded from txt/plusampersand.txt.
69define Plusampersand @txt"txt/plusampersand.txt";
! Word (final): redefines Word in terms of the previous definition - a
! Plusampersand entry or a first-pass Word, followed by any number of
! Dash-joined further parts of the same kind.
70define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
71
Akron310905f2021-08-11 13:49:50 +020072!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
73
! Caseinsensitive: parallel optional replacement rules ((->), separated by
! commas) that may rewrite each lowercase letter a-z and the umlauts to its
! uppercase form, and ß to the two-letter string SS. Because the rules are
! optional, the unchanged lowercase form remains possible as well.
74define Caseinsensitive [
75a (->) A,
76b (->) B,
77c (->) C,
78d (->) D,
79e (->) E,
80f (->) F,
81g (->) G,
82h (->) H,
83i (->) I,
84j (->) J,
85k (->) K,
86l (->) L,
87m (->) M,
88n (->) N,
89o (->) O,
90p (->) P,
91q (->) Q,
92r (->) R,
93s (->) S,
94t (->) T,
95u (->) U,
96v (->) V,
97w (->) W,
98x (->) X,
99y (->) Y,
100z (->) Z,
101ö (->) Ö,
102ü (->) Ü,
103ä (->) Ä,
104ß (->) {SS}
105];
106
! Letter: the lowercase ASCII letters plus ö ü ä ß, composed with
! Caseinsensitive so that the uppercase variants become available too.
! NOTE(review): the result of .o. is a transducer and no .u/.l projection is
! applied here - confirm that Abbr and AcronymDep behave as intended.
Akronf1106ec2021-11-05 13:04:44 +0100107define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];
108
Akron11a05d92021-11-06 13:17:11 +0100109! Abbreviations and Initials
! Abbr: an entry of the word list txt/abbrv.txt, or a single Letter (an
! initial), in both cases followed by a literal dot.
Akronf1106ec2021-11-05 13:04:44 +0100110define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;
Akron310905f2021-08-11 13:49:50 +0200111
112! A solution to the "(author): problem" may be to add ) at the end of any
113! string as a possible ending
114
! Years: one or more digits with an optional trailing dot, enclosed either in
! parentheses or in square brackets, e.g. "(2015)" or "[1999.]".
115define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
116
Akron17984c82021-10-30 11:44:37 +0200117! 20:00 Uhr, 00:12:25,34 Minuten
! (English: "20:00 o'clock", "00:12:25,34 minutes".)
! Times: an optional leading digit 0-5 and one further digit, then one or two
! groups of ":" plus a digit 0-5 plus any digit, then optionally a comma
! followed by one to three digits.
118define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
Akron78dba062021-10-28 19:30:46 +0200119
! The source statements in this section load further xfst scripts. The names
! Emoticons, XML, XMLEntities, URL, Email, File, Domain and SNS used by the
! token rules later in this file are not defined here; presumably they come
! from these scripts (confirm).
Akron61948ef2022-03-30 14:07:57 +0200120! Emoticons
Akronb98e4cf2022-03-27 23:56:49 +0200121source emoticons.xfst
Akron310905f2021-08-11 13:49:50 +0200122
123! acronyms: U.S.A., I.B.M., etc.
124! use a post-filter to remove dots
! AcronymDep: at least three Letter-plus-dot pairs in a row.
Akron54ed7e72022-01-04 12:05:00 +0100125define AcronymDep Letter %. Letter %. [Letter %.]+;
Akron310905f2021-08-11 13:49:50 +0200126
Akronc8406362021-11-09 20:17:50 +0100127! XML sources
128source xml.xfst
Akron310905f2021-08-11 13:49:50 +0200129
Akron6742b962021-11-09 01:17:20 +0100130! XML entities
131source entities.xfst
Akron6742b962021-11-09 01:17:20 +0100132
Akron61948ef2022-03-30 14:07:57 +0200133! Technical protocols
134source protocols.xfst
Akron310905f2021-08-11 13:49:50 +0200135
! Streetname: a Word directly followed by the literal string "str" and a dot.
Akrona0bded52021-08-11 15:48:02 +0200136define Streetname Word {str} %.;
Akron4af79f12021-08-11 14:48:17 +0200137
Akron310905f2021-08-11 13:49:50 +0200138! Also supports
139! 19.4.2015, 19/4/2015 etc.
! DigitPunct: a single separator allowed between digit groups - underscore,
! hyphen, dot, comma or any Slash variant.
140define DigitPunct ["_"|"-"|"."|","|Slash];
! Num: one or more digits, then any number of groups of one DigitPunct plus
! one or more digits, optionally followed by a run of Char.
141define Num Digit+ [DigitPunct Digit+]* (Char+);
142
Akrona0bded52021-08-11 15:48:02 +0200143! ordinals
! Ord: one to three digits followed by a literal dot.
144define Ord Digit ( Digit (Digit) ) %.;
145
Akron310905f2021-08-11 13:49:50 +0200146! TODO:
147! floating point, serial, model numbers, ip addresses, etc.
148! every other segment must have at least one digit
149
150! Omission words like "fu**ing!"
! Omission: one or more Char, then at least two asterisks, then zero or more
! Char. A trailing "!" as in the example is not a Char and is not included.
151define Omission Char+ Asterisk Asterisk+ Char*;
152
153
154! TODO: Name words with ' and `
155
Akron310905f2021-08-11 13:49:50 +0200156! Support ASCII elements, like
157! +---------------+
158! <---->, -->, <--
159! +---------------+
160! <---> | Worker Node N |
161! +---------------+
162! |============= Core =============|
163
164
Akron4c2a1ad2021-08-31 00:35:53 +0200165echo - Compile Real Token
Akron310905f2021-08-11 13:49:50 +0200166
! RealToken: union of the token classes defined in this file, plus SNS and
! XMLEntities, which are not defined here (presumably they come from the
! sourced scripts - confirm).
Akrone87906b2021-11-24 10:39:14 +0100167define RealToken [Punct|Emdash|Abbr|Streetname|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
Akron310905f2021-08-11 13:49:50 +0200168
169echo - Introduce Token splitter
Akron4c2a1ad2021-08-31 00:35:53 +0200170
! Token: parallel markup rules, one per token class. Each rule uses @->
! (left-to-right, longest match) and "... NLout", which keeps the matched
! text and appends the NLout marker after it.
171define Token [
Akron4222ac82022-03-11 01:06:21 +0100172 RealToken @-> ... NLout,
Akron71986452021-11-09 01:36:30 +0100173 XML @-> ... NLout,
174 URL @-> ... NLout,
175 Email @-> ... NLout,
176 File @-> ... NLout,
Akron71986452021-11-09 01:36:30 +0100177 Domain @-> ... NLout,
Akronb98e4cf2022-03-27 23:56:49 +0200178 Emoticons @-> ... NLout
Akrone96895f2022-03-08 19:58:37 +0100179];
Akron310905f2021-08-11 13:49:50 +0200180
181echo - Introduce Sentence splitter
Akrone96895f2022-03-08 19:58:37 +0100182! And compose Whitespace ignorance
Akronece3f012022-03-09 19:12:15 +0100183
! DQuotes: one double-quote-like character (” or straight " or » or «).
184define DQuotes ["”"|%"|"»"|"«"];
! NotSmallCaps: any single symbol except the lowercase letters a-z, ü, ö, ä.
Akron4222ac82022-03-11 01:06:21 +0100185define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
Akronece3f012022-03-09 19:12:15 +0100186

! Final network: Token composed with four further rule stages, in this order.
! Each of the first three stages appends one more NLout after its match
! (presumably a doubled NLout is what the consumer reads as a sentence
! boundary - confirm). Their contexts use \/ rather than ||.
!  1. SP, NLout and closing material - a DQuotes optionally followed by
!     NLout ")"; or one of › ‹ ’ ' optionally followed by NLout DQuotes
!     (optionally NLout ")") or by NLout ")"; or a bare ")" - optionally
!     followed by NLout SP. Applies when followed by NLout and a symbol that
!     is not a comma.
!  2. SP preceded by NLout and followed by NLout plus one symbol that is none
!     of ” " » « › ‹ ’ ' ) and not NLout.
!  3. Three literal dots followed by NLout, one or more WS and a symbol that
!     is not a lowercase letter (NotSmallCaps).
!  4. Runs of WS/NL are deleted (replaced by 0) at the start of the input or
!     directly after an NLout.
Akrone96895f2022-03-08 19:58:37 +0100187read regex Token .o. [
Akron7aa1cbe2022-03-30 12:44:04 +0200188 SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP) @-> ... NLout \/ _ NLout \%,
Akrone96895f2022-03-08 19:58:37 +0100189] .o. [
Akronf94b9ce2022-03-27 18:18:09 +0200190 SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout]
Akrone96895f2022-03-08 19:58:37 +0100191] .o. [
Akron4222ac82022-03-11 01:06:21 +0100192 [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
193] .o. [
Akrone96895f2022-03-08 19:58:37 +0100194 [WS|NL]+ @-> 0 || [ .#. | NLout ] _
195];
Akron310905f2021-08-11 13:49:50 +0200196
Akrona854faa2021-10-22 19:31:08 +0200197! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b