blob: 56917ea37370225287002d7089e77063a340ab31 [file] [log] [blame]
! This tokenizer is based on work by
! - StandardTokenizerImpl by the Lucene project
! under the Apache License
! - https://github.com/dlwh/epic by David Hall (2014)
! under the Apache License
! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
! under the Apache License
! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
! under the MIT License
!
! The abbreviation list is part of the sentence splitter tool
! of the IDS.
13
! Boundary marker written to the output side by the token and sentence
! splitter rules below.
! Disabled alternative that uses a plain line feed instead:
! define NLout "\u000a";
define NLout "@_TOKEN_SYMBOL_@";

! Line break on the input side: a line feed, optionally preceded by a
! carriage return.
define NLin [ ("\u000d") "\u000a" ];
17
! Decimal digits. The zero is escaped with % because an unescaped 0
! denotes the empty string in xfst regular expressions.
define Digit [ %0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ];

! Lower-case ASCII letters only (no upper-case variants are listed here).
define AsciiLetter [
  a | b | c | d | e | f | g | h | i | j | k | l | m |
  n | o | p | q | r | s | t | u | v | w | x | y | z
];
Akron310905f2021-08-11 13:49:50 +020020
21!!!!!!!!!!!!!!!!!
22! <from tmorph> !
23!!!!!!!!!!!!!!!!!
! Whitespace: ASCII space, tab, line feed and carriage return, the Unicode
! space and separator characters listed below, and the NLin line break.
define WS [
  " " | "\u0009" | "\u000a" | "\u000d" |
  "\u00a0" | "\u1680" |
  "\u2000" | "\u2001" | "\u2002" | "\u2003" | "\u2004" | "\u2005" |
  "\u2006" | "\u2007" | "\u2008" | "\u2009" | "\u200a" |
  "\u2028" | "\u2029" | "\u202f" | "\u205f" | "\u3000" |
  NLin
];
29
! Punctuation that ends sentences: a single horizontal ellipsis character,
! or a non-empty run of ".", "?" and "!".
! Differs!
! Warning! This results in '...' being a MCS!
define SP [ "…" | [ "." | "?" | "!" ]+ ];
! Left punctuation: opening brackets and quotes, plus a few two-character
! sequences that act as opening quotes.
define LP [
  "(" | "[" | "{" |
  "“" | "‘" | "‹" | "«" |
  "'" | %" |
  "*" | "/" | "_" |     ! Can be Markdown
  ["'" "'"] |           ! differs
  [%, %,]               ! from book
];
! Right punctuation - excluding the characters that can be used as apostrophe.
! Contains the sentence-final punctuation SP, clause punctuation, closing
! brackets and quotes, and doubled quote characters.
define RP [
  SP | "," | ";" | ":" |
  ")" | "]" | "}" |
  "”" | "›" | "»" |
  %" |
  "*" | "/" | "_" |       ! Can be Markdown
  ["'" "'"] |             ! differs
  [%‘ %‘] | [%’ %’]       ! from book
];
52
! Symbol characters that are neither left nor right punctuation.
define Sym [ "-" | "+" | "<" | ">" | "*" | "/" | %= | %@ ];

! Characters that can serve as an apostrophe.
define Apos [ %' | %’ | %` ];

! Any punctuation: left, right, or symbol.
define Punct [ LP | RP | Sym ];
56!define nonSym \[WS|LP|RP|Sym];
57!!!!!!!!!!!!!!!!!!
58! </from tmorph> !
59!!!!!!!!!!!!!!!!!!
60
! Long dashes: two or more ASCII hyphens, or a non-empty run of
! U+2014, U+2015, U+2E3A, U+2E3B and U+FE58.
define Emdash [
  [ %- %- %-* ] |
  [ "\u2014" | "\u2015" | "\u2e3a" | "\u2e3b" | "\ufe58" ]+
];

! Single hyphen-like characters: ASCII hyphen-minus plus
! U+2011, U+2012, U+2013, U+2E1A, U+FE63 and U+FF0D.
define Dash [
  "-" | "\u2011" | "\u2012" | "\u2013" | "\u2e1a" | "\ufe63" | "\uff0d"
];
! Slash variants: fraction slash (U+2044), division slash (U+2215),
! fullwidth solidus (U+FF0F) and the ASCII solidus.
! The ASCII solidus used to be listed twice, which left one alternative
! without effect. The fullwidth form is written as an escape so that it
! cannot be folded to ASCII by accident, matching the "\uff0d" entry in Dash.
define Slash ["⁄"|"∕"|"\uff0f"|"/"];

! The ASCII asterisk.
define Asterisk ["*"];
65
! A character is any single symbol that is neither whitespace,
! punctuation nor an apostrophe.
define Char \[ WS | Punct | Apos ]; ! |¨;

! source lexicon.xfst
! define Word;
! A word is a non-empty run of characters. A dash, apostrophe or asterisk
! may appear inside it, provided at least one character follows.
define Word Char+ [ [ Dash | Apos | Asterisk ] Char+ ]*;

! Characters allowed in URLs, e-mail addresses and XML tags: every Char,
! plus the symbols from Sym except "<", ">" and the double quote.
define URLChar [ Char | [ Sym - [ "<" | ">" | %" ] ] ];
73!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
74
! Parallel optional replacements of lower-case letters by their upper-case
! counterparts (ß becomes the two-letter string SS). Other definitions in
! this file compose their lower-case patterns with this relation via .o.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F, g (->) G,
  h (->) H, i (->) I, j (->) J, k (->) K, l (->) L, m (->) M, n (->) N,
  o (->) O, p (->) P, q (->) Q, r (->) R, s (->) S, t (->) T, u (->) U,
  v (->) V, w (->) W, x (->) X, y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  ß (->) {SS}
];
107
! Abbreviations, read as a word list from a text file.
! RealToken below matches them only when a "." follows.
define Abbr [ @txt"txt/abbrv.txt" ];

! Word list read from txt/plusampersand.txt (presumably words that contain
! "+" or "&" - confirm against the list file).
define Plusampersand [ @txt"txt/plusampersand.txt" ];
111
! A solution to the "(author): problem" may be to add ) at the end of any
! string as a possible ending

! A digit run with an optional trailing dot, enclosed in matching round
! or square brackets, e.g. (1999) or [2015.]
define Years [
  [ "(" Digit+ (".") ")" ] |
  [ "[" Digit+ (".") "]" ]
];

! The define without a regular expression names the network on top of the
! stack, so it must directly follow the source command (presumably
! emoji.xfst leaves the emoji network there - confirm against that file).
source emoji.xfst
define Emoji;
119
! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
! At least two "character + dot" pairs are required.
define AcronymDep [ Char %. ] [ Char %. ]+;

! A literal "." or the spelled-out word "dot" between an opening "[" or "("
! and a closing ")" or "]" (the bracket types need not match).
! The whole union is composed with Caseinsensitive.
define Dot [ "." | [ [ "[" | "(" ] {dot} [ ")" | "]" ] ] ] .o. Caseinsensitive;

! Same pattern for "@" and the spelled-out word "at".
define At [ "@" | [ [ "[" | "(" ] {at} [ ")" | "]" ] ] ] .o. Caseinsensitive;

! Top-level domain endings accepted by Domain.
define TldEnd [ {org} | {de} | {com} ] .o. Caseinsensitive;
128
! Very relaxed URL scheme, not based on the strict Lucene implementation.
! A URL starts with http://, https://, ftp:// or file://, or with "www"
! followed by a Dot. At least two URL characters follow; sentence
! punctuation may occur between them but not at the very end.
define URL [
  [ [ [ {http} (s) ] | {ftp} | {file} ] ":" "/" "/" ] |
  [ {www} Dot ]
]
URLChar [ URLChar | SP ]* URLChar
.o. Caseinsensitive;

! Bare domain name: character runs joined by dashes, then a Dot and one of
! the endings in TldEnd.
define Domain Char+ [ Dash Char+ ]* Dot TldEnd;
135
! Opening of an XML-like tag: "<", at least one URL character and an
! optional closing ">".
!define XML "<" Alpha URLChar* (">");
define XML [ "<" URLChar+ ( ">" ) ];

! E-mail address: local part, At, then a host made of at least two
! Dot-separated parts.
!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
define Email [ URLChar+ At URLChar+ [ Dot URLChar+ ]+ ];

! Twitter user, hashtag, Google+
define SNS [ "@" | "#" | "+" ] Char+;
144
! Known file name extensions, in any letter case.
define FileEnd [
  {htm} | {html} |
  {doc} | {docx} |
  {pdf} |
  {jpg} | {jpeg} |
  {mp3} | {mp4} |
  {ogg} | {png} | {avi} | {txt} | {xls} | {xml} | {aac} | {gif} | {exe}
] .o. Caseinsensitive;

! File name with an optional path in front of it.
! The path may start with a drive prefix (lower-case ASCII letter, ":" and
! a backslash) or with "/", followed by path characters; the base name is
! a non-empty run of characters, "-" and "_", then "." and a known
! extension.
define File
  ( ( [ AsciiLetter ":" %\ ] | "/" )
    [ Char | "_" | "-" | [ Char [ %\ | "/" ] ] ]* )
  [ Char | "-" | "_" ]+
  "." FileEnd;
Akron310905f2021-08-11 13:49:50 +0200163
! A word directly followed by the abbreviation "str."
define Streetname [ Word {str} %. ];

! Also supports
! 19.4.2015, 19/4/2015 etc.
! Digit groups may be joined by one of the separators below; any run of
! characters may be attached directly after the last digit group.
define DigitPunct [ "_" | "-" | "." | "," | Slash ];
define Num Digit+ [ DigitPunct Digit+ ]* Char*;

! ordinals: one to three digits followed by a dot
define Ord [ Digit | [ Digit Digit ] | [ Digit Digit Digit ] ] %.;
173
Akron310905f2021-08-11 13:49:50 +0200174! TODO:
175! floating point, serial, model numbers, ip addresses, etc.
176! every other segment must have at least one digit
177
! Omission words like "fu**ing!":
! at least one character, at least two asterisks, then any characters.
define Omission Char+ [ Asterisk Asterisk ] Asterisk* Char*;
180
181
182! TODO: Name words with ' and `
183
Akron310905f2021-08-11 13:49:50 +0200184! Support ASCII elements, like
185! +---------------+
186! <---->, -->, <--
187! +---------------+
188! <---> | Worker Node N |
189! +---------------+
190! |============= Core =============|
191
192
193
! Everything that counts as a single token. Abbreviations are only
! accepted together with their trailing dot.
define RealToken [
  XML | Email | URL | SNS |
  [ Abbr %. ] | Plusampersand |
  Streetname | Omission | Domain | AcronymDep | File |
  Emdash | Punct |
  Ord | Num | Years |
  Emoji | Word
];
Akron310905f2021-08-11 13:49:50 +0200195
! Step 1: append NLout to every RealToken match (left-to-right, longest
! match), then delete all whitespace runs.
echo - Introduce Token splitter
define Token [ RealToken @-> ... NLout ] .o. [ WS+ @-> 0 ];

! Step 2: append a further NLout to every run of ".", "!" and "?" that
! stands in the context "NLout _", i.e. directly after a token marker.
! The result is left on the stack as the final network.
echo - Introduce Sentence splitter
read regex Token .o. [ [ "." | "!" | "?" ]+ @-> ... NLout \/ NLout _ ];
203
! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b

! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
! and anything with > with ~xmle.
! In case this is part of an emoticon ( >:-P ), this needs to be split again.
! The same is true for ( and ).
209! The same is true for ( and )