blob: 57776667c29dd1f4d78e358f0f6ae3cf9406b662 [file] [log] [blame]
Akron310905f2021-08-11 13:49:50 +02001! This tokenizer is based on work by
2! - StandardTokenizerImpl by the Lucene project
3! under the Apache License
4! - https://github.com/dlwh/epic by David Hall (2014)
! under the Apache License
6! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
7! under the Apache License
8! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
9! under the MIT License
10!
11! The abbreviation list is part of the sentence splitter tool
12! of the IDS.
13
! Delimiter symbol written after every token; the sentence splitter at the
! end of this script inserts it a second time after sentence-final punctuation.
! To emit plain line feeds instead, use:
! define NLout "\u000a";
define NLout "@_TOKEN_SYMBOL_@";
! A line break in the input: a line feed with an optional leading carriage return.
define NLin [ ("\u000d") "\u000a" ];
17
! Any single ASCII decimal digit. The zero is escaped because a bare 0
! denotes the empty string in xfst regular expressions.
define Digit [ %0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ];
19
20!!!!!!!!!!!!!!!!!
21! <from tmorph> !
22!!!!!!!!!!!!!!!!!
! White space: blank, tab, line feed, carriage return, the Unicode space and
! line/paragraph separator code points, plus the NLin line-break sequence.
define WS [
    " " | "\u0009" | "\u000a" | "\u000d"
  | "\u00a0" | "\u1680"
  | "\u2000" | "\u2001" | "\u2002" | "\u2003" | "\u2004" | "\u2005"
  | "\u2006" | "\u2007" | "\u2008" | "\u2009" | "\u200a"
  | "\u2028" | "\u2029" | "\u202f" | "\u205f" | "\u3000"
  | NLin
];
28
! Punctuation that ends sentences: a run of ".", "?" and "!", or a single "…".
! Differs!
! Warning! Because of the "+", this results in '...' being a MCS!
define SP [ "…" | [ "!" | "?" | "." ]+ ];
! Left punctuation: opening brackets and quotes, doubled quote forms and
! characters that can be Markdown markup.
define LP [
    "(" | "[" | "{"
  | "“" | "‘" | "‹" | "«"
  | "'" | %"
  ! differs
  | [ "'" "'" ]
  ! can be Markdown
  | "*" | "/" | "_"
  ! from book
  | [ %, %, ]
];
! Right punctuation - excluding the characters that can be used as apostrophe.
! Covers sentence-final punctuation (SP), clause punctuation, closing brackets
! and quotes, doubled quote forms and characters that can be Markdown markup.
define RP [
    SP | "," | ";" | ":"
  | ")" | "]" | "}"
  | "”" | "›" | "»"
  | %"
  ! differs
  | [ "'" "'" ]
  ! can be Markdown
  | "*" | "/" | "_"
  ! from book
  | [ %‘ %‘ ] | [ %’ %’ ]
];
51
! Symbol characters (operators, angle brackets, "=" and "@").
define Sym [ "-" | "+" | "<" | ">" | "*" | "/" | %= | %@ ];
! Characters that can be used as an apostrophe.
define Apos [ %' | %’ | %` ];
! All punctuation classes combined.
define Punct [ LP | RP | Sym ];
55!define nonSym \[WS|LP|RP|Sym];
56!!!!!!!!!!!!!!!!!!
57! </from tmorph> !
58!!!!!!!!!!!!!!!!!!
59
! Em dash: two or more ASCII hyphens, or a run of the long-dash code points
! U+2014, U+2015, U+2E3A, U+2E3B and U+FE58.
define Emdash [
    [ "\u2014" | "\u2015" | "\u2e3a" | "\u2e3b" | "\ufe58" ]+
  | %- %- (%-)+
];
! Hyphen-like characters; used below to join the parts of words and domains.
define Dash [
  "-" | "\u2011" | "\u2012" | "\u2013" | "\u2e1a" | "\ufe63" | "\uff0d"
];
! Slash-like characters: FRACTION SLASH (U+2044), DIVISION SLASH (U+2215),
! FULLWIDTH SOLIDUS (U+FF0F) and the ASCII slash.
! NOTE(review): the previous definition listed the ASCII "/" twice, which made
! one alternative redundant. The duplicate is replaced by U+FF0F on the
! assumption that the fullwidth form was lost to character normalisation
! (Dash likewise covers the fullwidth U+FF0D) - confirm against the upstream file.
define Slash ["⁄"|"∕"|"\uff0f"|"/"];
! The literal asterisk character.
define Asterisk %*;
64
! A word character: any single symbol that is not white space, punctuation
! or an apostrophe.
define Char \[ Apos | Punct | WS ]; ! |¨;
66
67! source lexicon.xfst
68! define Word;
! A word: runs of word characters, joined by single dashes, apostrophes
! or asterisks.
define Word [ Char+ [ [ Asterisk | Apos | Dash ] Char+ ]* ];
70
! Characters allowed inside URLs, e-mail addresses and XML tags: word
! characters plus the symbols other than "<", ">" and the double quote.
define URLChar [ [ Sym - [ %" | ">" | "<" ] ] | Char ];
72!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
73
! Optional, parallel rewrite of each lower-case letter (a-z, ö, ü, ä) to its
! upper-case form and of ß to SS. Patterns written in lower case are composed
! with this network so that their capitalised variants are covered as well.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ö (->) Ö, ü (->) Ü, ä (->) Ä,
  ß (->) {SS}
];
106
! Abbreviations read as a word list from txt/abbrv.txt, composed with the
! optional case mapping.
define Abbr [ @txt"txt/abbrv.txt" ] .o. Caseinsensitive;
108
109! A solution to the "(author): problem" may be to add ) at the end of any
110! string as a possible ending
111
! A digit sequence with an optional trailing period, enclosed in square or
! round brackets, e.g. "[1999]" or "(1999.)".
define Years [ "[" Digit+ (".") "]" ] | [ "(" Digit+ (".") ")" ];
113
! Run the emoji.xfst script, then bind the name Emoji with a bare "define".
! NOTE(review): presumably emoji.xfst leaves the emoji network on top of the
! stack for this bare define to pick up - confirm against emoji.xfst.
source emoji.xfst
define Emoji;
116
117! acronyms: U.S.A., I.B.M., etc.
118! use a post-filter to remove dots
! Two or more single word characters, each followed by a period.
define AcronymDep [ Char %. ] [ Char %. ]+;
120
! A dot, either literal or spelled out as "dot" in round or square brackets
! (any letter case), e.g. "[dot]".
define Dot [ [ [ "(" | "[" ] {dot} [ "]" | ")" ] ] | "." ] .o. Caseinsensitive;
! An at sign, either literal or spelled out as "at" in round or square
! brackets (any letter case), e.g. "(at)".
define At [ [ [ "(" | "[" ] {at} [ "]" | ")" ] ] | "@" ] .o. Caseinsensitive;
123
! Recognised top-level domains, in any letter case.
define TldEnd [ {com} | {de} | {org} ] .o. Caseinsensitive;
125
! Very relaxed URL scheme, not based on the strict Lucene implementation.
! A URL starts with "http://", "https://", "ftp://", "file://" or "www" plus
! a dot (any letter case), followed by at least two URL characters; sentence
! punctuation may occur inside but not at either end of that remainder.
define URL [
  [ [ [ {file} | {ftp} | {http} (s) ] ":" "/" "/" ] | [ {www} Dot ] ]
  URLChar [ SP | URLChar ]* URLChar
] .o. Caseinsensitive;
130
! A bare domain name: word-character runs joined by dashes, then a dot and a
! recognised top-level domain.
define Domain [ Char+ [ Dash Char+ ]* ] Dot TldEnd;
132
133!define XML "<" Alpha URLChar* (">");
! The start of an XML tag: "<" followed by URL characters, with an optional
! closing ">".
define XML [ "<" URLChar+ (">") ];
135
136!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
! An e-mail address: local part, at sign, then a host with at least one
! dot-separated suffix.
define Email [ URLChar+ At URLChar+ [ Dot URLChar+ ]+ ];
138
139! Twitter user, hashtag, Google+
! A social-network handle or tag: "@", "#" or "+" followed by word characters.
define SNS [ "+" | "#" | "@" ] Char+;
141
! Recognised file name extensions, in any letter case.
define FileEnd [
    {aac} | {avi} | {gif} | {ogg} | {pdf} | {png} | {txt} | {xls} | {xml}
  | [ {doc} ("x") ]
  | [ {htm} ("l") ]
  | [ {jp} ("e") "g" ]
  | [ {mp} [ "3" | "4" ] ]
] .o. Caseinsensitive;
! A file name: word characters and ASCII hyphens, a period and a recognised
! extension.
define File [ "-" | Char ]+ "." FileEnd;
158
! A word immediately followed by the abbreviation "str.", e.g. "Hauptstr.".
define Streetname [ Word {str} %. ];
Akron4af79f12021-08-11 14:48:17 +0200160
Akron310905f2021-08-11 13:49:50 +0200161! Also supports
162! 19.4.2015, 19/4/2015 etc.
! Separators allowed between the digit groups of a number.
define DigitPunct [ Slash | "," | "." | "-" | "_" ];
! A number: digit groups joined by single separators, optionally followed
! directly by word characters.
define Num [ Digit+ [ DigitPunct Digit+ ]* (Char+) ];
165
Akrona0bded52021-08-11 15:48:02 +0200166! ordinals
! An ordinal: one to three digits followed by a period.
define Ord [ Digit | Digit Digit | Digit Digit Digit ] %.;
168
Akron310905f2021-08-11 13:49:50 +0200169! TODO:
170! floating point, serial, model numbers, ip addresses, etc.
171! every other segment must have at least one digit
172
173! Omission words like "fu**ing!"
! A word with two or more consecutive asterisks after its first characters;
! further word characters may follow the asterisks.
define Omission [ Char+ [ Asterisk Asterisk+ ] Char* ];
175
176
177! TODO: Name words with ' and `
178
179! TODO:
180! FNAME = (({LETTER}:[\\/])?|\/)?({LETTER}+|[\\_/-])+\.{EXTENSION}
181
182
183! Support ASCII elements, like
184! +---------------+
185! <---->, -->, <--
186! +---------------+
187! <---> | Worker Node N |
188! +---------------+
189! |============= Core =============|
190
191
192
! Union of everything that counts as a single token. An abbreviation is only
! recognised together with its trailing period.
define RealToken [
    XML | Email | URL | SNS
  | [ Abbr %. ]
  | Streetname | Omission | Domain | AcronymDep | File
  | Emdash | Punct
  | Ord | Num | Years
  | Emoji
  | Word
];
Akron310905f2021-08-11 13:49:50 +0200194
echo - Introduce Token splitter
! Left-to-right, longest-match: append NLout to every RealToken, then delete
! all runs of white space from the result.
define Token [ RealToken @-> ... NLout ] .o. [ WS+ @-> 0 ];
199
echo - Introduce Sentence splitter
! Final network: tokenize, then insert a second NLout after sentence-final
! punctuation. The left context NLout (checked on the output side via \/)
! restricts this to punctuation that starts a token of its own, so periods
! inside a longer token do not end a sentence.
! The match uses SP rather than a hard-coded [.!?]+ so that the single
! character "…" ends a sentence just like the ASCII "..." does.
read regex Token .o. [ SP @-> ... NLout \/ NLout _ ];
202
203! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
204
205! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
206! and anything with > with ~xmle.
! In case this is part of an emoticon ( >:-P ), this needs to be split again.
208! The same is true for ( and )