! blob: 5c2e3c2773f0cf29f24dc927a21e4fbcae5cfded
! This tokenizer is based on work by
! - StandardTokenizerImpl by the Lucene project
!   under the Apache License
! - https://github.com/dlwh/epic by David Hall (2014)
!   under the Apache License
! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
!   under the Apache License
! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
!   under the MIT License
! NLout is a single multi-character symbol. Its name and value suggest it is
! the marker written at token boundaries.
! NOTE(review): confirm against the rules that insert it.
! The commented-out line is the alternative definition using a plain
! line feed (U+000A) instead.
define NLout "@_TOKEN_BOUND_@";
! define NLout "\u000a";
! Digit: the ten ASCII decimal digits.
! The zero is written %0 because an unescaped 0 denotes the empty string
! (epsilon) in xfst/foma regular expressions.
define Digit [%0|1|2|3|4|5|6|7|8|9];
! AsciiLetter: the 26 lowercase ASCII letters a-z (uppercase is AsciiLetterCap).
define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
! AsciiLetterCap: the 26 uppercase ASCII letters A-Z.
define AsciiLetterCap [A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z];
! HexLetter: one hexadecimal digit - any Digit, or a-f in either case.
define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
! EOT: the control character U+0004 (END OF TRANSMISSION).
! It is also listed as one of the alternatives of NL.
define EOT "\u0004";
!!!!!!!!!!!!!!!!!
! <from tmorph> !
!!!!!!!!!!!!!!!!!
! WS: a single horizontal whitespace character - space, tab (U+0009),
! no-break space (U+00A0), Ogham space mark (U+1680), the spaces
! U+2000..U+200A, narrow no-break space (U+202F), medium mathematical
! space (U+205F) and ideographic space (U+3000).
! Line-breaking characters are not included here; they are listed in NL.
define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
           "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
           "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
           "\u202f"|"\u205f"|"\u3000"];
! NL: a single line-breaking character - line feed (U+000A), vertical tab
! (U+000B), form feed (U+000C), carriage return (U+000D), next line
! (U+0085), line separator (U+2028), paragraph separator (U+2029) -
! or the EOT control character.
define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
! Punctuation that ends sentences: either a run of one or more ".", "?"
! or "!" in any mix (so "?!" and "..." are each one SP), or the single
! ellipsis character.
! Differs! (NOTE(review): presumably from the tmorph original - confirm)
define SP [["."|"?"|"!"]+|"…"];
! Left punctuation: opening brackets, opening quotation marks, the straight
! quotes ' and ", the Markdown characters, and two two-character sequences
! (two apostrophes, two commas).
define LP ["("|"["|"{"|
           "“"|"‘"|"‹"|"«"|
           "'"|%"|
           ! differs
           ["'" "'"] |
           "*"|"/"|"_"| ! Can be Markdown
           ! from book
           [%, %,]];
! Right punctuation - excluding the characters that can be used as apostrophe.
! Alternatives: SP, the clause marks , ; :, closing brackets, quotation
! marks, the doubled sequences ’’ '' ‘‘, and the Markdown characters.
! NOTE(review): the two "" alternatives in front of "»" do not name any
! character. They look like quotation marks lost when this file was
! extracted (LP lists opening quotation marks at the matching position).
! They are left unchanged here; restore the intended characters from the
! upstream file before compiling.
define RP [SP|","|";"|":"|
           ")"|"]"|"}"|
           ""|""|"»"|%"|[%’ %’]|["'" "'"]|[%‘ %‘]|
           "*"|"/"|"_"]; ! Can be Markdown
! DQuotes: a single double quotation mark - right curly quote, straight
! double quote (escaped as %"), or either guillemet.
define DQuotes ["”"|%"|"»"|"«"];
! Sym: a single symbol character - one of - + < > * / = @ &.
! The last three are written with the % escape instead of quotes.
define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
! Apos: a single apostrophe-like character - straight apostrophe,
! right single quotation mark, or backtick (each %-escaped).
define Apos %'|%’|%`;
! Punct: anything matched by LP, RP or Sym.
! The commented-out nonSym below is a disabled definition kept for reference.
define Punct [LP|RP|Sym];
!define nonSym \[WS|LP|RP|Sym];
!!!!!!!!!!!!!!!!!!
! </from tmorph> !
!!!!!!!!!!!!!!!!!!
! Emdash: either two or more consecutive ASCII hyphens ("--", "---", ...),
! or a run of one or more of the long-dash characters U+2014, U+2015,
! U+2E3A, U+2E3B and U+FE58.
define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
! Dash: a single short dash - ASCII hyphen-minus or one of U+2011, U+2012,
! U+2013, U+2E1A, U+FE63, U+FF0D.
define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
! Alldash: any single dash character - every character listed in Dash
! followed by the long-dash characters listed in Emdash.
! The list is spelled out here, so it must be kept in sync by hand.
define Alldash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"|"\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"];
! Slash: slash characters; only the ASCII "/" is legible in this copy.
! NOTE(review): the three "" alternatives do not name any character. They
! look like non-ASCII slash variants lost when this file was extracted.
! They are left unchanged here; restore the intended characters from the
! upstream file before compiling.
define Slash [""|""|""|"/"];
! Asterisk: the single character "*".
define Asterisk ["*"];
! Char: any single symbol that is not whitespace (WS), a line break (NL),
! punctuation (Punct) or an apostrophe (Apos). The backslash is the
! term-complement operator.
define Char \[WS|NL|Punct|Apos]; ! |¨;
!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
! SentenceEnd: sentence-final punctuation followed by closing material.
! Shape: SP, then NLout, then exactly one of
!   - a DQuotes, optionally followed by NLout ")"
!   - one of the single closing marks in the bracketed list, optionally
!     followed by either NLout DQuotes (optionally NLout ")") or NLout ")"
!   - ")"
! and finally an optional NLout SP.
! NOTE(review): the first alternative of the bracketed list is "", which
! does not name any character and looks like a character lost when this
! file was extracted. NotSentenceExtension excludes "›" at the
! corresponding position, which may be a hint. It is left unchanged here;
! restore it from the upstream file before compiling.
define SentenceEnd SP NLout [
    DQuotes (NLout ")") |
    [""|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) |
    ")"
  ] (NLout SP);
! NotSentenceExtension: any single symbol except the quotation marks
! ” " » « › ‹ ’ ', the closing parenthesis, and NLout.
define NotSentenceExtension [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout];