! blob: 4193c289edc793818eef20a186435992668d1e30 [file] [log] [blame]
! This tokenizer is based on work by
! - StandardTokenizerImpl by the Lucene project
! under the Apache License
! - https://github.com/dlwh/epic by David Hall (2014)
! under the Apache License
! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
! under the Apache License
! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
! under the MIT License
!
! The abbreviation list is part of the sentence splitter tool
! of the IDS.
! Marker symbol that the replacement rules further down append after a token.
! Alternative kept for reference: a plain line feed.
! define NLout "\u000a";
define NLout "@_TOKEN_SYMBOL_@";

! Decimal digits. The zero is escaped because a bare 0 denotes epsilon.
define Digit [
  %0 | 1 | 2 | 3 | 4
  | 5 | 6 | 7 | 8 | 9
];

! Lowercase ASCII letters only.
define AsciiLetter [
  a | b | c | d | e | f | g | h | i | j | k | l | m
  | n | o | p | q | r | s | t | u | v | w | x | y | z
];

! Hexadecimal digits in either letter case.
define HexLetter [ Digit | [ a | b | c | d | e | f ] | [ A | B | C | D | E | F ] ];

! End-of-transmission control character (U+0004).
define EOT "\u0004";
!!!!!!!!!!!!!!!!!
! <from tmorph> !
!!!!!!!!!!!!!!!!!
! Horizontal whitespace: ASCII space and tab, no-break space, Ogham space
! mark, the U+2000..U+200A spaces, narrow no-break space, medium mathematical
! space and ideographic space.
define WS [
  " " | "\u0009" | "\u00a0" | "\u1680"
  | "\u2000" | "\u2001" | "\u2002" | "\u2003" | "\u2004" | "\u2005"
  | "\u2006" | "\u2007" | "\u2008" | "\u2009" | "\u200a"
  | "\u202f" | "\u205f" | "\u3000"
];

! Line breaks: LF, VT, FF, CR, NEL, line separator and paragraph separator,
! plus the EOT marker.
define NL [
  "\u000a" | "\u000b" | "\u000c" | "\u000d"
  | "\u0085" | "\u2028" | "\u2029"
  | EOT
];
! Punctuation that ends sentences: a run of one or more full stops, question
! marks and exclamation marks in any mix, or one horizontal ellipsis character.
! Marked as differing (presumably from the TRmorph original).
define SP [ "…" | [ "." | "?" | "!" ]+ ];
! Left (opening) punctuation: brackets, typographic and ASCII quotes, a
! doubled apostrophe, characters that may be Markdown, and a doubled comma.
define LP [
  "(" | "[" | "{"
  | "“" | "‘" | "‹" | "«"
  | "'" | %"
  ! differs
  | [ "'" "'" ]
  ! Can be Markdown
  | "*" | "/" | "_"
  ! from book
  | [ %, %, ]
];
! Right (closing) punctuation - excluding the characters that can be used as
! apostrophe. Covers sentence-final punctuation, clause punctuation, closing
! brackets, closing quotes (single characters and doubled forms) and
! characters that may be Markdown.
define RP [
  SP
  | "," | ";" | ":"
  | ")" | "]" | "}"
  | "”" | "›" | "»" | %"
  | [ %’ %’ ] | [ "'" "'" ] | [ %‘ %‘ ]
  ! Can be Markdown
  | "*" | "/" | "_"
];
! Symbol characters.
define Sym [
  "-" | "+" | "<" | ">" | "*" | "/"
  | %= | %@ | %&
];

! Characters that can serve as an apostrophe.
define Apos [ %' | %’ | %` ];

! Everything treated as punctuation: symbols plus left and right punctuation.
define Punct [ Sym | LP | RP ];
!define nonSym \[WS|LP|RP|Sym];
!!!!!!!!!!!!!!!!!!
! </from tmorph> !
!!!!!!!!!!!!!!!!!!
! Long dashes: a run of dedicated long-dash characters, or two or more ASCII
! hyphens in a row.
define Emdash [
  [ "\u2014" | "\u2015" | "\u2e3a" | "\u2e3b" | "\ufe58" ]+
  | %- %- [ %- ]*
];

! Single hyphen-like characters.
define Dash [
  "-" | "\u2011" | "\u2012" | "\u2013"
  | "\u2e1a" | "\ufe63" | "\uff0d"
];
! Slash variants: fraction slash (U+2044), division slash (U+2215),
! fullwidth solidus (U+FF0F) and the ASCII slash.
! The ASCII slash used to be listed twice. The redundant slot now holds the
! fullwidth form, mirroring the fullwidth hyphen (U+FF0D) accepted by Dash.
define Slash ["⁄"|"∕"|"\uff0f"|"/"];
! The asterisk on its own, used inside words and omissions.
define Asterisk "*";

! A word character: any single symbol that is not an apostrophe, punctuation,
! a line break or whitespace.
! Leftover from the original line: |¨
define Char \[ Apos | Punct | NL | WS ];
! Basic word: one or more word characters, optionally continued by groups of
! an apostrophe or asterisk followed by more word characters, and optionally
! closed by s or S plus a trailing %’ or %` mark.
define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
! Entries read from an external word list file.
define Plusampersand @txt"txt/plusampersand.txt";
! Redefinition of Word: the Word on the right-hand side presumably refers to
! the basic definition above (TODO confirm). The result chains list entries
! and basic words with Dash characters in between.
define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
! Characters allowed inside URLs and e-mail addresses: word characters plus
! the symbol characters, minus angle brackets and the double quote.
define URLChar [
  [ Sym - [ %" | ">" | "<" ] ]
  | Char
];
!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
! Optional uppercasing, written as one parallel set of optional replacements:
! every lowercase ASCII letter and German umlaut may become its capital, and
! sharp s may become SS.
define Caseinsensitive [
  a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
  g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
  m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
  s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
  y (->) Y, z (->) Z,
  ä (->) Ä, ö (->) Ö, ü (->) Ü,
  ß (->) {SS}
];
! Single letters (ASCII plus German umlauts and sharp s) composed with the
! optional uppercasing rules.
define Letter [
  [ ä | ö | ü | ß | AsciiLetter ]
  .o. Caseinsensitive
];

! Abbreviations and Initials: an entry of the abbreviation list or a single
! letter, followed by a full stop.
define Abbr [ Letter | @txt"txt/abbrv.txt" ] %.;
! A solution to the "(author): problem" may be to add ) at the end of any
! string as a possible ending
! Years: digits with an optional full stop, wrapped in round or square brackets.
define Years [
  "(" Digit+ (".") ")"
  | "[" Digit+ (".") "]"
];

! 20:00 Uhr, 00:12:25,34 Minuten
! One or two leading digits, then one or two colon-separated two-digit groups
! whose first digit is 0-5, then an optional comma with one to three digits.
define Times [
  ( [ %0 | 1 | 2 | 3 | 4 | 5 ] ) Digit
  [ ":" [ %0 | 1 | 2 | 3 | 4 | 5 ] Digit ]^{1,2}
  ( "," Digit^{1,3} )
];
! Run the external emoji script. The bare define below carries no regular
! expression of its own - presumably it names the network that emoji.xfst
! leaves on the stack (TODO confirm against the foma manual).
source emoji.xfst
define Emoji;
! acronyms: U.S.A., I.B.M., etc. - at least three letter-plus-dot groups
! use a post-filter to remove dots
define AcronymDep [ Letter %. ] [ Letter %. ] [ Letter %. ]+;

! A literal dot, or the word dot in round or square brackets, with optional
! uppercasing of the letters.
define Dot [
  "."
  | [ "(" | "[" ] {dot} [ "]" | ")" ]
] .o. Caseinsensitive;

! A literal at sign, or the word at in round or square brackets, with
! optional uppercasing of the letters.
define At [
  "@"
  | [ "(" | "[" ] {at} [ "]" | ")" ]
] .o. Caseinsensitive;

! Top-level domains recognised by Domain, with optional uppercasing.
define TldEnd [ {com} | {de} | {org} ] .o. Caseinsensitive;
! Very relaxed URL scheme, not based on the strict Lucene implementation.
! Starts with a scheme and colon-slash-slash or with www and a dot, then at
! least two URL characters. Sentence punctuation is allowed in the middle but
! not as the last character.
define URL [
  [ [ {http} (s) | {ftp} | {file} ] ":" "/" "/" | {www} Dot ]
  URLChar [ SP | URLChar ]* URLChar
] .o. Caseinsensitive;

! Bare domain names: word-character runs joined by Dash, then a dot and one
! of the known top-level domains.
define Domain [ Char+ [ Dash Char+ ]* Dot TldEnd ];
! XML sources
! Each bare define below carries no regular expression of its own -
! presumably it names the network that the script sourced just before it
! leaves on the stack (TODO confirm against the foma manual).
source xml.xfst
define XML;
! XML entities
source entities.xfst
define XMLEntities;
! Email addresses: local part, an At form, then a host with at least one
! Dot-separated suffix.
define Email [
  URLChar+ At
  URLChar+ [ Dot URLChar+ ]+
];

! Twitter user, hashtag, Google+
define SNS [ "#" | "@" | "+" ] Char+;
! Known file name extensions, each letter optionally uppercased.
define FileEnd [
  {aac} | {avi} | {doc} ("x") | {exe} | {gif}
  | {htm} ("l") | "j" "p" ("e") "g" | "m" "p" [ "3" | "4" ]
  | {ogg} | {pdf} | {png} | {ppt} ("x")
  | {txt} | {xls} | {xml}
] .o. Caseinsensitive;
! File names and paths. An optional leading part consists of an optional root
! (a letter, colon and backslash, or a single slash) followed by any run of
! word characters, underscores, hyphens and word characters with a trailing
! path separator. Then comes a base name of word characters, hyphens and
! underscores, a dot, and one of the FileEnd extensions.
! NOTE(review): the root letter uses AsciiLetter, which lists lowercase
! letters only - confirm whether uppercase drive letters should match too.
define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;
! Abbreviated street names: a word directly followed by str and a full stop.
define Streetname [ Word {str} %. ];

! Also supports
! 19.4.2015, 19/4/2015 etc.
! Separators allowed between digit groups.
define DigitPunct [ Slash | "." | "," | "-" | "_" ];

! Numbers: digit groups joined by separators, optionally followed directly by
! word characters.
define Num [ Digit+ [ DigitPunct Digit+ ]* Char* ];

! ordinals: one to three digits and a full stop
define Ord [ Digit^{1,3} %. ];
! TODO:
! floating point, serial, model numbers, ip addresses, etc.
! every other segment must have at least one digit
! Omission words like "fu**ing!": word characters, at least two asterisks,
! then any further word characters.
define Omission [ Char+ Asterisk+ Asterisk Char* ];
! TODO: Name words with ' and `
! Support ASCII elements, like
! +---------------+
! <---->, -->, <--
! +---------------+
! <---> | Worker Node N |
! +---------------+
! |============= Core =============|
echo - Compile Real Token
! Union of all token shapes that are handled by a single replacement rule.
define RealToken [
  Word | Abbr | AcronymDep | Streetname | Omission
  | Num | Ord | Years | Times
  | Punct | Emdash
  | SNS | XMLEntities
];
echo - Introduce Token splitter
! Comma-separated rules forming one parallel replacement. Each rule takes the
! leftmost-longest match of its pattern (@->), keeps the match (...) and
! appends NLout directly after it.
define Token [
RealToken @-> ... NLout,
XML @-> ... NLout,
URL @-> ... NLout,
Email @-> ... NLout,
File @-> ... NLout,
Domain @-> ... NLout,
Emoji @-> ... NLout
];
echo - Introduce Sentence splitter
! And compose Whitespace ignorance
! The final network is Token composed with three rule stages:
! 1. Sentence punctuation, its NLout and a closing quote get an extra NLout
!    appended when an NLout and then any symbol other than a comma follow.
! 2. Sentence punctuation standing between two NLout symbols gets an extra
!    NLout appended, unless the symbol after the following NLout is one of
!    the closing quotes handled in stage 1.
! 3. Runs of whitespace and line breaks are deleted at the start of the
!    string and directly after an NLout.
! The \/ operator in stages 1 and 2 presumably matches both contexts on the
! output side of the rule - TODO confirm against the foma manual.
read regex Token .o. [
SP NLout ["”"|"›"|"»"|%"|%’|"'"] @-> ... NLout \/ _ NLout \%,
] .o. [
SP @-> ... NLout \/ NLout _ NLout [? - "”" - "›" - "»" - %" - %’ - "'"]
] .o. [
[WS|NL]+ @-> 0 || [ .#. | NLout ] _
];
! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b