! src/tokenizer.xfst - KorAP/Datok - Gitiles

 ! This tokenizer is based on work by
 !  - StandardTokenizerImpl by the Lucene project
 !    under the Apache License
 !  - https://github.com/dlwh/epic by David Hall (2014)
 !    under the Apache License
 !  - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
 !    under the Apache License
 !  - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
 !    under the MIT License
 !
 ! The abbreviation list is part of the sentence splitter tool
 ! of the IDS.

 ! Marker that the tokenizer writes after each token (and again after
 ! sentence-final punctuation). A plain line feed is the alternative:
 ! define NLout "\u000a";
 define NLout "@_TOKEN_SYMBOL_@";

 ! End-of-transmission control character (U+0004).
 define EOT "\u0004";

 ! Decimal digits. Every digit is escaped so that none of them is read
 ! as a special symbol (a bare 0 stands for the empty string).
 define Digit [%0|%1|%2|%3|%4|%5|%6|%7|%8|%9];

 ! Lower-case ASCII letters only.
 define AsciiLetter [
   a|b|c|d|e|f|g|h|i|j|k|l|m|
   n|o|p|q|r|s|t|u|v|w|x|y|z
 ];

 ! Hexadecimal digits in either letter case.
 define HexLetter [Digit | [a|b|c|d|e|f] | [A|B|C|D|E|F]];

 !!!!!!!!!!!!!!!!!
 ! <from tmorph> !
 !!!!!!!!!!!!!!!!!
 ! Horizontal whitespace: tab, space, no-break space, Ogham space mark,
 ! the U+2000..U+200A spaces, narrow no-break space, medium mathematical
 ! space and ideographic space.
 define WS [
   "\u0009" | " " | "\u00a0" | "\u1680" |
   "\u2000" | "\u2001" | "\u2002" | "\u2003" | "\u2004" | "\u2005" |
   "\u2006" | "\u2007" | "\u2008" | "\u2009" | "\u200a" |
   "\u202f" | "\u205f" | "\u3000"
 ];

 ! Line breaks: LF, VT, FF, CR, NEL, line separator, paragraph separator.
 ! The end-of-transmission symbol EOT is treated as a line break as well.
 define NL [
   EOT |
   "\u000a" | "\u000b" | "\u000c" | "\u000d" |
   "\u0085" | "\u2028" | "\u2029"
 ];

 ! Sentence-final punctuation: either a single ellipsis character or a
 ! run of full stops, question marks and exclamation marks.
 ! Flagged by the author as differing (presumably from the TRmorph source).
 define SP ["…" | ["!"|"?"|"."]+];

 ! Opening (left) punctuation, including a doubled apostrophe and a
 ! doubled comma used as quotation marks.
 define LP [
   "{"|"["|"("|
   "«"|"‹"|"‘"|"“"|
   "'"|%"|
   ! differs
   ["'" "'"]|
   ! from book
   [%, %,]|
   "_"|"/"|"*" ! Can be Markdown
 ];

 ! Closing (right) punctuation - excluding the characters that can be
 ! used as apostrophe. Contains every string of SP.
 define RP [
   SP|":"|";"|","|
   "}"|"]"|")"|
   "»"|"›"|"”"|
   %"|
   ! differs
   ["'" "'"]|
   ! from book
   [%’ %’]|[%‘ %‘]|
   "_"|"/"|"*" ! Can be Markdown
 ];

 ! Free-standing symbols.
 define Sym [%&|%@|%=|"/"|"*"|">"|"<"|"+"|"-"];

 ! Characters that may act as an apostrophe inside a word.
 define Apos %`|%’|%';

 ! Any punctuation token: left, right or symbol.
 define Punct [Sym|RP|LP];
 !define nonSym \[WS|LP|RP|Sym];
 !!!!!!!!!!!!!!!!!!
 ! </from tmorph> !
 !!!!!!!!!!!!!!!!!!

 ! The plain asterisk.
 define Asterisk "*";

 ! Slash variants: solidus, fullwidth solidus, division slash, fraction slash.
 define Slash ["/"|"／"|"∕"|"⁄"];

 ! Single dash characters: hyphen-minus, non-breaking hyphen, figure dash,
 ! en dash, hyphen with diaeresis, small hyphen-minus, fullwidth hyphen-minus.
 define Dash ["\uff0d"|"\ufe63"|"\u2e1a"|"\u2013"|"\u2012"|"\u2011"|"-"];

 ! Long dashes: a run of em-dash-like characters, or two or more
 ! consecutive ASCII hyphens.
 define Emdash [ ["\ufe58"|"\u2e3b"|"\u2e3a"|"\u2015"|"\u2014"]+ | %- %- [%-]* ];

 ! A word character: any single symbol that is not whitespace, a line
 ! break, a punctuation symbol or an apostrophe.
 define Char \[Apos|Punct|NL|WS]; ! |¨;

 ! A word: one or more word characters, optionally continued by groups
 ! that start with a dash, an apostrophe or an asterisk.
 ! (Loading Word from a lexicon is disabled:)
 ! source lexicon.xfst
 ! define Word;
 define Word Char+ [[Asterisk|Apos|Dash] Char+]*;

 ! Characters allowed inside URLs and e-mail addresses: word characters
 ! plus the symbols, without the angle brackets and the double quote.
 define URLChar [[Sym - [">"|"<"|%"]] | Char];
 !define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];

 ! Optional upper-casing: every listed lower-case letter may either stay
 ! as it is or be rewritten to its capital; the sharp s may become SS.
 define Caseinsensitive [
   a (->) A, b (->) B, c (->) C, d (->) D, e (->) E, f (->) F,
   g (->) G, h (->) H, i (->) I, j (->) J, k (->) K, l (->) L,
   m (->) M, n (->) N, o (->) O, p (->) P, q (->) Q, r (->) R,
   s (->) S, t (->) T, u (->) U, v (->) V, w (->) W, x (->) X,
   y (->) Y, z (->) Z,
   ö (->) Ö, ü (->) Ü, ä (->) Ä,
   ß (->) {SS}
 ];

 ! A single German letter (ASCII letter, umlaut or sharp s) composed
 ! with the optional upper-casing above.
 define Letter [ [ ä | ö | ü | ß | AsciiLetter ] .o. Caseinsensitive ];

 ! Abbreviations and initials: an entry of the abbreviation list or a
 ! single Letter, in both cases followed by a full stop.
 define Abbr [ Letter | @txt"txt/abbrv.txt" ] ".";

 ! Expressions read from the plus/ampersand word list.
 define Plusampersand @txt"txt/plusampersand.txt";

 ! A solution to the "(author): problem" may be to add ) at the end of any
 ! string as a possible ending

 ! A number in round or square brackets, with an optional full stop
 ! before the closing bracket.
 define Years ["[" Digit+ (%.) "]"] | ["(" Digit+ (%.) ")"];

 ! Clock times and durations such as
 !   20:00 Uhr, 00:12:25,34 Minuten
 ! One or two leading digits (the first one at most 5), one or two
 ! colon-separated two-digit groups, and an optional comma fraction of
 ! one to three digits.
 define Times [
   ([%0|%1|%2|%3|%4|%5]) Digit
   [%: [%0|%1|%2|%3|%4|%5] Digit]^{1,2}
   (%, Digit^{1,3})
 ];

 ! Run the external script emoji.xfst, then bind the name Emoji.
 ! NOTE(review): the define has no expression of its own, so it presumably
 ! names whatever network emoji.xfst leaves behind - confirm against that
 ! script. The two statements must stay in this order.
 source emoji.xfst
 define Emoji;

 ! acronyms: U.S.A., I.B.M., etc.
 ! At least two groups of a Letter followed by a full stop.
 ! use a post-filter to remove dots
 define AcronymDep Letter %. [Letter %.]+;

 ! Dot: a literal full stop, or the word "dot" enclosed in a round or
 ! square bracket pair (the opening and closing bracket are chosen
 ! independently, so mixed pairs are accepted as well).
 ! At: the same for the at-sign and the word "at".
 ! NOTE(review): assuming .o. binds weaker than |, the composition with
 ! Caseinsensitive applies to the whole union - confirm.
 define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
 define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;

 ! Top-level domains recognised by Domain: only org, de and com,
 ! composed with Caseinsensitive.
 define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;

 ! Very relaxed URL scheme, not based on the strict Lucene implementation
 ! Starts with http://, https://, ftp:// or file://, or with www plus Dot.
 ! The remainder begins and ends with a URLChar and may contain
 ! sentence punctuation (SP) only in between, so a URL never ends in
 ! a full stop, question mark, exclamation mark or ellipsis.
 define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ]
 URLChar [URLChar|SP]* URLChar
 .o. Caseinsensitive;

 ! A host name: word characters, optionally joined by dashes, then Dot
 ! and one of the top-level domains in TldEnd.
 define Domain Char+ [Dash Char+]* Dot TldEnd;

 ! XML rule
 ! XMLns: a tag or attribute name. It starts with an ASCII letter, goes on
 ! with letters, digits or hyphens, and may carry one colon-separated
 ! second part of the same shape. Composed with Caseinsensitive.
 define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive;
 ! XML: a single tag between < and >, which is either
 !  - an opening tag: a name, any number of whitespace-separated
 !    attributes (a name with an optional = and a value in double or
 !    single quotes; the value is non-empty and contains neither its own
 !    quote character nor >), and an optional / for self-closing tags, or
 !  - a closing tag: / followed by a name.
 ! Whitespace is allowed before the final >.
 ! NOTE(review): .u keeps only the upper side of the relation; since XMLns
 ! is composed with Caseinsensitive, confirm that this is the side that
 ! carries the upper-case variants.
 define XML [
   "<" [
       [
         XMLns
         [WS+ XMLns WS*
           (%= WS*
             [[%" [? - %" - %>]+ %"] | [%' [? - %' - %>]+ %']]
           )
         ]*
         (WS* "/")
       ]
       |
       [
         "/" XMLns
       ]
     ] WS* ">"
 ].u;

 ! XML entities
 ! Run the external script entities.xfst, then bind the name XMLEntities.
 ! NOTE(review): the define has no expression of its own, so it presumably
 ! names the network left behind by entities.xfst - confirm against that
 ! script. The two statements must stay in this order.
 source entities.xfst
 define XMLEntities;


 ! Email addresses
 ! A local part of URLChar symbols, At (the at-sign or its bracketed
 ! spelling), and a host of at least two URLChar groups joined by Dot.
 define Email URLChar+ At URLChar+ [Dot URLChar+]+;

 ! Twitter user, hashtag, Google+
 ! An at-sign, hash or plus sign directly followed by word characters.
 define SNS ["@"|"#"|"+"] Char+;

 ! File-name extensions (without the leading dot) recognised by File,
 ! composed with Caseinsensitive so that capitalised spellings are
 ! intended to be covered as well.
 ! xlsx is accepted next to xls, in line with doc/docx and ppt/pptx:
 ! without it the longest File match on "report.xlsx" ends after "xls"
 ! and leaves a stray "x" behind.
 define FileEnd [
                 [{htm} ("l")]|
                 [{doc} ("x")]|
                 {pdf}|
                 ["j" "p" ("e") "g"]|
                 ["m" "p" ["3"|"4"]]|
                 {ogg}|
                 {png}|
                 [{ppt} ("x")]|
                 {avi}|
                 {txt}|
                 [{xls} ("x")]|
                 {xml}|
                 {aac}|
                 {gif}|
                 {exe}
                 ] .o. Caseinsensitive;

 ! A file name or path ending in a known extension:
 !  - an optional prefix that starts with a drive (letter, colon,
 !    backslash) or a slash and continues with word characters,
 !    underscores, hyphens and word characters followed by a path
 !    separator,
 !  - a base name of word characters, hyphens and underscores,
 !  - a full stop and one of the extensions in FileEnd.
 ! NOTE(review): the drive letter is AsciiLetter, which is lower-case
 ! only, so a prefix such as C:\ is not matched - confirm whether that
 ! is intended.
 define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;

 ! Abbreviated street names: a Word directly followed by str and a full stop.
 define Streetname Word s t r ".";

 ! Separators that may occur between digit groups.
 define DigitPunct [Slash|","|"."|"-"|"_"];

 ! Numbers: digit groups joined by single separators, optionally followed
 ! by word characters. Also supports
 !   19.4.2015, 19/4/2015 etc.
 define Num Digit+ [DigitPunct Digit+]* Char*;

 ! ordinals: one to three digits followed by a full stop
 define Ord Digit^{1,3} ".";

 ! TODO:
 !   floating point, serial, model numbers, ip addresses, etc.
 !   every other segment must have at least one digit

 ! Omission words like "fu**ing!": word characters, at least two
 ! asterisks, and an optional tail of word characters.
 define Omission Char+ Asterisk Asterisk Asterisk* Char*;


 ! TODO: Name words with ' and `

 ! Support ASCII elements, like
 ! +---------------+
 ! <---->, -->, <--
 !       +---------------+
 ! <---> | Worker Node N |
 !       +---------------+
 ! |============= Core =============|


 echo - Compile Real Token

 ! The basic token types, united so that they share one rule in Token.
 define RealToken [Punct|Word|SNS|AcronymDep|Ord|Num|Years|Times];

 echo - Introduce Token splitter

 ! The tokenizer proper. The comma-separated @-> rules form one parallel
 ! replacement: each token type is matched and the marker NLout is
 ! inserted directly after the match, the matched text itself being
 ! kept (that is what the ... stands for).
 ! The two compositions that follow delete (rewrite to 0) every run of
 ! whitespace or line breaks that
 !  - comes directly after a marker, or
 !  - stands at the very beginning of the input (.#.).
 define Token [
   XMLEntities @-> ... NLout,
   Abbr @-> ... NLout,
   RealToken @-> ... NLout,
   XML @-> ... NLout,
   URL @-> ... NLout,
   Email @-> ... NLout,
   File @-> ... NLout,
   Plusampersand @-> ... NLout,
   Domain @-> ... NLout,
   Emoji @-> ... NLout,
   [Streetname|Omission|Emdash] @-> ... NLout
   ]
 .o. [[WS|NL]+ @-> 0 || NLout _ ]
 .o. [[WS|NL]+ @-> 0 || .#. _ ]
 ;

 echo - Introduce Sentence splitter
 ! Compose the tokenizer with a rule that appends one more NLout to
 ! sentence-final punctuation (SP) where it directly follows a marker,
 ! and push the result as the final network.
 read regex Token .o. [SP @-> ... NLout \/ NLout _ ];

 ! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
	! This tokenizer is based on work by
	! - StandardTokenizerImpl by the Lucene project
	! under the Apache License
	! - https://github.com/dlwh/epic by David Hall (2014)
	! under the Apache License
	! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
	! under the Apache License
	! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
	! under the MIT License
	!
	! The abbreviation list is part of the sentence splitter tool
	! of the IDS.

	! Marker written after each token (and again after sentence-final
	! punctuation). A plain line feed is the alternative.
	define NLout "@_TOKEN_SYMBOL_@";
	! define NLout "\u000a";

	! Basic character classes: decimal digits (the zero is escaped because
	! a bare 0 is the empty string), lower-case ASCII letters, hexadecimal
	! digits in either case, and the end-of-transmission character.
	define Digit [%0|1|2|3|4|5|6|7|8|9];
	define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
	define HexLetter [Digit|a|A|b|B|c|C|d|D|e|E|f|F];
	define EOT "\u0004";

	!!!!!!!!!!!!!!!!!
	! <from tmorph> !
	!!!!!!!!!!!!!!!!!
	! Horizontal whitespace: space, tab, no-break space and the Unicode spaces.
	define WS [" "|"\u0009"|"\u00a0"|"\u1680"|
	"\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
	"\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
	"\u202f"|"\u205f"|"\u3000"];

	! Line breaks; EOT is treated as a line break as well.
	define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];

	! Punctuation that ends sentences: a run of full stops, question marks
	! and exclamation marks, or a single ellipsis character.
	! Differs!
	define SP [["."|"?"|"!"]+|"…"];

	! Left punctuation
	define LP ["("|"["|"{"|
	"“"|"‘"|"‹"|"«"|
	"'"|%"|
	! differs
	["'" "'"] |
	"*"|"/"|"_"| ! Can be Markdown
	! from book
	[%, %,]];

	! Right punctuation - excluding the characters that can be used as apostrophe
	define RP [SP|","|";"|":"|
	")"|"]"|"}"|
	"”"|"›"|"»"|
	%"|
	! differs
	["'" "'"]|
	"*"|"/"|"_"| ! Can be Markdown
	! from book
	[%‘ %‘]|[%’ %’]];

	! Free-standing symbols, apostrophe characters, and the union of all
	! punctuation classes.
	define Sym ["-"|"+"|"<"|">"|"*"|"/"|%=|%@|%&];
	define Apos %'|%’|%`;
	define Punct [LP|RP|Sym];
	!define nonSym \[WS|LP|RP|Sym];
	!!!!!!!!!!!!!!!!!!
	! </from tmorph> !
	!!!!!!!!!!!!!!!!!!

	! Long dashes (two or more ASCII hyphens, or a run of em-dash-like
	! characters), single dash characters, slash variants, the asterisk.
	define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
	define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
	define Slash ["⁄"|"∕"|"／"|"/"];
	define Asterisk ["*"];

	! A word character: any single symbol that is not whitespace, a line
	! break, punctuation or an apostrophe. The leading backslash is the
	! term-complement operator and belongs to the expression.
	define Char \[WS|NL|Punct|Apos]; ! |¨;

	! source lexicon.xfst
	! define Word;
	! A word: word characters, optionally continued by groups that start
	! with a dash, an apostrophe or an asterisk.
	define Word Char+ ([Dash|Apos|Asterisk] Char+)*;

	! Characters allowed in URLs and e-mail addresses.
	define URLChar [Char|[Sym - ["<"|">"|%"]]];
	!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];

	! Optional upper-casing: every listed lower-case letter may stay as it
	! is or be rewritten to its capital; the sharp s may become SS.
	define Caseinsensitive [
	a (->) A,
	b (->) B,
	c (->) C,
	d (->) D,
	e (->) E,
	f (->) F,
	g (->) G,
	h (->) H,
	i (->) I,
	j (->) J,
	k (->) K,
	l (->) L,
	m (->) M,
	n (->) N,
	o (->) O,
	p (->) P,
	q (->) Q,
	r (->) R,
	s (->) S,
	t (->) T,
	u (->) U,
	v (->) V,
	w (->) W,
	x (->) X,
	y (->) Y,
	z (->) Z,
	ö (->) Ö,
	ü (->) Ü,
	ä (->) Ä,
	ß (->) {SS}
	];

	! A single German letter composed with the optional upper-casing above.
	define Letter [ [ AsciiLetter | ö | ü | ä | ß ] .o. Caseinsensitive ];

	! Abbreviations and Initials: an entry of the abbreviation list or a
	! single Letter, followed by a full stop.
	define Abbr [ @txt"txt/abbrv.txt" | Letter ] %.;

	! Expressions read from the plus/ampersand word list.
	define Plusampersand @txt"txt/plusampersand.txt";

	! A solution to the "(author): problem" may be to add ) at the end of any
	! string as a possible ending

	! A number in round or square brackets, optionally followed by a full
	! stop before the closing bracket.
	define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];

	! 20:00 Uhr, 00:12:25,34 Minuten
	define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];

	! NOTE(review): the define has no expression of its own, so it
	! presumably names the network left behind by emoji.xfst - confirm.
	source emoji.xfst
	define Emoji;

	! acronyms: U.S.A., I.B.M., etc.
	! use a post-filter to remove dots
	define AcronymDep Letter %. [Letter %.]+;

	! A literal full stop / at-sign, or the word "dot" / "at" enclosed in
	! round or square brackets, composed with Caseinsensitive.
	define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
	define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;

	! Top-level domains recognised by Domain.
	define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;

	! Very relaxed URL scheme, not based on the strict Lucene implementation
	define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ]
	URLChar [URLChar|SP]* URLChar
	.o. Caseinsensitive;

	! A host name ending in Dot and one of the domains in TldEnd.
	define Domain Char+ [Dash Char+]* Dot TldEnd;

	! XML rule
	! XMLns is a tag or attribute name with an optional colon-separated
	! second part. XML is a single opening tag (with optional attributes
	! and optional self-closing slash) or a closing tag.
	define XMLns [AsciiLetter [AsciiLetter|Digit|%-]* (%: AsciiLetter [AsciiLetter|Digit|%-]*)] .o. Caseinsensitive;
	define XML [
	"<" [
	[
	XMLns
	[WS+ XMLns WS*
	(%= WS*
	[[%" [? - %" - %>]+ %"] | [%' [? - %' - %>]+ %']]
	)
	]*
	(WS* "/")
	]
	|
	[
	"/" XMLns
	]
	] WS* ">"
	].u;

	! XML entities
	! NOTE(review): the define has no expression of its own, so it
	! presumably names the network left behind by entities.xfst - confirm.
	source entities.xfst
	define XMLEntities;


	! Email addresses
	define Email URLChar+ At URLChar+ [Dot URLChar+]+;

	! Twitter user, hashtag, Google+
	define SNS ["@"|"#"|"+"] Char+;

	! File-name extensions (without the leading dot) recognised by File.
	! xlsx is accepted next to xls, in line with doc/docx and ppt/pptx.
	define FileEnd [
	[{htm} ("l")]|
	[{doc} ("x")]|
	{pdf}|
	["j" "p" ("e") "g"]|
	["m" "p" ["3"|"4"]]|
	{ogg}|
	{png}|
	[{ppt} ("x")]|
	{avi}|
	{txt}|
	[{xls} ("x")]|
	{xml}|
	{aac}|
	{gif}|
	{exe}
	] .o. Caseinsensitive;

	! A file name or path: optional drive or slash prefix with path
	! segments, a base name, a full stop and an extension from FileEnd.
	! The backslashes written as %\ are escaped literals.
	define File (( AsciiLetter ":" %\ | "/" ) [ Char | "_" | "-" | Char [ %\ | "/" ] ]*) [Char | "-" | "_" ]+ "." FileEnd;

	! Abbreviated street names: a Word directly followed by str and a full stop.
	define Streetname Word {str} %.;

	! Also supports
	! 19.4.2015, 19/4/2015 etc.
	define DigitPunct ["_"|"-"|"."|","|Slash];
	define Num Digit+ [DigitPunct Digit+]* (Char+);

	! ordinals
	define Ord Digit ( Digit (Digit) ) %.;

	! TODO:
	! floating point, serial, model numbers, ip addresses, etc.
	! every other segment must have at least one digit

	! Omission words like "fu**ing!"
	define Omission Char+ Asterisk Asterisk+ Char*;


	! TODO: Name words with ' and `

	! Support ASCII elements, like
	! +---------------+
	! <---->, -->, <--
	! +---------------+
	! <---> | Worker Node N |
	! +---------------+
	! |============= Core =============|


	echo - Compile Real Token

	! The basic token types, united so that they share one rule in Token.
	define RealToken [Punct|Word|SNS|AcronymDep|Ord|Num|Years|Times];

	echo - Introduce Token splitter

	! Parallel @-> rules insert the marker NLout after every match; the two
	! compositions then delete whitespace and line-break runs that follow a
	! marker or stand at the very beginning of the input (.#.).
	define Token [
	XMLEntities @-> ... NLout,
	Abbr @-> ... NLout,
	RealToken @-> ... NLout,
	XML @-> ... NLout,
	URL @-> ... NLout,
	Email @-> ... NLout,
	File @-> ... NLout,
	Plusampersand @-> ... NLout,
	Domain @-> ... NLout,
	Emoji @-> ... NLout,
	[Streetname|Omission|Emdash] @-> ... NLout
	]
	.o. [[WS|NL]+ @-> 0 || NLout _ ]
	.o. [[WS|NL]+ @-> 0 || .#. _ ]
	;

	echo - Introduce Sentence splitter
	! Appends one more NLout to sentence-final punctuation that directly
	! follows a marker. The \/ before the context is the rule's context
	! operator and belongs to the expression.
	read regex Token .o. [[["."|"!"|"?"]+|"…"] @-> ... NLout \/ NLout _ ];

	! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b