blob: 20b07f9b32c2cb2087d10f352484e06e09b37cb1 [file] [log] [blame]
! This tokenizer is based on work by
! - StandardTokenizerImpl by the Lucene project
!   under the Apache License
! - https://github.com/dlwh/epic by David Hall (2014)
!   under the Apache License
! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
!   under the Apache License
! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
!   under the MIT License
!
! The abbreviation list is part of the sentence splitter tool
! of the IDS.
13
! Output boundary marker inserted after every token/sentence: a special
! multi-character symbol that downstream tools replace with a newline ("\u000a").
define NLout "@_TOKEN_SYMBOL_@";
! Input newline: LF optionally preceded by CR, i.e. Unix "\n" or Windows "\r\n".
define NLin ("\u000d") "\u000a";
16
17define Digit [%0|1|2|3|4|5|6|7|8|9];
18
!!!!!!!!!!!!!!!!!!
! <from TRmorph> !
!!!!!!!!!!!!!!!!!!
! Whitespace: ASCII space/tab/LF/CR, no-break space (U+00A0), Ogham space
! mark (U+1680), the fixed-width spaces U+2000..U+200A, line and paragraph
! separators (U+2028/U+2029), narrow no-break space (U+202F), medium
! mathematical space (U+205F), ideographic space (U+3000), and NLin (CRLF-aware).
define WS [" "|"\u0009"|"\u000a"|"\u000d"|
           "\u00a0"|"\u1680"|
           "\u2000"|"\u2001"|"\u2002"|"\u2003"|"\u2004"|"\u2005"|
           "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
           "\u2028"|"\u2029"|"\u202f"|"\u205f"|"\u3000"| NLin];
27
! Punctuation that ends sentences: one or more of . ? ! (so runs like "..."
! or "?!" match as one unit), or the horizontal ellipsis character.
! Differs from TRmorph!
define SP [["."|"?"|"!"]+|"…"]; ! Warning! This results in '...' being a MCS!
! Left (opening) punctuation: brackets, opening typographic quotes, straight
! apostrophe and quote, plus Markdown emphasis characters.
define LP ["("|"["|"{"|
           "“"|"‘"|"‹"|"«"|
           "'"|%"|
           ! differs from TRmorph: doubled straight apostrophe as an opening quote
           ["'" "'"] |
           "*"|"/"|"_"| ! Can be Markdown
           ! from book: doubled comma used as an opening (low) quotation mark
           [%, %,]];
! Right (closing) punctuation - excluding the characters that can be used as
! apostrophe. Includes sentence-final punctuation (SP) and clause separators.
define RP [SP|","|";"|":"|
           ")"|"]"|"}"|
           "”"|"›"|"»"|
           %"|
           ! differs from TRmorph: doubled straight apostrophe as a closing quote
           ["'" "'"]|
           "*"|"/"|"_"| ! Can be Markdown
           ! from book: doubled curly single quotes as closing quotation marks
           [%‘ %‘]|[%’ %’]];
50
! Operator-like symbols that separate tokens.
define Sym ["-"|"+"|"<"|">"|"*"|"/"|"="|"@"];
! Characters that can serve as an apostrophe: straight, curly, and backtick.
define Apos ["'"|"’"|"`"];
! All punctuation classes combined.
define Punct [LP|RP|Sym];
54!define nonSym \[WS|LP|RP|Sym];
!!!!!!!!!!!!!!!!!!!
! </from TRmorph> !
!!!!!!!!!!!!!!!!!!!
58
! Em dash: two or more ASCII hyphens (note (%-) is optional, so (%-)+ also
! matches the empty string and "--" alone is accepted), or one or more
! dash characters (em dash, horizontal bar, two-em/three-em dash, small em dash).
define Emdash [%- %- (%-)+ | ["\u2014"|"\u2015"|"\u2e3a"|"\u2e3b"|"\ufe58"]+];
! Single-character hyphen/dash variants that may join word parts:
! ASCII hyphen-minus plus several Unicode hyphen/dash code points.
define Dash ["-"|"\u2011"|"\u2012"|"\u2013"|"\u2e1a"|"\ufe63"|"\uff0d"];
! Slash variants: fraction slash, division slash, ASCII solidus, and a fourth
! slash-like code point - presumably the fullwidth solidus; verify the encoding.
define Slash ["⁄"|"∕"|"/"|"/"];
! A literal asterisk (reused for Markdown emphasis and omission words).
define Asterisk ["*"];
63
64define Char \[WS|Punct|Apos]; ! |¨;
65
! source lexicon.xfst
! define Word;
! A word: one or more word characters, optionally chained by a single dash,
! apostrophe, or asterisk between character runs (e.g. "don't", "e-mail").
define Word Char+ ([Dash|Apos|Asterisk] Char+)*;
69
70define URLChar [Char|[Sym - ["<"|">"|%"]]];
71!define Alpha ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"];
72
! Parallel optional replacement rules mapping each lowercase letter to its
! uppercase form ((->) is optional replace). Composing a lowercase pattern
! with this relation makes it accept any mixed-case variant; German ß maps
! to the digraph "SS".
define Caseinsensitive [
a (->) A,
b (->) B,
c (->) C,
d (->) D,
e (->) E,
f (->) F,
g (->) G,
h (->) H,
i (->) I,
j (->) J,
k (->) K,
l (->) L,
m (->) M,
n (->) N,
o (->) O,
p (->) P,
q (->) Q,
r (->) R,
s (->) S,
t (->) T,
u (->) U,
v (->) V,
w (->) W,
x (->) X,
y (->) Y,
z (->) Z,
ö (->) Ö,
ü (->) Ü,
ä (->) Ä,
ß (->) {SS}
];
105
106define Abbr @txt"txt/abbrv.txt" .o. Caseinsensitive;
107
! A solution to the "(author): problem" may be to add ) at the end of any
! string as a possible ending

! Digit runs in round or square brackets, with an optional trailing dot,
! e.g. "(1999)", "[200]", "(12.)" - typically years or reference numbers.
define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
112
! Compile the emoji patterns from a separate script; the bodiless define
! then binds the network left on the stack to the name Emoji
! (standard foma behavior for "define Name;" - verify emoji.xfst pushes one network).
source emoji.xfst
define Emoji;
115
! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
! Two or more single word characters, each followed by a dot.
define AcronymDep Char %. [Char %.]+;
119
! A dot, also in obfuscated form: "." or the word "dot" wrapped in a bracket
! or parenthesis, case-insensitive. Note the pair need not match, so
! mixed forms like "(dot]" are also accepted - presumably a deliberate relaxation.
define Dot "."|[["["|"("] "d" "o" "t" [")"|"]"]] .o. Caseinsensitive;
! An at-sign, also in obfuscated form: "@" or bracketed "at", case-insensitive.
define At "@"|[["["|"("] "a" "t" [")"|"]"]] .o. Caseinsensitive;
122
123define TldEnd [{org}|{de}|{com}] .o. Caseinsensitive;
124
! Very relaxed URL scheme, not based on the strict Lucene implementation:
! either an explicit scheme (http/https/ftp/file) followed by "://", or a
! leading "www." (dot possibly obfuscated); then URL characters, allowing
! sentence punctuation inside but requiring a URL character at the end.
define URL [ [ [{http} (s) | {ftp} | {file}] ":" "/" "/"] | [{www} Dot] ]
URLChar [URLChar|SP]* URLChar
.o. Caseinsensitive;
129
130define Domain Char+ [Dash Char+]* Dot TldEnd;
131
!define XML "<" Alpha URLChar* (">");
! XML/HTML tag fragment: "<" plus tag characters, with the closing ">"
! optional (unclosed tags at a line break are combined in a second pass).
define XML "<" URLChar+ (">");
134
!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
! Email address: local part, at-sign (possibly obfuscated as "(at)" etc.),
! and a domain containing at least one (possibly obfuscated) dot.
define Email URLChar+ At URLChar+ [Dot URLChar+]+;
137
! Twitter user, hashtag, Google+
! Social-network handle: @, #, or + sigil followed by word characters.
define SNS ["@"|"#"|"+"] Char+;
140
! File extensions recognized for filename tokens, matched case-insensitively.
define FileEnd [
    [{htm} ("l")]|
    [{doc} ("x")]|
    {pdf}|
    ["j" "p" ("e") "g"]|
    ["m" "p" ["3"|"4"]]|
    {ogg}|
    {png}|
    {avi}|
    {txt}|
    {xls}|
    {xml}|
    {aac}|
    {gif}
    ] .o. Caseinsensitive;
! Filename: word characters or dashes, a literal dot, and a known extension.
define File [Char|"-"]+ "." FileEnd;
157
! Also supports
! 19.4.2015, 19/4/2015 etc.
! Separators allowed between digit groups inside a number.
define DigitPunct ["_"|"-"|"."|","|Slash];
! Number: digit groups joined by separators, with an optional trailing run
! of word characters (e.g. a unit or ordinal suffix attached to the number).
define Num Digit+ [DigitPunct Digit+]* (Char+);
162
163! TODO:
164! floating point, serial, model numbers, ip addresses, etc.
165! every other segment must have at least one digit
166
! Omission words like "fu**ing!": word characters containing a run of two or
! more asterisks (trailing word characters optional).
define Omission Char+ Asterisk Asterisk+ Char*;
169
170
171! TODO: Name words with ' and `
172
173! TODO:
174! FNAME = (({LETTER}:[\\/])?|\/)?({LETTER}+|[\\_/-])+\.{EXTENSION}
175
176
177! Support ASCII elements, like
178! +---------------+
179! <---->, -->, <--
180! +---------------+
181! <---> | Worker Node N |
182! +---------------+
183! |============= Core =============|
184
185
186
187define RealToken [XML|Email|URL|SNS|[Abbr %.]|Omission|Domain|AcronymDep|File|Emdash|Punct|Num|Years|Emoji|Word];
188
echo - Introduce Token splitter
! Insert the boundary symbol after every token (leftmost-longest match via
! @-> ...), then delete all whitespace runs from the output.
define Token [RealToken @-> ... NLout]
.o. [WS+ @-> 0]
;
193
echo - Introduce Sentence splitter
! Compose the tokenizer with a sentence splitter: insert an additional
! boundary after sentence-final punctuation, but only when the punctuation
! is already preceded by a token boundary (i.e. it is a token of its own).
read regex Token .o. [[["."|"!"|"?"]+] @-> ... NLout \/ NLout _];
196
197! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
198
! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
! and anything with > with ~xmle.
! In case this is part of an emoticon ( >:-P ), this needs to be split again.
! The same is true for ( and ).