Blame - src/tokenizer.xfst - KorAP/Datok

blob: 56917ea37370225287002d7089e77063a340ab31 [file] [log] [blame]

Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	1	! This tokenizer is based on work by
				2	! - StandardTokenizerImpl by the Lucene project
				3	! under the Apache License
				4	! - https://github.com/dlwh/epic by David Hall (2014)
				5	! under the Apache License
				6	! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
				7	! under the Apache License
				8	! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
				9	! under the MIT License
				10	!
				11	! The abbreviation list is part of the sentence splitter tool
				12	! of the IDS.
				13
Akron	4af79f1	2021-08-11 14:48:17 +0200	[diff] [blame]	14	! define NLout "\u000a";
				15	define NLout "@_TOKEN_SYMBOL_@";
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	16	define NLin ("\u000d") "\u000a";
				17
				18	define Digit [%0\|1\|2\|3\|4\|5\|6\|7\|8\|9];
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	19	define AsciiLetter [a\|b\|c\|d\|e\|f\|g\|h\|i\|j\|k\|l\|m\|n\|o\|p\|q\|r\|s\|t\|u\|v\|w\|x\|y\|z];
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	20
				21	!!!!!!!!!!!!!!!!!
				22	! <from tmorph> !
				23	!!!!!!!!!!!!!!!!!
				24	define WS [" "\|"\u0009"\|"\u000a"\|"\u000d"\|
				25	"\u00a0"\|"\u1680"\|
				26	"\u2000"\|"\u2001"\|"\u2002"\|"\u2003"\|"\u2004"\|"\u2005"\|
				27	"\u2006"\|"\u2007"\|"\u2008"\|"\u2009"\|"\u200a"\|
				28	"\u2028"\|"\u2029"\|"\u202f"\|"\u205f"\|"\u3000"\| NLin];
				29
				30	! Punctuation that ends sentences
				31	! Differs (presumably from the TRmorph original - verify)!
				32	define SP [["."\|"?"\|"!"]+\|"…"]; ! Warning! This results in '...' being a MCS!
				33	! Left punctuation
				34	define LP ["("\|"["\|"{"\|
				35	"“"\|"‘"\|"‹"\|"«"\|
				36	"'"\|%"\|
				37	! differs
				38	["'" "'"] \|
				39	"*"\|"/"\|"_"\| ! Can be Markdown
				40	! from book
				41	[%, %,]];
				42	! Right punctuation - excluding the characters that can be used as apostrophe
				43	define RP [SP\|","\|";"\|":"\|
				44	")"\|"]"\|"}"\|
				45	"”"\|"›"\|"»"\|
				46	%"\|
				47	! differs
				48	["'" "'"]\|
				49	"*"\|"/"\|"_"\| ! Can be Markdown
				50	! from book
				51	[%‘ %‘]\|[%’ %’]];
				52
				53	define Sym ["-"\|"+"\|"<"\|">"\|"*"\|"/"\|%=\|%@];
				54	define Apos %'\|%’\|%`;
				55	define Punct LP\|RP\|Sym;
				56	!define nonSym \[WS\|LP\|RP\|Sym];
				57	!!!!!!!!!!!!!!!!!!
				58	! </from tmorph> !
				59	!!!!!!!!!!!!!!!!!!
				60
				61	define Emdash [%- %- (%-)+ \| ["\u2014"\|"\u2015"\|"\u2e3a"\|"\u2e3b"\|"\ufe58"]+];
				62	define Dash ["-"\|"\u2011"\|"\u2012"\|"\u2013"\|"\u2e1a"\|"\ufe63"\|"\uff0d"];
				63	define Slash ["⁄"\|"∕"\|"／"\|"/"];
				64	define Asterisk ["*"];
				65
				66	define Char \[WS\|Punct\|Apos]; ! \|¨;
				67
				68	! source lexicon.xfst
				69	! define Word;
				70	define Word Char+ ([Dash\|Apos\|Asterisk] Char+)*;
				71
				72	define URLChar [Char\|[Sym - ["<"\|">"\|%"]]];
				73	!define Alpha ["a"\|"b"\|"c"\|"d"\|"e"\|"f"\|"g"\|"h"\|"i"\|"j"\|"k"\|"l"\|"m"\|"n"\|"o"\|"p"\|"q"\|"r"\|"s"\|"t"\|"u"\|"v"\|"w"\|"x"\|"y"\|"z"\|"_"];
				74
				75	define Caseinsensitive [
				76	a (->) A,
				77	b (->) B,
				78	c (->) C,
				79	d (->) D,
				80	e (->) E,
				81	f (->) F,
				82	g (->) G,
				83	h (->) H,
				84	i (->) I,
				85	j (->) J,
				86	k (->) K,
				87	l (->) L,
				88	m (->) M,
				89	n (->) N,
				90	o (->) O,
				91	p (->) P,
				92	q (->) Q,
				93	r (->) R,
				94	s (->) S,
				95	t (->) T,
				96	u (->) U,
				97	v (->) V,
				98	w (->) W,
				99	x (->) X,
				100	y (->) Y,
				101	z (->) Z,
				102	ö (->) Ö,
				103	ü (->) Ü,
				104	ä (->) Ä,
				105	ß (->) {SS}
				106	];
				107
Akron	fd92d7e	2021-08-11 16:31:43 +0200	[diff] [blame]	108	define Abbr @txt"txt/abbrv.txt";
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	109
Akron	57d0161	2021-08-11 17:53:19 +0200	[diff] [blame]	110	define Plusampersand @txt"txt/plusampersand.txt";
				111
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	112	! A solution to the "(author):" problem may be to add ")" at the end of any
				113	! string as a possible ending
				114
				115	define Years ["(" Digit+ (".") ")"] \| ["[" Digit+ (".") "]"];
				116
				117	source emoji.xfst
				118	define Emoji;
				119
				120	! acronyms: U.S.A., I.B.M., etc.
				121	! use a post-filter to remove dots
				122	define AcronymDep Char %. [Char %.]+;
				123
				124	define Dot "."\|[["["\|"("] "d" "o" "t" [")"\|"]"]] .o. Caseinsensitive;
				125	define At "@"\|[["["\|"("] "a" "t" [")"\|"]"]] .o. Caseinsensitive;
				126
				127	define TldEnd [{org}\|{de}\|{com}] .o. Caseinsensitive;
				128
				129	! Very relaxed URL scheme, not based on the strict Lucene implementation
				130	define URL [ [ [{http} (s) \| {ftp} \| {file}] ":" "/" "/"] \| [{www} Dot] ]
				131	URLChar [URLChar\|SP]* URLChar
				132	.o. Caseinsensitive;
				133
				134	define Domain Char+ [Dash Char+]* Dot TldEnd;
				135
				136	!define XML "<" Alpha URLChar* (">");
				137	define XML "<" URLChar+ (">");
				138
				139	!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
				140	define Email URLChar+ At URLChar+ [Dot URLChar+]+;
				141
				142	! Twitter user, hashtag, Google+
				143	define SNS ["@"\|"#"\|"+"] Char+;
				144
				145	define FileEnd [
				146	[{htm} ("l")]\|
				147	[{doc} ("x")]\|
				148	{pdf}\|
				149	["j" "p" ("e") "g"]\|
				150	["m" "p" ["3"\|"4"]]\|
				151	{ogg}\|
				152	{png}\|
				153	{avi}\|
				154	{txt}\|
				155	{xls}\|
				156	{xml}\|
				157	{aac}\|
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	158	{gif}\|
				159	{exe}
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	160	] .o. Caseinsensitive;
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	161
				162	define File (( AsciiLetter ":" %\ \| "/" ) [ Char \| "_" \| "-" \| Char [ %\ \| "/" ] ]*) [Char \| "-" \| "_" ]+ "." FileEnd;
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	163
Akron	a0bded5	2021-08-11 15:48:02 +0200	[diff] [blame]	164	define Streetname Word {str} %.;
Akron	4af79f1	2021-08-11 14:48:17 +0200	[diff] [blame]	165
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	166	! Also supports
				167	! 19.4.2015, 19/4/2015 etc.
				168	define DigitPunct ["_"\|"-"\|"."\|","\|Slash];
				169	define Num Digit+ [DigitPunct Digit+]* (Char+);
				170
Akron	a0bded5	2021-08-11 15:48:02 +0200	[diff] [blame]	171	! Ordinals: one to three digits followed by a dot (e.g. 1., 22., 333.)
				172	define Ord Digit ( Digit (Digit) ) %.;
				173
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	174	! TODO:
				175	! floating point, serial, model numbers, ip addresses, etc.
				176	! every other segment must have at least one digit
				177
				178	! Omission words like "fu**ing": characters, two or more asterisks, optional trailing characters
				179	define Omission Char+ Asterisk Asterisk+ Char*;
				180
				181
				182	! TODO: Name words with ' and `
				183
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	184	! Support ASCII elements, like
				185	! +---------------+
				186	! <---->, -->, <--
				187	! +---------------+
				188	! <---> \| Worker Node N \|
				189	! +---------------+
				190	! \|============= Core =============\|
				191
				192
				193
Akron	57d0161	2021-08-11 17:53:19 +0200	[diff] [blame]	194	define RealToken [XML\|Email\|URL\|SNS\|[Abbr %.]\|Plusampersand\|Streetname\|Omission\|Domain\|AcronymDep\|File\|Emdash\|Punct\|Ord\|Num\|Years\|Emoji\|Word];
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	195
				196	echo - Introduce Token splitter
				197	define Token [RealToken @-> ... NLout]
				198	.o. [WS+ @-> 0]
				199	;
				200
				201	echo - Introduce Sentence splitter
				202	read regex Token .o. [[["."\|"!"\|"?"]+] @-> ... NLout \/ NLout _];
				203
				204	! foma -e "source tokenizer.xfst" -q -s && cat text.txt \| flookup tokenizer.fst -x -b
				205
				206	! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
				207	! and anything with > with ~xmle.
				208	! In case this is part of an emoticon ( >:-P ), this needs to be split again.
				209	! The same is true for ( and )