Blame - src/tokenizer.xfst - KorAP/Datok

blob: 6d21d8d284ab2677bdbaad6235355be45a6d4b62 [file] [log] [blame]

Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	1	! This tokenizer is based on work by
				2	! - StandardTokenizerImpl by the Lucene project
				3	! under the Apache License
				4	! - https://github.com/dlwh/epic by David Hall (2014)
				5	! under the Apache License
				6	! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
				7	! under the Apache License
				8	! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
				9	! under the MIT License
				10	!
				11	! The abbreviation list is part of the sentence splitter tool
				12	! of the IDS.
				13
Akron	4af79f1	2021-08-11 14:48:17 +0200	[diff] [blame]	14	! define NLout "\u000a";
				15	define NLout "@_TOKEN_SYMBOL_@"; ! boundary marker appended after each token; the newline variants around this line are disabled alternatives
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	16	! define NLout "\u000a";
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	17
				18	define Digit [%0\|1\|2\|3\|4\|5\|6\|7\|8\|9]; ! ASCII digits 0-9
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	19	define AsciiLetter [a\|b\|c\|d\|e\|f\|g\|h\|i\|j\|k\|l\|m\|n\|o\|p\|q\|r\|s\|t\|u\|v\|w\|x\|y\|z]; ! lowercase ASCII letters a-z only
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	20
				21	!!!!!!!!!!!!!!!!!
				22	! <from tmorph> !
				23	!!!!!!!!!!!!!!!!!
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	24	define WS [" "\|"\u0009"\|"\u00a0"\|"\u1680"\|
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	25	"\u2000"\|"\u2001"\|"\u2002"\|"\u2003"\|"\u2004"\|"\u2005"\|
				26	"\u2006"\|"\u2007"\|"\u2008"\|"\u2009"\|"\u200a"\|
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	27	"\u202f"\|"\u205f"\|"\u3000"]; ! WS: a single horizontal whitespace character (space, tab, no-break space and further Unicode space characters); line breaks are kept separately in NL
				28
				29	define NL ["\u000a"\|"\u000b"\|"\u000c"\|"\u000d"\|"\u0085"\|"\u2028"\|"\u2029"]; ! a single line-break character: LF, VT, FF, CR, NEL, line separator, paragraph separator
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	30
				31	! Punctuation that ends sentences
				32	! Differs!
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	33	define SP [["."\|"?"\|"!"]+\|"…"]; ! a run of one or more of . ? ! (any mix), or a single ellipsis character
				34
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	35	! Left punctuation
				36	define LP ["("\|"["\|"{"\| ! opening brackets
				37	"“"\|"‘"\|"‹"\|"«"\| ! opening typographic quotes
				38	"'"\|%"\| ! straight single and double quote
				39	! differs
				40	["'" "'"] \| ! two consecutive straight apostrophes
				41	"*"\|"/"\|"_"\| ! Can be Markdown
				42	! from book
				43	[%, %,]]; ! two consecutive commas
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	44
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	45	! Right punctuation - excluding the characters that can be used as apostrophe
				46	define RP [SP\|","\|";"\|":"\| ! sentence-ending punctuation plus comma, semicolon, colon
				47	")"\|"]"\|"}"\| ! closing brackets
				48	"”"\|"›"\|"»"\| ! closing typographic quotes
				49	%"\| ! straight double quote
				50	! differs
				51	["'" "'"]\| ! two consecutive straight apostrophes
				52	"*"\|"/"\|"_"\| ! Can be Markdown
				53	! from book
				54	[%‘ %‘]\|[%’ %’]]; ! doubled typographic single quotes
				55
				56	define Sym ["-"\|"+"\|"<"\|">"\|"*"\|"/"\|%=\|%@]; ! symbol characters; "*" and "/" are also listed in LP and RP
				57	define Apos %'\|%’\|%`; ! characters treated as apostrophe: straight quote, right single quote, backtick
				58	define Punct LP\|RP\|Sym; ! any left punctuation, right punctuation or symbol
				59	!define nonSym \[WS\|LP\|RP\|Sym];
				60	!!!!!!!!!!!!!!!!!!
				61	! </from tmorph> !
				62	!!!!!!!!!!!!!!!!!!
				63
				64	define Emdash [%- %- (%-)+ \| ["\u2014"\|"\u2015"\|"\u2e3a"\|"\u2e3b"\|"\ufe58"]+]; ! two or more ASCII hyphens, or a run of one or more long-dash characters
				65	define Dash ["-"\|"\u2011"\|"\u2012"\|"\u2013"\|"\u2e1a"\|"\ufe63"\|"\uff0d"]; ! a single hyphen-minus or one of its Unicode hyphen/dash variants
				66	define Slash ["⁄"\|"∕"\|"／"\|"/"]; ! a single slash: fraction slash, division slash, fullwidth solidus or ASCII slash
				67	define Asterisk ["*"]; ! the ASCII asterisk
				68
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	69	define Char \[WS\|NL\|Punct\|Apos]; ! \|¨; -- Char is the term complement: any single symbol that is not whitespace, a line break, punctuation or an apostrophe
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	70
				71	! source lexicon.xfst
				72	! define Word;
				73	define Word Char+ ([Dash\|Apos\|Asterisk] Char+)*; ! runs of Char, optionally joined by a single dash, apostrophe or asterisk between runs
				74
				75	define URLChar [Char\|[Sym - ["<"\|">"\|%"]]]; ! Char plus every Sym character except "<" and ">" (the double quote is not in Sym, so subtracting it has no effect)
				76	!define Alpha ["a"\|"b"\|"c"\|"d"\|"e"\|"f"\|"g"\|"h"\|"i"\|"j"\|"k"\|"l"\|"m"\|"n"\|"o"\|"p"\|"q"\|"r"\|"s"\|"t"\|"u"\|"v"\|"w"\|"x"\|"y"\|"z"\|"_"];
				77
				78	define Caseinsensitive [ ! parallel optional rewrites: each lowercase letter may be replaced by its uppercase form; composed onto patterns that are written in lowercase
				79	a (->) A,
				80	b (->) B,
				81	c (->) C,
				82	d (->) D,
				83	e (->) E,
				84	f (->) F,
				85	g (->) G,
				86	h (->) H,
				87	i (->) I,
				88	j (->) J,
				89	k (->) K,
				90	l (->) L,
				91	m (->) M,
				92	n (->) N,
				93	o (->) O,
				94	p (->) P,
				95	q (->) Q,
				96	r (->) R,
				97	s (->) S,
				98	t (->) T,
				99	u (->) U,
				100	v (->) V,
				101	w (->) W,
				102	x (->) X,
				103	y (->) Y,
				104	z (->) Z,
				105	ö (->) Ö,
				106	ü (->) Ü,
				107	ä (->) Ä,
				108	ß (->) {SS} ! ß has no single-letter mapping here; it may become the two-letter sequence SS
				109	];
				110
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	111	define Abbr @txt"txt/abbrv.txt" %.; ! word list read from txt/abbrv.txt, each entry followed by a literal period
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	112
Akron	57d0161	2021-08-11 17:53:19 +0200	[diff] [blame]	113	define Plusampersand @txt"txt/plusampersand.txt"; ! word list read from txt/plusampersand.txt (presumably words containing "+" or "&" - confirm against the list)
				114
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	115	! A solution to the "(author): problem" may be to add ) at the end of any
				116	! string as a possible ending
				117
				118	define Years ["(" Digit+ (".") ")"] \| ["[" Digit+ (".") "]"]; ! digits with an optional trailing period, enclosed in matching round or square brackets
				119
				120	source emoji.xfst
				121	define Emoji; ! define without a regex: presumably binds the network that emoji.xfst leaves on the stack - emoji.xfst is not visible here, confirm
				122
				123	! acronyms: U.S.A., I.B.M., etc.
				124	! use a post-filter to remove dots
				125	define AcronymDep Char %. [Char %.]+; ! at least two "single Char + period" pairs
				126
				127	define Dot "."\|[["["\|"("] "d" "o" "t" [")"\|"]"]] .o. Caseinsensitive; ! a literal "." or the spelled-out word "dot" in round/square brackets, in any letter case; opening and closing bracket need not match
				128	define At "@"\|[["["\|"("] "a" "t" [")"\|"]"]] .o. Caseinsensitive; ! a literal "@" or the spelled-out word "at" in round/square brackets, in any letter case; opening and closing bracket need not match
				129
				130	define TldEnd [{org}\|{de}\|{com}] .o. Caseinsensitive; ! only these three top-level domains are recognised, in any letter case
				131
				132	! Very relaxed URL scheme, not based on the strict Lucene implementation
				133	define URL [ [ [{http} (s) \| {ftp} \| {file}] ":" "/" "/"] \| [{www} Dot] ] ! prefix: http, https, ftp or file followed by "://", or "www" followed by Dot
				134	URLChar [URLChar\|SP]* URLChar ! at least two URLChar; SP may occur inside but the match must end in a URLChar, so it cannot end in sentence punctuation
				135	.o. Caseinsensitive; ! the whole pattern is matched in any letter case
				136
				137	define Domain Char+ [Dash Char+]* Dot TldEnd; ! one label of Char runs joined by single dashes, then Dot and a TldEnd suffix; Char excludes ".", so dotted subdomains are not covered
				138
				139	!define XML "<" Alpha URLChar* (">");
				140	define XML "<" URLChar+ (">"); ! "<" followed by one or more URLChar and an optional closing ">"
				141
				142	!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
				143	define Email URLChar+ At URLChar+ [Dot URLChar+]+; ! local part, At, then a host with at least one Dot-separated part after the first
				144
				145	! Twitter user, hashtag, Google+
				146	define SNS ["@"\|"#"\|"+"] Char+; ! one of @, # or + directly followed by a run of Char
				147
				148	define FileEnd [ ! file-name extensions (without the leading dot) accepted by File
				149	[{htm} ("l")]\| ! htm or html
				150	[{doc} ("x")]\| ! doc or docx
				151	{pdf}\|
				152	["j" "p" ("e") "g"]\| ! jpg or jpeg
				153	["m" "p" ["3"\|"4"]]\| ! mp3 or mp4
				154	{ogg}\|
				155	{png}\|
				156	{avi}\|
				157	{txt}\|
				158	{xls}\|
				159	{xml}\|
				160	{aac}\|
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	161	{gif}\|
				162	{exe}
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	163	] .o. Caseinsensitive; ! extensions are matched in any letter case
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	164
				165	define File (( AsciiLetter ":" %\ \| "/" ) [ Char \| "_" \| "-" \| Char [ %\ \| "/" ] ]*) [Char \| "-" \| "_" ]+ "." FileEnd; ! optional path (optionally starting with a drive letter + ":\" or with "/"), then a file name of Char, "-" or "_", a literal "." and a FileEnd extension
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	166
Akron	a0bded5	2021-08-11 15:48:02 +0200	[diff] [blame]	167	define Streetname Word {str} %.; ! a Word immediately followed by the lowercase letters "str" and a period
Akron	4af79f1	2021-08-11 14:48:17 +0200	[diff] [blame]	168
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	169	! Also supports
				170	! 19.4.2015, 19/4/2015 etc.
				171	define DigitPunct ["_"\|"-"\|"."\|","\|Slash]; ! separators allowed between digit groups
				172	define Num Digit+ [DigitPunct Digit+]* (Char+); ! digit groups joined by single separators, optionally followed directly by a run of Char
				173
Akron	a0bded5	2021-08-11 15:48:02 +0200	[diff] [blame]	174	! ordinals
				175	define Ord Digit ( Digit (Digit) ) %.; ! one to three digits followed by a period
				176
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	177	! TODO:
				178	! floating point, serial, model numbers, ip addresses, etc.
				179	! every other segment must have at least one digit
				180
				181	! Omission words like "fu**ing!"
				182	define Omission Char+ Asterisk Asterisk+ Char*; ! a run of Char, at least two asterisks, then an optional run of Char
				183
				184
				185	! TODO: Name words with ' and `
				186
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	187	! Support ASCII elements, like
				188	! +---------------+
				189	! <---->, -->, <--
				190	! +---------------+
				191	! <---> | Worker Node N |
				192	! +---------------+
				193	! |============= Core =============|
				194
				195
				196
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	197	define RealToken [XML\|Email\|URL\|SNS\|Abbr\|Plusampersand\|Streetname\|Omission\|Domain\|AcronymDep\|File\|Emdash\|Punct\|Ord\|Num\|Years\|Emoji\|Word]; ! union of all token patterns; a union is unordered, so the listing order expresses no priority
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	198
				199	echo - Introduce Token splitter
				200	define Token [RealToken @-> ... NLout] ! left-to-right longest match of RealToken; NLout is inserted after each match
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	201	! .o. [NL -> 0]
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	202	.o. [WS+ @-> 0] ! afterwards delete every run of WS; NL characters are kept because the rule above is disabled
				203	;
				204
				205	echo - Introduce Sentence splitter
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	206	read regex Token .o. [SP @-> ... NLout \/ NLout _]; ! final network: sentence-ending punctuation (SP) that directly follows an NLout gets a further NLout appended; SP replaces the former inline copy of the same set
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	207
				208	! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
				209
				210	! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
				211	! and anything with > with ~xmle.
				212	! In case this is part of an emoticon ( >:-P ), this needs to be split again.
				213	! The same is true for ( and )