Blame - src/tokenizer.xfst - KorAP/Datok

blob: f7a089ff29f7fce09debe197e4ea41183747d042 [file] [log] [blame]

Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	1	! This tokenizer is based on work by
				2	! - StandardTokenizerImpl by the Lucene project
				3	! under the Apache License
				4	! - https://github.com/dlwh/epic by David Hall (2014)
				5	! under the Apache License
				6	! - KorAPTokenizerImpl.jflex by Marc Kupietz (2016)
				7	! under the Apache License
				8	! - https://github.com/coltekin/TRmorph/tokenize.xfst by Çağrı Çöltekin (2011-2015)
				9	! under the MIT License
				10	!
				11	! The abbreviation list is part of the sentence splitter tool
				12	! of the IDS.
				13
Akron	4af79f1	2021-08-11 14:48:17 +0200	[diff] [blame]	14	define NLout "@_TOKEN_SYMBOL_@"; ! marker symbol appended after every token (see Token)
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	15	! define NLout "\u000a";
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	16
				17	define Digit [%0\|1\|2\|3\|4\|5\|6\|7\|8\|9]; ! ASCII digits 0-9
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	18	define AsciiLetter [a\|b\|c\|d\|e\|f\|g\|h\|i\|j\|k\|l\|m\|n\|o\|p\|q\|r\|s\|t\|u\|v\|w\|x\|y\|z]; ! lowercase ASCII letters only
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	19
				20	!!!!!!!!!!!!!!!!!
				21	! <from tmorph> !
				22	!!!!!!!!!!!!!!!!!
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	23	define WS [" "\|"\u0009"\|"\u00a0"\|"\u1680"\|
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	24	"\u2000"\|"\u2001"\|"\u2002"\|"\u2003"\|"\u2004"\|"\u2005"\|
				25	"\u2006"\|"\u2007"\|"\u2008"\|"\u2009"\|"\u200a"\|
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	26	"\u202f"\|"\u205f"\|"\u3000"]; ! space, tab (U+0009), no-break space (U+00A0) and further Unicode space characters
				27
				28	define NL ["\u000a"\|"\u000b"\|"\u000c"\|"\u000d"\|"\u0085"\|"\u2028"\|"\u2029"]; ! line-break characters: LF, VT, FF, CR, NEL, LS, PS
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	29
				30	! Punctuation that ends sentences
				31	! Differs!
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	32	define SP [["."\|"?"\|"!"]+\|"…"]; ! a run of ".", "?", "!" in any mix, or one "…"
				33
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	34	! Left punctuation
				35	define LP ["("\|"["\|"{"\|
				36	"“"\|"‘"\|"‹"\|"«"\|
				37	"'"\|%"\|
				38	! differs
				39	["'" "'"] \| ! two ASCII apostrophes in a row
				40	"*"\|"/"\|"_"\| ! Can be Markdown
				41	! from book
				42	[%, %,]]; ! two commas in a row
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	43
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	44	! Right punctuation - excluding the characters that can be used as apostrophe
				45	define RP [SP\|","\|";"\|":"\|
				46	")"\|"]"\|"}"\|
				47	"”"\|"›"\|"»"\|
				48	%"\|
				49	! differs
				50	["'" "'"]\| ! two ASCII apostrophes in a row
				51	"*"\|"/"\|"_"\| ! Can be Markdown
				52	! from book
				53	[%‘ %‘]\|[%’ %’]]; ! doubled ‘ or doubled ’
				54
				55	define Sym ["-"\|"+"\|"<"\|">"\|"*"\|"/"\|%=\|%@]; ! symbol characters; "*" and "/" also occur in LP and RP
				56	define Apos %'\|%’\|%`; ! ASCII apostrophe, ’ and backtick
Akron	4c2a1ad	2021-08-31 00:35:53 +0200	[diff] [blame]	57	define Punct [LP\|RP\|Sym]; ! any left/right punctuation or symbol (Apos is not included)
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	58	!define nonSym \[WS\|LP\|RP\|Sym];
				59	!!!!!!!!!!!!!!!!!!
				60	! </from tmorph> !
				61	!!!!!!!!!!!!!!!!!!
				62
				63	define Emdash [%- %- (%-)+ \| ["\u2014"\|"\u2015"\|"\u2e3a"\|"\u2e3b"\|"\ufe58"]+]; ! two or more "-", or a run of U+2014/U+2015/U+2E3A/U+2E3B/U+FE58
				64	define Dash ["-"\|"\u2011"\|"\u2012"\|"\u2013"\|"\u2e1a"\|"\ufe63"\|"\uff0d"]; ! "-" and Unicode hyphen/dash variants
				65	define Slash ["⁄"\|"∕"\|"／"\|"/"]; ! "/" and Unicode slash variants
				66	define Asterisk ["*"];
				67
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	68	define Char \[WS\|NL\|Punct\|Apos]; ! \|¨; -- Char is any single symbol outside WS, NL, Punct and Apos
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	69
				70	! source lexicon.xfst
				71	! define Word;
				72	define Word Char+ ([Dash\|Apos\|Asterisk] Char+)*; ! Char runs joined by single Dash, Apos or Asterisk characters
				73
				74	define URLChar [Char\|[Sym - ["<"\|">"\|%"]]]; ! Char or a Sym other than "<" and ">"
				75	!define Alpha ["a"\|"b"\|"c"\|"d"\|"e"\|"f"\|"g"\|"h"\|"i"\|"j"\|"k"\|"l"\|"m"\|"n"\|"o"\|"p"\|"q"\|"r"\|"s"\|"t"\|"u"\|"v"\|"w"\|"x"\|"y"\|"z"\|"_"];
				76
				77	define Caseinsensitive [ ! each rule optionally replaces a lowercase letter with its uppercase form
				78	a (->) A,
				79	b (->) B,
				80	c (->) C,
				81	d (->) D,
				82	e (->) E,
				83	f (->) F,
				84	g (->) G,
				85	h (->) H,
				86	i (->) I,
				87	j (->) J,
				88	k (->) K,
				89	l (->) L,
				90	m (->) M,
				91	n (->) N,
				92	o (->) O,
				93	p (->) P,
				94	q (->) Q,
				95	r (->) R,
				96	s (->) S,
				97	t (->) T,
				98	u (->) U,
				99	v (->) V,
				100	w (->) W,
				101	x (->) X,
				102	y (->) Y,
				103	z (->) Z,
				104	ö (->) Ö,
				105	ü (->) Ü,
				106	ä (->) Ä,
				107	ß (->) {SS} ! ß optionally becomes the two-symbol string "SS"
				108	];
				109
Akron	3de361e	2021-08-17 09:56:42 +0200	[diff] [blame]	110	define Abbr @txt"txt/abbrv.txt" %.; ! word list read from txt/abbrv.txt, each entry followed by "."
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	111
Akron	57d0161	2021-08-11 17:53:19 +0200	[diff] [blame]	112	define Plusampersand @txt"txt/plusampersand.txt"; ! word list read from txt/plusampersand.txt
				113
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	114	! A solution to the "(author): problem" may be to add ) at the end of any
				115	! string as a possible ending
				116
				117	define Years ["(" Digit+ (".") ")"] \| ["[" Digit+ (".") "]"]; ! digits with an optional trailing ".", enclosed in () or []
				118
				119	source emoji.xfst
				120	define Emoji;
				121
				122	! acronyms: U.S.A., I.B.M., etc.
				123	! use a post-filter to remove dots
				124	define AcronymDep Char %. [Char %.]+; ! at least two "Char ." pairs
				125
				126	define Dot "."\|[["["\|"("] "d" "o" "t" [")"\|"]"]] .o. Caseinsensitive; ! "." or bracketed "dot"; composed with Caseinsensitive
				127	define At "@"\|[["["\|"("] "a" "t" [")"\|"]"]] .o. Caseinsensitive; ! "@" or bracketed "at"; composed with Caseinsensitive
				128
				129	define TldEnd [{org}\|{de}\|{com}] .o. Caseinsensitive; ! only org, de and com are listed
				130
				131	! Very relaxed URL scheme, not based on the strict Lucene implementation
				132	define URL [ [ [{http} (s) \| {ftp} \| {file}] ":" "/" "/"] \| [{www} Dot] ]
				133	URLChar [URLChar\|SP]* URLChar
				134	.o. Caseinsensitive; ! prefix is http(s)://, ftp://, file:// or "www" + Dot; the rest must start and end with a URLChar
				135
				136	define Domain Char+ [Dash Char+]* Dot TldEnd; ! Dash-joined Char runs, then Dot and a TldEnd
				137
Akron	4c2a1ad	2021-08-31 00:35:53 +0200	[diff] [blame]	138	! XML rule
				139	define XMLns [AsciiLetter [AsciiLetter\|Digit\|%-]* (%: AsciiLetter [AsciiLetter\|Digit\|%-]*)] .o. Caseinsensitive; ! name with an optional ":"-separated second part
				140	define XML [
				141	"<" [
				142	[
				143	XMLns ! opening tag: element name ...
				144	[WS+ XMLns WS* ! ... followed by any number of attributes
				145	(%= WS*
				146	[[%" [? - %" - %>]+ %"] \| [%' [? - %' - %>]+ %']] ! non-empty quoted value that contains no ">"
				147	)
				148	]*
				149	]
				150	\|
				151	[
				152	"/" XMLns ! closing tag
				153	]
				154	] WS* ">"
				155	].u; ! .u keeps only the upper side of the relation
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	156
				157	!define Email [Alpha [URLChar-At]* At Alpha URLChar* [Dot [[Alpha URLChar+]-Dot-At]]+];
				158	define Email URLChar+ At URLChar+ [Dot URLChar+]+; ! at least one Dot is required after the At
				159
				160	! Twitter user, hashtag, Google+
				161	define SNS ["@"\|"#"\|"+"] Char+; ! one of "@", "#", "+" followed by a Char run
				162
				163	define FileEnd [
				164	[{htm} ("l")]\|
				165	[{doc} ("x")]\|
				166	{pdf}\|
				167	["j" "p" ("e") "g"]\| ! jpg or jpeg
				168	["m" "p" ["3"\|"4"]]\| ! mp3 or mp4
				169	{ogg}\|
				170	{png}\|
				171	{avi}\|
				172	{txt}\|
				173	{xls}\|
				174	{xml}\|
				175	{aac}\|
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	176	{gif}\|
				177	{exe}
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	178	] .o. Caseinsensitive; ! the listed extensions, composed with Caseinsensitive
Akron	e8837b5	2021-08-11 17:29:58 +0200	[diff] [blame]	179
				180	define File (( AsciiLetter ":" %\ \| "/" ) [ Char \| "_" \| "-" \| Char [ %\ \| "/" ] ]*) [Char \| "-" \| "_" ]+ "." FileEnd; ! optional root (AsciiLetter + ":" + backslash, or "/") and path, then a file name, "." and a FileEnd
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	181
Akron	a0bded5	2021-08-11 15:48:02 +0200	[diff] [blame]	182	define Streetname Word {str} %.; ! a Word directly followed by "str."
Akron	4af79f1	2021-08-11 14:48:17 +0200	[diff] [blame]	183
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	184	! Also supports
				185	! 19.4.2015, 19/4/2015 etc.
				186	define DigitPunct ["_"\|"-"\|"."\|","\|Slash];
				187	define Num Digit+ [DigitPunct Digit+]* (Char+); ! digit groups joined by one DigitPunct each, optional Char suffix
				188
Akron	a0bded5	2021-08-11 15:48:02 +0200	[diff] [blame]	189	! ordinals
				190	define Ord Digit ( Digit (Digit) ) %.; ! one to three digits followed by "."
				191
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	192	! TODO:
				193	! floating point, serial, model numbers, ip addresses, etc.
				194	! every other segment must have at least one digit
				195
				196	! Omission words like "fu**ing!"
				197	define Omission Char+ Asterisk Asterisk+ Char*; ! Char run, two or more "*", optional Char run
				198
				199
				200	! TODO: Name words with ' and `
				201
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	202	! Support ASCII elements, like
				203	! +---------------+
				204	! <---->, -->, <--
				205	! +---------------+
				206	! <---> \| Worker Node N \|
				207	! +---------------+
				208	! \|============= Core =============\|
				209
				210
Akron	4c2a1ad	2021-08-31 00:35:53 +0200	[diff] [blame]	211	echo - Compile Real Token
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	212
Akron	4c2a1ad	2021-08-31 00:35:53 +0200	[diff] [blame]	213	define RealToken [Punct\|Word\|XML\|Email\|URL\|SNS\|Domain\|AcronymDep\|File\|Ord\|Num\|Years]; ! union of the general token patterns defined above
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	214
				215	echo - Introduce Token splitter
Akron	4c2a1ad	2021-08-31 00:35:53 +0200	[diff] [blame]	216
				217	define Token [ ! parallel longest-match rules, each appending NLout to its match
				218	RealToken @-> ... NLout,
				219	Abbr @-> ... NLout,
				220	Plusampersand @-> ... NLout,
				221	Emoji @-> ... NLout,
				222	[Streetname\|Omission\|Emdash] @-> ... NLout
				223	]
				224	.o. [WS+ @-> 0 \|\| NLout _ ] ! then delete whitespace that directly follows an NLout
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	225	;
				226
				227	echo - Introduce Sentence splitter
Akron	4c2a1ad	2021-08-31 00:35:53 +0200	[diff] [blame]	228	read regex Token .o. [[["."\|"!"\|"?"]+\|"…"] @-> ... NLout \/ NLout _ ]; ! append a further NLout after sentence punctuation that is preceded by an NLout
Akron	310905f	2021-08-11 13:49:50 +0200	[diff] [blame]	229
				230	! foma -e "source tokenizer.xfst" -q -s && cat text.txt \| flookup tokenizer.fst -x -b
				231
				232	! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
				233	! and anything with > with ~xmle.
				234	! In case this is part of an emoticon ( >:-P ), this needs to be split again.
				235	! The same is true for ( and ).