! testdata/clitic_test.xfst - KorAP/Datok - Gitiles

 ! Marker symbol that the tokenizer rules insert at token boundaries.
 define TB "@_TOKEN_BOUND_@";
 ! Disabled alternative marker:
 # define TB "_";
 ! Whitespace symbols: space plus the symbols written as the escapes
 ! for U+0009 (tab) and U+000A (line feed).
 define WS [ " " | "\u0009" | "\u000a" ];
 ! Punctuation symbols: exclamation mark, question mark, full stop.
 define PUNCT [ "!" | "?" | "." ];
 ! Any single symbol that is neither an apostrophe, punctuation nor whitespace.
 define Char \[ "'" | PUNCT | WS ];
 ! Apostrophe clitics: an apostrophe followed by one of ll, d, ve, s, re, m, n, t.
 define Clitic [ "'" [ "m" | "n" | "t" | {d} | {s} | {ll} | {re} | {ve} ] ];
 # Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
 ! Host stems of the contracted negation, each followed by the letter n
 ! (for example don, doesn, can, won, ain).
 define CliticNTPart [
   [ {do} ({es}) | {did}
   | {have} | {has} | {had}
   | {are} | {is} | {was} | {were} | {ai}
   | {ca} | {could} | {sha} | {should} | {wo} | {would}
   | {might} | {must} | {dare} | {need} | {ought}
   ] {n} ];
 ! Complete contracted form: CliticNTPart, then an apostrophe, then t.
 define CliticNT [ CliticNTPart "'" {t} ];


 ! A word is a Clitic or a non-empty run of Char symbols, minus the bare
 ! host stems in CliticNTPart, or else a complete CliticNT form.
 ! NOTE(review): assumes - and | group left to right here, i.e.
 ! [[Clitic | Char+] - CliticNTPart] | CliticNT -- confirm.
 define Word [ [ Clitic |  Char+] - CliticNTPart | CliticNT];

 ! Tokenizer is the composition of three replace rules, applied in this order.
 ! Step 1, token boundaries: append TB after each Word, PUNCT or CliticNT match
 define Tokenizer [[Word|PUNCT|CliticNT] @-> ... TB] .o.
  ! Step 2, whitespace ignorance: delete every run of whitespace
 [WS+ @-> 0] .o.
  ! Step 3, sentence ends: append another TB after a punctuation run that has TB as its left context
 [[PUNCT+] @-> ... TB \/ TB _ ];
 ! Network pushed onto the stack: Tokenizer composed with a rule that
 ! inserts TB in front of n't.
 ! NOTE(review): the context after \/ has no _ marker, unlike the
 ! sentence-end rule inside Tokenizer -- confirm the parser accepts this
 ! and that the resulting context is the intended one.
 read regex Tokenizer .o. [{n't} ->@ TB ... \/ TB ];
	! NOTE(review): this section appears to repeat the definitions made earlier
	! in this file. If both copies are kept, every name is defined twice and
	! read regex pushes a second network onto the stack -- confirm it is wanted.
	! The union operators below were written with a backslash in front of the
	! bar, which is not a union in xfst. They are restored to a plain bar here.

	! Marker symbol that the tokenizer rules insert at token boundaries.
	define TB "@_TOKEN_BOUND_@";
	# define TB "_";
	! Whitespace symbols: space plus the symbols written as the escapes
	! for U+000A (line feed) and U+0009 (tab).
	define WS [" "|"\u000a"|"\u0009"];
	! Punctuation symbols: full stop, question mark, exclamation mark.
	define PUNCT ["."|"?"|"!"];
	! Any single symbol that is neither whitespace, punctuation nor an apostrophe.
	define Char \[WS|PUNCT|"'"];
	! Apostrophe clitics: an apostrophe followed by one of ll, d, ve, s, re, m, n, t.
	define Clitic ["'" [{ll}|{d}|{ve}|{s}|{re}|"m"|"n"|"t"]];
	# Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
	! Host stems of the contracted negation, each followed by the letter n.
	define CliticNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}];
	! Complete contracted form: CliticNTPart, then an apostrophe, then t.
	define CliticNT [CliticNTPart "'" {t}];


	! A word is a Clitic or a non-empty run of Char symbols, minus the bare
	! host stems in CliticNTPart, or else a complete CliticNT form.
	! NOTE(review): assumes - and | group left to right here -- confirm.
	define Word [ [ Clitic | Char+] - CliticNTPart | CliticNT];

	! Compose token boundaries: append TB after each Word, PUNCT or CliticNT match
	define Tokenizer [[Word|PUNCT|CliticNT] @-> ... TB] .o.
	! Compose Whitespace ignorance: delete every run of whitespace
	[WS+ @-> 0] .o.
	! Compose sentence ends: append another TB after a punctuation run that has TB as its left context
	[[PUNCT+] @-> ... TB \/ TB _ ];
	! Network pushed onto the stack: Tokenizer composed with a rule that
	! inserts TB in front of n't.
	! NOTE(review): the context after \/ has no _ marker -- confirm the parser
	! accepts this and that the resulting context is the intended one.
	read regex Tokenizer .o. [{n't} ->@ TB ... \/ TB ];