blob: de6788f636e14c0d774d74e313615d1e341ff464 [file] [log] [blame]
# Token-boundary marker. It is quoted, so the whole string is one symbol.
# Alternative one-character marker, kept for reference:
# define TB "_";
define TB ["@_TOKEN_BOUND_@"];
# Whitespace symbols: horizontal tab (U+0009), line feed (U+000A) and space.
define WS ["\u0009" | "\u000a" | " "];
# Punctuation marks that get a token of their own: "!", "?" and ".".
define PUNCT ["!" | "?" | "."];
# Any single symbol that is not whitespace, punctuation or an apostrophe.
define Char [? - [WS | PUNCT | "'"]];
# Apostrophe-initial clitics: 'd 'm 'n 's 't 'll 're 've.
define Clitic ["'" ["d" | "m" | "n" | "s" | "t" | {ll} | {re} | {ve}]];
# Hosts of the negative clitic, spelled up to and including the "n" of "n't"
# (e.g. "don", "isn", "couldn").
# List follows https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
define CliticNTPart [
  [ {do} | {does} | {did} | {have} | {has} | {had} | {ca} | {could}
  | {might} | {sha} | {should} | {wo} | {would} | {dare} | {must}
  | {need} | {ought} | {are} | {is} | {was} | {were} | {ai} ] "n" ];
# A complete negative contraction: host + "n" followed by "'t" (e.g. "don't").
define CliticNT [CliticNTPart "'" "t"];
# A word is an apostrophe clitic, a run of ordinary characters, or a complete
# "n't" contraction such as "don't".
# The "n"-final stems of CliticNTPart ("can", "won", "don", "haven", ...) are
# deliberately NOT subtracted from Char+. The tokenizer rule is longest-match,
# so "don't" already wins over "don", whereas the subtraction made a
# free-standing "can" unmatchable as a whole and split it into "ca" + "n".
define Word [Clitic | Char+ | CliticNT];
! Tokenizer: three rewrite rules composed in order.
! Rule 1: append TB after every left-to-right longest match of a word,
!         a punctuation mark or an "n't" contraction.
define Tokenizer [ [ CliticNT | PUNCT | Word ] @-> ... TB ]
  .o.
  ! Rule 2: delete every run of whitespace.
  [ WS+ @-> 0 ]
  .o.
  ! Rule 3 (sentence ends): append a further TB after a punctuation run
  !         that is directly preceded by a TB on the output side.
  [ PUNCT+ @-> ... TB \/ TB _ ];
# Compile the final network. After tokenization, split the negative clitic off
# its host by inserting TB in front of every "n't" that is immediately followed
# by a TB (i.e. that ends a token): "don't" TB becomes "do" TB "n't" TB.
# The context must contain the "_" placeholder to say on which side of the
# match TB has to appear. It was missing, which left the context malformed.
read regex Tokenizer .o. [{n't} ->@ TB ... \/ _ TB ];