! Source blob: de6788f636e14c0d774d74e313615d1e341ff464
! Marker symbol that the tokenizer rules insert at token boundaries.
define TB "@_TOKEN_BOUND_@";
! Alternative one-character marker, currently disabled:
# define TB "_";
! Whitespace: space, line feed (U+000A) and horizontal tab (U+0009).
define WS [" "|"\u000a"|"\u0009"];
! Punctuation symbols.
define PUNCT ["."|"?"|"!"];
! Any single symbol that is not whitespace, punctuation or an apostrophe.
define Char \[WS|PUNCT|"'"];
! Apostrophe-initial clitics: 'll, 'd, 've, 's, 're, 'm, 'n, 't.
define Clitic ["'" [{ll}|{d}|{ve}|{s}|{re}|"m"|"n"|"t"]];
# Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
! Host stem of a negated contraction up to and including its final "n",
! i.e. everything in front of the apostrophe ("don", "doesn", "can", "ain", ...).
define CliticNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}];
! Complete negated contraction: host stem followed by "'t" ("don't", "can't", ...).
define CliticNT [CliticNTPart "'" {t}];


! A word is a clitic, a run of one or more ordinary characters, or a complete
! n't-contraction such as "don't".
! The contraction stems (CliticNTPart) are deliberately NOT subtracted from
! Char+: subtracting them made free-standing "can", "won" or "don" unmatchable
! as a whole, so a longest-match rule over Word split them into "ca" + "n" etc.
! The subtraction is not needed to keep contractions together either, because
! a longest-match rule already prefers "don't" (CliticNT) over the shorter "don".
define Word [Clitic | Char+ | CliticNT];

! Tokenizer: three replacement rules composed in this order.
!  1. Insert TB after every longest left-to-right match of a word, a
!     punctuation symbol or an n't-contraction. (CliticNT is listed
!     explicitly although Word already contains it.)
!  2. Delete every run of whitespace.
!  3. Insert a further TB after a punctuation run that is preceded by TB;
!     the \/ operator checks that context on the output side.
! Compose token boundaries
define Tokenizer [[Word|PUNCT|CliticNT] @-> ... TB] .o.
 ! Compose Whitespace ignorance
[WS+ @-> 0] .o.
 ! Compose sentence ends
[[PUNCT+] @-> ... TB \/ TB _ ];
! Final network: the Tokenizer composed with a rule that inserts TB in front
! of "n't", so that a contraction is divided into host and "n't".
! NOTE(review): the context after \/ has no "_" placeholder, unlike the
! sentence-end rule above - confirm how the compiler interprets it and
! whether "_ TB" was intended.
read regex Tokenizer .o. [{n't} ->@ TB ... \/ TB ];
