blob: 1b129aa4da61dae921120c18c522d29c1c452401 [file] [log] [blame]
! TB: the token-boundary marker symbol that the rules below insert
! between tokens in the output.
define TB "@_TOKEN_BOUND_@";
! WS: single whitespace symbols -- space, newline (U+000A), tab (U+0009),
! and carriage return (U+000D) so that CRLF-terminated input is also
! treated as whitespace rather than leaving \r inside word tokens.
define WS [" "|"\u000a"|"\u0009"|"\u000d"];
! PUNCT: sentence-final punctuation marks (order in the union is irrelevant).
define PUNCT ["!" | "?" | "."];
! Char: any single symbol that is neither whitespace nor punctuation.
! (\X is the term complement: exactly one symbol not in X.)
define Char \[WS|PUNCT];
! Word: a run of one or more non-delimiter symbols.
define Word [Char Char*];
! Tokenizer: three rewrite stages cascaded with composition (.o.).
! Stage 1 -- token boundaries: append TB after every word or punctuation
! mark (@-> is leftmost-longest-match replacement; ... keeps the matched
! material in place, so TB is inserted after it).
define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
! Stage 2 -- whitespace removal: delete each maximal run of whitespace.
[WS+ @-> 0] .o.
! Stage 3 -- sentence ends: append a second TB after a punctuation run
! that already follows a TB (\/ : the context is matched on the output
! side of the rule, i.e. after the earlier stages' insertions).
[[PUNCT+] @-> ... TB \/ TB _ ];
! Compile the final transducer onto the stack: the tokenizer composed with
! a rule that splits the clitic 're from its host by inserting TB before it.
! (->@ is right-to-left longest-match replacement; {'re} is the symbol
! sequence ' r e; TB ... places the marker before the matched clitic.)
read regex Tokenizer .o. [{'re} ->@ TB ... ];