blob: 1b129aa4da61dae921120c18c522d29c1c452401 [file] [log] [blame]
! Marker symbol emitted after every recognized token.
define TB "@_TOKEN_BOUND_@";
! Whitespace: space, newline (U+000A), tab (U+0009).
define WS [" "|"\u000a"|"\u0009"];
! Sentence-final punctuation.
define PUNCT ["."|"?"|"!"];
! Any character that is neither whitespace nor punctuation.
define Char \[WS|PUNCT];
! A word is a maximal run of such characters.
define Word Char+;
6
! Mark token boundaries: append TB after each longest word or punctuation mark
define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
 ! Discard whitespace runs entirely
[WS+ @-> 0] .o.
 ! Mark sentence ends: punctuation already followed by TB gets an extra TB,
 ! but only in the context of a preceding TB (i.e. at a token edge)
[[PUNCT+] @-> ... TB \/ TB _ ];
! Final network: the tokenizer cascade, further composed with a clitic rule
! that inserts a token boundary before the English clitic "'re"
! (so e.g. "you're" splits into "you" and "'re").
read regex Tokenizer .o. [{'re} ->@ TB ... ];