| ! Tokenizer transducer: marks token boundaries, removes whitespace, and | |
| ! adds a further boundary after punctuation (stages described below). | |
| ! TB: the symbol written to the output as the token-boundary marker. | |
| define TB "@_TOKEN_BOUND_@"; | |
| ! WS: one whitespace symbol: space, U+000A (line feed) or U+0009 (tab). | |
| define WS [" "|"\u000a"|"\u0009"]; | |
| ! PUNCT: one punctuation symbol: full stop, question mark or exclamation mark. | |
| define PUNCT ["."|"?"|"!"]; | |
| ! Char: any single symbol that is neither WS nor PUNCT (term complement). | |
| define Char \[WS|PUNCT]; | |
| ! Word: one or more Char symbols. | |
| define Word Char+; | |
| ! Stage 1 (token boundaries): append TB after every Word or single PUNCT | |
| ! symbol, using left-to-right longest match. | |
| define Tokenizer [[Word|PUNCT] @-> ... TB] .o. | |
| ! Stage 2 (whitespace removal): delete every run of WS symbols. | |
| [WS+ @-> 0] .o. | |
| ! Stage 3 (sentence ends): append another TB after a PUNCT run whose left | |
| ! context is TB. NOTE(review): the context operator used here presumably | |
| ! checks contexts on the output side, as in xfst; confirm against the | |
| ! foma reference. Stage 1 already puts TB after each PUNCT symbol, so | |
| ! PUNCT+ likely only ever matches one symbol at this point; verify intent. | |
| [[PUNCT+] @-> ... TB \/ TB _ ]; | |
| ! Compile Tokenizer composed with a final rule that inserts TB before each | |
| ! occurrence of the string 're. NOTE(review): the arrow in this rule is | |
| ! presumably the right-to-left longest-match replace; confirm. | |
| read regex Tokenizer .o. [{'re} ->@ TB ... ]; |