define TB "@_TOKEN_BOUND_@";
define WS [" "|"\u000a"|"\u0009"];
define PUNCT ["."|"?"|"!"];
define Char \[WS|PUNCT];
define Word Char+;
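! Note (editorial sketch): Char is the complement of WS and PUNCT, so a
! Word is any non-empty run of characters that are neither whitespace nor
! sentence-final punctuation. A contraction such as "they're" is therefore
! still a single Word at this stage; the final rule below splits off "'re".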
! Compose token boundaries
define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
! Compose whitespace removal
[WS+ @-> 0] .o.
! Compose sentence ends
[[PUNCT+] @-> ... TB \/ TB _ ];
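! Illustrative trace (a sketch, not a verified run): on the input "Go. Stop!"
! the three composed rules yield, step by step,
!   Go<TB>.<TB> Stop<TB>!<TB>            boundaries after words and punctuation
!   Go<TB>.<TB>Stop<TB>!<TB>             whitespace removed
!   Go<TB>.<TB><TB>Stop<TB>!<TB><TB>     doubled boundary after sentence-final punctuation
! where <TB> abbreviates the @_TOKEN_BOUND_@ marker.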
read regex Tokenizer .o. [{'re} ->@ TB ... ];
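! The final composition above additionally splits the clitic "'re"
! (as in "they're") off its host word by inserting a boundary before it.
!
! Minimal usage sketch (assumption: running in the interactive foma shell,
! where "read regex" has pushed the composed transducer onto the stack):
!
!   apply down> They're here.
!
! expected output, with <TB> again abbreviating @_TOKEN_BOUND_@:
!
!   They<TB>'re<TB>here<TB>.<TB><TB>
!
! i.e. the tokens "They", "'re", "here", and ".", followed by the doubled
! boundary marking the sentence end (illustrative, not a verified run).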