! ----------------------------------------------------------------------
! English clitic tokenizer (xfst/foma regular-expression script).
!
! Builds a transducer that appends a token-boundary symbol (TB) after
! every word, clitic and punctuation mark, deletes whitespace, and adds
! a second TB after punctuation runs to mark sentence ends.
! ----------------------------------------------------------------------

! Token boundary marker, written as a single multi-character symbol.
define TB "@_TOKEN_BOUND_@";
# define TB "_";

! Whitespace: space, line feed (U+000A), horizontal tab (U+0009).
define WS [" "|"\u000a"|"\u0009"];

! Punctuation marks recognised by this script.
define PUNCT ["."|"?"|"!"];

! Any single symbol that is neither whitespace, punctuation nor an
! apostrophe.
define Char \[WS|PUNCT|"'"];

! Apostrophe clitics: 'll 'd 've 's 're 'm 'n 't
define Clitic ["'" [{ll}|{d}|{ve}|{s}|{re}|"m"|"n"|"t"]];

# Following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
! Host forms that combine with the negative clitic, including the final
! "n" (e.g. "don", "doesn", "can", "shan", "won", "ain").
define CliticNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}];

! Complete negated form: host + "'t" (e.g. "don't", "isn't").
define CliticNT [CliticNTPart "'" {t}];


! A word is a clitic or a run of Char symbols, except for the bare
! strings in CliticNTPart; complete CliticNT forms are added back.
! NOTE(review): this reading assumes "-" and "|" group left to right,
! i.e. [[[Clitic | Char+] - CliticNTPart] | CliticNT] -- confirm.
! NOTE(review): the subtraction also removes ordinary words that happen
! to equal a CliticNTPart string ("can", "won"), so they cannot match
! Word as a whole -- confirm this is intended.
define Word [ [ Clitic | Char+] - CliticNTPart | CliticNT];

! Compose token boundaries
! (leftmost-longest match; TB is appended after each match.
!  CliticNT is already part of Word, so listing it again is redundant.)
define Tokenizer [[Word|PUNCT|CliticNT] @-> ... TB] .o.
! Compose Whitespace ignorance
! (every whitespace run is deleted)
[WS+ @-> 0] .o.
! Compose sentence ends
! (a punctuation run that directly follows a TB gets a TB appended)
[[PUNCT+] @-> ... TB \/ TB _ ];

! Compile the final network: Tokenizer composed with a rule that puts a
! TB in front of "n't".
! NOTE(review): the context after "\/" has no "_" placeholder; confirm
! that the compiler accepts this and that "\/ TB _" (preceded by TB) or
! "\/ _ TB" (followed by TB) is not what was meant.
read regex Tokenizer .o. [{n't} ->@ TB ... \/ TB ];