#!/usr/bin/env perl
# Command-line front end for KorAP::XML::TEI::Tokenizer::Aggressive.
# Reads UTF-8 encoded lines from STDIN and, for every text segment found,
# prints the tokenizer's boundaries on one line of STDOUT, joined by spaces.
use strict;
use warnings;
use FindBin;
use Encode;

BEGIN {
    # Make the lib directory two levels above this script's location loadable.
    unshift @INC, "$FindBin::Bin/../../lib";
}

use KorAP::XML::TEI::Tokenizer::Aggressive;

# Autoflush the selected output handle (STDOUT by default), so each result
# line is written out as soon as it is printed.
$| = 1;

# Init tokenizer
my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;

# Read lines from input and return boundaries.
# The loop condition and the read use the same handle (STDIN). Testing
# eof(STDIN) while reading via the null filehandle <> would consult files
# named in @ARGV first, so the test and the read could refer to different
# streams.
while (defined(my $raw = <STDIN>)) {
    my $line = decode_utf8($raw);

    # A line may carry several texts separated by EOT (U+0004); a newline
    # directly before or after the separator belongs to the separator.
    for my $text (split(/\n?\x{04}\n?/, $line)) {
        $tok->tokenize($text);
        print join(' ', $tok->boundaries), "\n";

        # Clear the tokenizer state before the next text.
        $tok->reset;
    }
}

1;