#!/usr/bin/env perl
#
# Line-oriented driver for KorAP::XML::TEI::Tokenizer::Aggressive.
#
# Reads UTF-8 encoded lines from STDIN. Each line is split into texts at
# the U+0004 (EOT) character; every text is tokenized on its own and the
# list returned by the tokenizer's `boundaries` method is printed as one
# space-separated line on STDOUT.
use strict;
use warnings;
use FindBin;
use Encode;
BEGIN {
  # Make the library directory two levels above this script loadable.
  unshift @INC, "$FindBin::Bin/../../lib";
};
use KorAP::XML::TEI::Tokenizer::Aggressive;

# Autoflush STDOUT after every print -- presumably so that a process
# driving this script through a pipe receives each result line promptly.
$| = 1;

# Init tokenizer
my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;

# Read lines from STDIN and print boundaries.
# STDIN is read explicitly: the bare `<>` operator switches to the files
# named in @ARGV when there are any, which would not agree with an
# end-of-input test performed on STDIN.
while (defined(my $raw = <STDIN>)) {
  my $line = decode_utf8($raw);

  # A line may carry several texts separated by EOT (\x{04}); a newline
  # directly before or after the separator is dropped with it.
  for my $text (split(/\n?\x{04}\n?/, $line)) {
    $tok->tokenize($text);
    print join(' ', $tok->boundaries), "\n";

    # Clear the tokenizer state before the next text.
    $tok->reset;
  }
};

1;