blob: 4c4ddaa064bcecd10726a6ed521bea566f9309f6 [file] [log] [blame]
Akroned9baf02019-01-22 17:03:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use utf8;
5use Test::More;
6use JSON::XS;
7
8use File::Basename 'dirname';
9use File::Spec::Functions 'catdir';
10
# Convert a serialized token string into a set-like hash reference.
#
# The input is a '|'-separated list of terms. It may be wrapped as
# "[(N-M)term|term|...]", where N and M are digit sequences; in that
# case the brackets and the leading "(N-M)" part are removed before
# the string is split.
#
# Parameters:
#   $string - the serialized token; an undefined value is treated as
#             an empty list of terms
#
# Returns:
#   A hash reference with one key per term, each mapped to 1.
sub _t2h {
  my ($string) = @_;

  # Nothing to split: hand back an empty set instead of triggering
  # "uninitialized value" warnings further down.
  return {} unless defined $string;

  # Strip the "[(N-M)" prefix and the closing "]" when present;
  # strings without that wrapper are left untouched.
  $string =~ s/^\[\(\d+-\d+\)(.+?)\]$/$1/;

  my %term_set = map { $_ => 1 } split /\|/, $string;
  return \%term_set;
}
20
# Make sure the module under test can be loaded
use_ok('KorAP::XML::Krill');

# Locate the WPD/00001 fixture relative to this test file. The path is
# handed to catdir() one component at a time so that File::Spec joins it
# itself instead of receiving a pre-joined, slash-separated string.
my $path = catdir(dirname(__FILE__), 'corpus', 'WPD', '00001');
ok(my $doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');

# The document is expected to report its path with a trailing slash
like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
is($doc->text_sigle, 'WPD/AAA/00001', 'ID');
28
29
# Get tokens
use_ok('KorAP::XML::Tokenizer');

# Get tokenization (default settings, i.e. without the
# non_word_tokens option)
ok(my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens'
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# In this mode "Der" is expected directly after "Vokal"
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/,
     'Token at position 12 is "Vokal"');
like($tokens->stream->pos(13)->to_string, qr/s:Der/,
     'Token at position 13 is "Der"');
45
46
# Get tokenization with non word tokens
ok($tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens',
  non_word_tokens => 1
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# With non_word_tokens set, a "." token is expected between "Vokal"
# and "Der", which moves "Der" from position 13 to position 14
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/,
     'Token at position 12 is "Vokal" (non-word tokens enabled)');
like($tokens->stream->pos(13)->to_string, qr/s:\./,
     'Token at position 13 is the non-word token "."');
like($tokens->stream->pos(14)->to_string, qr/s:Der/,
     'Token at position 14 is "Der" (non-word tokens enabled)');


done_testing;

__END__