blob: da135e5a19c8ae9937175a594bef6597efbca0ee [file] [log] [blame]
Akroned9baf02019-01-22 17:03:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use utf8;
5use Test::More;
6use JSON::XS;
Akron263274c2019-02-07 09:48:30 +01007use Data::Dumper;
Akroned9baf02019-01-22 17:03:25 +01008
9use File::Basename 'dirname';
10use File::Spec::Functions 'catdir';
11
# Convert a serialized token string into a lookup hash.
#
# If the whole string has the shape "[(<digits>-<digits>)...]", the
# bracketed wrapper and the parenthesized number pair are stripped first.
# The remainder is split on "|" and every resulting term becomes a key
# with the value 1.
#
# Parameter: the string to convert, e.g. "[(0-3)s:Der|i:der]".
# Returns:   a hash reference mapping each term to 1 (empty for undefined
#            or empty input).
sub _t2h {
  my $string = shift;

  # Nothing to split: avoid "uninitialized value" warnings
  return {} unless defined $string;

  # Strip the "[(from-to)" prefix and the "]" suffix; a string that does
  # not match is left as is and split unchanged.
  $string =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/;

  my %hash;
  for my $term (split(qr!\|!, $string)) {
    $hash{$term} = 1;
  }

  return \%hash;
}
21
# Load the test document and check its basic properties.
use_ok('KorAP::XML::Krill');

# Path segments are passed separately so that catdir() supplies the
# directory separators itself.
my $path = catdir(dirname(__FILE__), 'corpus', 'WPD', '00001');
ok(my $doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');

# The document is expected to report its path with a trailing slash
like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
is($doc->text_sigle, 'WPD/AAA/00001', 'ID');
29
30
# Get tokens
use_ok('KorAP::XML::Tokenizer');

# Get tokenization
ok(my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens'
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# Without non_word_tokens, "Vokal" at position 12 is directly followed
# by "Der" at position 13
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/, 'Position 12 contains s:Vokal');
like($tokens->stream->pos(13)->to_string, qr/s:Der/, 'Position 13 contains s:Der');
46
47
# Get tokenization with non word tokens
ok($tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens',
  non_word_tokens => 1
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# With non_word_tokens set, a "." entry is expected between "Vokal" and
# "Der", so "Der" is found at position 14 instead of 13
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/, 'Position 12 contains s:Vokal');
like($tokens->stream->pos(13)->to_string, qr/s:\./, 'Position 13 contains s:.');
like($tokens->stream->pos(14)->to_string, qr/s:Der/, 'Position 14 contains s:Der');
62
63
# Serialization without an argument: metadata is read from top-level keys
my $json = decode_json($tokens->to_json);
is($json->{docSigle}, 'WPD/AAA', 'DocSigle old');
is($json->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author old');

# Serialization with argument 0.4: metadata is read from the "fields" list.
# NOTE(review): 0.4 presumably selects a serialization version - confirm
# against KorAP::XML::Tokenizer's to_json.
$json = decode_json($tokens->to_json(0.4));
is($json->{fields}->[0]->{key}, 'corpusSigle', 'Field 0 key');
is($json->{fields}->[0]->{value}, 'WPD', 'Field 0 value');
is($json->{fields}->[7]->{key}, 'creationDate', 'Field 7 key');
is($json->{fields}->[7]->{value}, '2005', 'Field 7 value');

# Token data of the same serialization
is($json->{data}->{name}, 'tokens', 'Data name');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Data token source');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>1068', 'First term at stream position 0');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>6083<i>1068<b>0', 'Second term at stream position 0');

done_testing;

__END__