#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use Test::More;
use JSON::XS;

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

# Turn the string form of a token into a lookup hash of its terms.
#
# Takes a single string. If the whole string looks like "[(<digits>-<digits>)...]",
# the bracketed wrapper and the "(from-to)" prefix are removed first; any other
# string is used unchanged. The remainder is split on "|" and every piece
# becomes a key with the value 1.
#
# Returns a hash reference (empty for an empty string).
sub _t2h {
  my $string = shift;

  # Keep only what stands between "[(n-m)" and the closing "]".
  $string =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/;

  # One key per "|"-separated term.
  my %hash = map { $_ => 1 } split(qr!\|!, $string);

  return \%hash;
}

# Load the document class at runtime; the load itself counts as a test.
use_ok('KorAP::XML::Krill');

# The fixture document is addressed relative to this test file.
my $path = catdir(dirname(__FILE__), 'corpus/WPD/00001');
ok(my $doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');

# The path reported by the document must be the given path plus a trailing slash.
like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
is($doc->text_sigle, 'WPD/AAA/00001', 'ID');


# Get tokens
use_ok('KorAP::XML::Tokenizer');

# Get tokenization
ok(my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens'
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# With the default options "Der" is expected directly after "Vokal".
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/, 'Token 12 is Vokal');
like($tokens->stream->pos(13)->to_string, qr/s:Der/, 'Token 13 is Der');


# Get tokenization with non word tokens
ok($tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens',
  non_word_tokens => 1
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# With non_word_tokens set, a "." token is expected between "Vokal" and "Der",
# which moves "Der" to position 14.
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/, 'Token 12 is Vokal');
like($tokens->stream->pos(13)->to_string, qr/s:\./, 'Token 13 is the full stop');
like($tokens->stream->pos(14)->to_string, qr/s:Der/, 'Token 14 is Der');


# Serialization without an argument: metadata is read from top-level keys.
my $json = decode_json $tokens->to_json;
is($json->{docSigle}, 'WPD/AAA', 'DocSigle old');
is($json->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author old');

# Serialization with 0.4 (presumably a format version - confirm against
# to_json): metadata is read from a list of key/value entries under "fields".
$json = decode_json $tokens->to_json(0.4);
is($json->{fields}->[0]->{key}, 'corpusSigle', 'Field 0 key');
is($json->{fields}->[0]->{value}, 'WPD', 'Field 0 value');
is($json->{fields}->[6]->{key}, 'creationDate', 'Field 6 key');
is($json->{fields}->[6]->{value}, '2005', 'Field 6 value');

done_testing;

__END__