blob: b254d5ed3c88932c30f55367be98e49fd78f4fdb [file] [log] [blame]
Akron414ec952020-08-03 15:48:43 +02001#!/usr/bin/env perl
2use strict;
3use warnings;
4use utf8;
5use Test::More;
6use JSON::XS;
7use Data::Dumper;
8
9use File::Basename 'dirname';
10use File::Spec::Functions 'catdir';
11
# Skip this whole test file when the SKIP_REAL environment variable is set
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};
15
# Turn the string form of a token into a lookup set.
#
# A leading "[(<digits>-<digits>)" together with a trailing "]" is removed
# when both are present; the remainder is split on "|" and every piece
# becomes a key with the value 1.
#
# Parameters:
#   $string - the string to split; undef is treated like an empty string
# Returns:
#   a reference to a hash whose keys are the "|"-separated pieces
sub _t2h {
  my ($string) = @_;

  # Nothing to split - also avoids "uninitialized" warnings further down
  return {} unless defined $string;

  # Strip the "[(n-m)" prefix and the closing "]" in a single substitution
  $string =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/;

  my %hash = map { ($_ => 1) } split(qr!\|!, $string);
  return \%hash;
};
25
use_ok('KorAP::XML::Krill');

# The document lives in corpus/WPD/00001 next to this test file
my $path = catdir(dirname(__FILE__), qw/corpus WPD 00001/);
ok(my $doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');

# The reported path is expected to end in the given path plus a trailing slash
like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
is($doc->text_sigle, 'WPD/AAA/00001', 'ID');
33
34
# Get tokens
use_ok('KorAP::XML::Tokenizer');

# Get tokenization
ok(my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => 'OpenNLP',
  layer   => 'Tokens',
  name    => 'tokens'
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# Positions 12 and 13 of the stream hold consecutive surface forms
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/, 'Surface form at position 12');
like($tokens->stream->pos(13)->to_string, qr/s:Der/, 'Surface form at position 13');
50
51
# Get tokenization with non word tokens
ok($tokens = KorAP::XML::Tokenizer->new(
  path            => $doc->path,
  doc             => $doc,
  foundry         => 'OpenNLP',
  layer           => 'Tokens',
  name            => 'tokens',
  non_word_tokens => 1
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

# With non_word_tokens set, a "." occupies position 13 and "Der" moves to 14
like($tokens->stream->pos(12)->to_string, qr/s:Vokal/, 'Surface form at position 12');
like($tokens->stream->pos(13)->to_string, qr/s:\./, 'Non-word token at position 13');
like($tokens->stream->pos(14)->to_string, qr/s:Der/, 'Surface form at position 14');
66
67
# Serialization without an argument exposes metadata as top-level keys
my $json = decode_json($tokens->to_json);
is($json->{docSigle}, 'WPD/AAA', 'DocSigle old');
is($json->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author old');

# Serialization with 0.4 (presumably a format version - TODO confirm)
# exposes metadata as a list of key/value entries under "fields"
$json = decode_json($tokens->to_json(0.4));
is($json->{fields}->[0]->{key}, 'corpusSigle', 'First field key');
is($json->{fields}->[0]->{value}, 'WPD', 'First field value');
is($json->{fields}->[7]->{key}, 'creationDate', 'Eighth field key');
is($json->{fields}->[7]->{value}, '2005', 'Eighth field value');

# Token data: name, source and the first two entries of the first stream position
is($json->{data}->{name}, 'tokens', 'Data name');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>1068', 'First entry at stream position 0');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>6083<i>1068<b>0', 'Second entry at stream position 0');

done_testing;

__END__