blob: 37029ae24e6c49faae27cf0dcc5ebf95b0e0757b [file] [log] [blame]
Akron57510c12019-01-04 14:58:53 +01001use strict;
2use warnings;
3use Test::More;
4use Data::Dumper;
5use JSON::XS;
6use Log::Log4perl;
7use utf8;
8
9use Benchmark qw/:hireswallclock/;
10
# Wall-clock timer started when the script begins. Nothing in this script
# reads $t; presumably it is kept for ad-hoc timing while debugging.
my $t = Benchmark->new;

# Initialize log4perl object
# (disabled: the configuration below would send TRACE output to STDERR)
#Log::Log4perl->init({
#  'log4perl.rootLogger' => 'TRACE, STDERR',
#  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
#  'log4perl.appender.STDERR.layout' => 'PatternLayout',
#  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
#});


use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

use_ok('KorAP::XML::Krill');

# Fixture directory: ../corpus/AGD-scrambled/DOC/00001, resolved relative
# to the directory that contains this test file.
my $path = catdir(dirname(__FILE__), '..', 'corpus', 'AGD-scrambled', 'DOC', '00001');

# The document is constructed from the fixture path with a trailing slash
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles reported by the parsed document
is($doc->text_sigle, 'AGD/DOC/00001', 'Correct text sigle');
is($doc->doc_sigle, 'AGD/DOC', 'Correct document sigle');
is($doc->corpus_sigle, 'AGD', 'Correct corpus sigle');

# Metadata fields; each check carries its own label so that a failure
# points at the right field.
my $meta = $doc->meta;
is($meta->{T_title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
is($meta->{D_creation_date}, '20181112', 'Creation date');
39
40# TODO:
41# Add source as asset!
42
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer handed to the tokenizer as its token base
my $base_foundry = 'DGD';
my $base_layer   = 'Annot';

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path            => $doc->path,
  doc             => $doc,
  foundry         => $base_foundry,
  layer           => $base_layer,
  name            => 'tokens',
  non_word_tokens => 1,
);

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Serialize once and inspect the decoded structure
my $output = decode_json( $tokens->to_json );

{
  # Everything in this scope looks at the "data" section, except the
  # version check, which stays here to keep the original test order.
  my $data = $output->{data};

  # Only the first 100 characters of the primary text are compared
  my $text_head = substr($data->{text}, 0, 100);
  is($text_head,
     '+++++++++ ku sqn alxv a pwm ▮ xnj nq qtl ohmdgjqp ▮ ▮ ▮ ▮ ▮ fi ▮ sna ▮ alxv hn ▮ zjc ahyx ftwbramn l',
     'Primary Data');

  is($data->{name}, 'tokens', 'tokenName');
  is($data->{tokenSource}, 'dgd#annot', 'tokenSource');

  is($output->{version}, '0.03', 'version');

  # No annotation layers have been added yet, so both fields are empty
  is($data->{foundries}, '', 'Foundries');
  is($data->{layerInfos}, '', 'layerInfos');

  # Element 2 of stream positions 1..3 is compared with these terms
  my @surface_terms = ('s:ku', 's:sqn', 's:alxv');
  for my $pos (1 .. @surface_terms) {
    is($data->{stream}->[$pos]->[2], $surface_terms[$pos - 1], 'data');
  }
}

# Top-level fields of the serialization: [key, expected value, test label]
for my $check (
  [textSigle   => 'AGD/DOC/00001',                 'Correct text sigle'],
  [docSigle    => 'AGD/DOC',                       'Correct document sigle'],
  [corpusSigle => 'AGD',                           'Correct corpus sigle'],
  [title       => 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title'],
) {
  my ($key, $expected, $label) = @$check;
  is($output->{$key}, $expected, $label);
}
81
## DeReKo
# Add the DeReKo/Structure layer, then serialize again so the checks
# below see the updated foundry list and stream.
$tokens->add('DeReKo', 'Structure');

$output = decode_json( $tokens->to_json );

is($output->{data}->{foundries},
   'dereko dereko/structure',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans', 'layerInfos');

# Flatten the terms at stream position 0 into one string for matching
my $first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr!<>:dereko/s:text!,
     'Stream position 0 has the dereko/s:text span');
94
## DGD
# Add the DGD/Morpho layer on top of the layers added so far and
# serialize again.
$tokens->add('DGD', 'Morpho');

$output = decode_json( $tokens->to_json );
is($output->{data}->{foundries},
   'dereko dereko/structure dgd dgd/morpho',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans dgd/l=tokens dgd/p=tokens dgd/para=tokens',
   'layerInfos');

# Flatten the terms at stream position 3 (the fourth entry, since the
# stream is indexed from 0) into one string for matching.
my $alxv_token = join('||', @{$output->{data}->{stream}->[3]});
like($alxv_token, qr!dgd/l:alui!, 'Stream position 3 has dgd/l:alui');
like($alxv_token, qr!dgd/p:VMGWY!, 'Stream position 3 has dgd/p:VMGWY');
like($alxv_token, qr!i:alxv!, 'Stream position 3 has i:alxv');
like($alxv_token, qr!s:alxv!, 'Stream position 3 has s:alxv');

# TODO:
#   Check sentences!
#   Check paragraphs!



done_testing;
118__END__