# blob: 8f94f440a4f8f9c6f74cbd0b301396eae4308060 [file] [log] [blame]
use strict;
use warnings;
use Test::More;
use Data::Dumper;
use JSON::XS;
# Real-corpus tests are skipped entirely when SKIP_REAL is set in the environment.
plan(skip_all => 'Skip real tests') if $ENV{SKIP_REAL};
use utf8;
use lib 'lib', '../lib';
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
# Check that every module exercised below can be loaded.
for my $module (qw/
  KorAP::XML::Krill
  KorAP::XML::Meta::I5
  KorAP::XML::Annotation::NKJP::NamedEntities
/) {
  use_ok($module);
}
# First sample text: KOT, loaded without an explicit language option.
my $path = catdir(dirname(__FILE__), qw/corpus NKJP NKJP KOT/);

my $doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigle accessors, checked in the order text, document, corpus.
for my $sigle (
  [text_sigle   => 'NKJP/NKJP/KOT', 'Correct text sigle'],
  [doc_sigle    => 'NKJP/NKJP',     'Correct document sigle'],
  [corpus_sigle => 'NKJP',          'Correct corpus sigle'],
) {
  my ($accessor, $expected, $label) = @{$sigle};
  is($doc->$accessor, $expected, $label);
}

# Without a language option the corpus title is expected in Polish.
my $meta = $doc->meta;
is(
  $meta->{T_title},
  'TEI P5 encoded version of sample(s) of "Kot"',
  'Title'
);
is(
  $meta->{T_corpus_title},
  'Narodowy Korpus Języka Polskiego -- podkorpus zawierający 1 milion słów',
  'Title'
);
# Load the same text again, this time passing lang => 'en'.
$doc = KorAP::XML::Krill->new(path => "$path/", lang => 'en');
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

$meta = $doc->meta;
is(
  $meta->{T_title},
  'TEI P5 encoded version of sample(s) of "Kot"',
  'Title'
);
is(
  $meta->{T_corpus_title},
  'National Corpus of Polish -- the 1 million word subcorpus',
  'Language sensitive Title'
);

# Metadata fields that must be false (unset or empty) for this text.
for my $absent (
  [T_sub_title     => 'SubTitle'],
  [T_author        => 'Author'],
  [A_editor        => 'Editor'],
  [S_pub_place     => 'PubPlace'],
  [A_publisher     => 'Publisher'],
  [S_text_type     => 'No Text Type'],
  [S_text_type_art => 'No Text Type Art'],
  [S_text_type_ref => 'No Text Type Ref'],
  [S_text_domain   => 'No Text Domain'],
  [S_text_column   => 'No Text Column'],
) {
  my ($key, $label) = @{$absent};
  ok(!$meta->{$key}, $label);
}
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer handed to the tokenizer as the base tokenization;
# reused for the second text further down.
my ($token_base_foundry, $token_base_layer) = (qw/nkjp Morpho/);

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens',
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through JSON so the checks run against the serialized form.
my $output = decode_json( $tokens->to_json );

# Spot checks on individual terms; each description names the inspected
# position so a failure can be located without reading the source.
is($output->{data}->{stream}->[0]->[0], '-:tokens$<i>43', 'KOT: -:tokens term at stream[0][0]');
is($output->{data}->{stream}->[0]->[3], 'i:nie', 'KOT: i: term at stream[0][3]');
is($output->{data}->{stream}->[1]->[2], 's:zdążyła', 'KOT: s: term at stream[1][2]');
## Base
# Add the structure and morphology annotations to the KOT token stream.
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add Structure');
ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');

$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho', 'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens', 'layerInfos');

# Join all terms at stream position 7 so each one can be matched by regex.
my $token = join('||', @{$output->{data}->{stream}->[7]});

like($token, qr!<>:dereko\/s:seg\$<b>64!, 'KOT stream[7]: dereko/s:seg span');

# NOTE(review): the i:, nkjp/l: and s: patterns below restore a ":s" sequence
# that appeared to be missing ("iadu", "nkjp/lad", "sadu"), following the term
# scheme asserted for the KolakowskiOco text in this file. Confirm the word
# form against the KOT corpus data.
like($token, qr!i:sadu!, 'KOT stream[7]: i: term');
like($token, qr!nkjp\/l:sad!, 'KOT stream[7]: nkjp/l term');
like($token, qr!nkjp\/m:sg:gen:m3!, 'KOT stream[7]: nkjp/m term');
like($token, qr!nkjp\/p:subst!, 'KOT stream[7]: nkjp/p term');
like($token, qr!s:sadu!, 'KOT stream[7]: s: term');
# Second sample text: KolakowskiOco
$path = catdir(dirname(__FILE__), qw/corpus NKJP NKJP KolakowskiOco/);

$doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigle accessors, checked in the order text, document, corpus.
for my $sigle (
  [text_sigle   => 'NKJP/NKJP/KolakowskiOco', 'Correct text sigle'],
  [doc_sigle    => 'NKJP/NKJP',               'Correct document sigle'],
  [corpus_sigle => 'NKJP',                    'Correct corpus sigle'],
) {
  my ($accessor, $expected, $label) = @{$sigle};
  is($doc->$accessor, $expected, $label);
}

$meta = $doc->meta;
is(
  $meta->{T_title},
  'TEI P5 encoded version of sample(s) of "O co nas pytają wielcy filozofowie. Seria 3 "',
  'Title'
);

# Metadata fields that must be false (unset or empty) for this text.
for my $absent (
  [T_sub_title     => 'SubTitle'],
  [T_author        => 'Author'],
  [A_editor        => 'Editor'],
  [S_pub_place     => 'PubPlace'],
  [A_publisher     => 'Publisher'],
  [S_text_type     => 'No Text Type'],
  [S_text_type_art => 'No Text Type Art'],
  [S_text_type_ref => 'No Text Type Ref'],
  [S_text_domain   => 'No Text Domain'],
  [S_text_column   => 'No Text Column'],
) {
  my ($key, $label) = @{$absent};
  ok(!$meta->{$key}, $label);
}
# Get tokenization
# Fresh tokenizer for the KolakowskiOco document, using the same base
# foundry and layer as for the first text.
$tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through JSON so the checks run against the serialized form.
$output = decode_json( $tokens->to_json );

# Spot checks on individual terms; each description names the inspected
# position so a failure can be located without reading the source.
is($output->{data}->{stream}->[0]->[0], '-:tokens$<i>117', 'KolakowskiOco: -:tokens term at stream[0][0]');
is($output->{data}->{stream}->[0]->[3], 'i:czy', 'KolakowskiOco: i: term at stream[0][3]');
is($output->{data}->{stream}->[1]->[2], 's:zdarza', 'KolakowskiOco: s: term at stream[1][2]');
## Base
# Add structure, morphology and named-entity annotations to the token stream.
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add Structure');
ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');
ok($tokens->add('NKJP', 'NamedEntities'), 'Add NamedEntities');

$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho nkjp/namedentities', 'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ne=tokens nkjp/p=tokens', 'layerInfos');

# Join all terms at stream position 5 so each one can be matched by regex.
$token = join('||', @{$output->{data}->{stream}->[5]});

like($token, qr!<>:dereko/s:seg\$<b>64<i>23<i>28<i>6<b>4<s>1!, 'KolakowskiOco stream[5]: dereko/s:seg span');
like($token, qr!_5\$<i>23<i>28!, 'KolakowskiOco stream[5]: _5 term');
like($token, qr!i:takie!, 'KolakowskiOco stream[5]: i: term');
like($token, qr!nkjp/l:taki!, 'KolakowskiOco stream[5]: nkjp/l term');
like($token, qr!nkjp/m:sg:nom:n:pos!, 'KolakowskiOco stream[5]: nkjp/m term');
like($token, qr!nkjp/p:adj!, 'KolakowskiOco stream[5]: nkjp/p term');
like($token, qr!s:takie!, 'KolakowskiOco stream[5]: s: term');

# Stream position 67 additionally carries span attributes and a named-entity term.
$token = join('||', @{$output->{data}->{stream}->[67]});

like($token, qr!<>:dereko/s:seg\$<b>64<i>464<i>475<i>68<b>4<s>1!, 'KolakowskiOco stream[67]: dereko/s:seg span');
like($token, qr!\@:dereko\/s:corresp:ann_segmentation\.xml\\#segm_2\.2-seg\$<b>17<s>1<i>68!, 'KolakowskiOco stream[67]: corresp attribute');
like($token, qr!\@:dereko\/s:id:morph_2\.2-seg\$<b>17<s>1<i>68!, 'KolakowskiOco stream[67]: id attribute');
like($token, qr!_67\$<i>464<i>475!, 'KolakowskiOco stream[67]: _67 term');
like($token, qr!i:kierkegaard!, 'KolakowskiOco stream[67]: i: term');
like($token, qr!nkjp/l:Kierkegaard!, 'KolakowskiOco stream[67]: nkjp/l term');
like($token, qr!nkjp/m:sg:nom:m1!, 'KolakowskiOco stream[67]: nkjp/m term');
like($token, qr!nkjp/ne:persName:surname!, 'KolakowskiOco stream[67]: nkjp/ne term');
like($token, qr!nkjp/p:subst!, 'KolakowskiOco stream[67]: nkjp/p term');
like($token, qr!s:Kierkegaard!, 'KolakowskiOco stream[67]: s: term');

done_testing;
__END__