| Akron | 88d063a | 2022-03-21 15:10:01 +0100 | [diff] [blame^] | 1 | use strict; | 
|  | 2 | use warnings; | 
|  | 3 | use Test::More; | 
|  | 4 | use Data::Dumper; | 
|  | 5 | use JSON::XS; | 
|  | 6 |  | 
# Skip the whole file when SKIP_REAL is set in the environment.
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};
|  | 10 |  | 
|  | 11 | use utf8; | 
|  | 12 | use lib 'lib', '../lib'; | 
|  | 13 |  | 
|  | 14 | use File::Basename 'dirname'; | 
|  | 15 | use File::Spec::Functions 'catdir'; | 
|  | 16 |  | 
# The module under test has to load before anything else is attempted.
use_ok('KorAP::XML::Krill');

# Fixture directory, resolved relative to this test file.
my $path = catdir(dirname(__FILE__), qw(corpus NKJP NKJP KOT));

# Build the document object from the fixture directory and parse it;
# $doc is reused by the checks further down in this file.
my $doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Each entry: accessor name, expected sigle, test description.
my @sigle_checks = (
    [text_sigle   => 'NKJP/NKJP/KOT', 'Correct text sigle'],
    [doc_sigle    => 'NKJP/NKJP',     'Correct document sigle'],
    [corpus_sigle => 'NKJP',          'Correct corpus sigle'],
);

for my $check (@sigle_checks) {
    my ($accessor, $expected, $label) = @{$check};
    is($doc->$accessor, $expected, $label);
}
|  | 27 |  | 
# Metadata as exposed by the parsed document.
my $meta = $doc->meta;

# The title is the only metadata value this test expects to be set.
is(
    $meta->{T_title},
    'TEI P5 encoded version of sample(s) of "Kot"',
    'Title'
);

# Fields that must be false (missing or empty), paired with their
# test descriptions. Kept as an ordered list so TAP output order is stable.
my @unset_fields = (
    [T_sub_title     => 'SubTitle'],
    [T_author        => 'Author'],
    [A_editor        => 'Editor'],
    [S_pub_place     => 'PubPlace'],
    [A_publisher     => 'Publisher'],
    [S_text_type     => 'No Text Type'],
    [S_text_type_art => 'No Text Type Art'],
    [S_text_type_ref => 'No Text Type Ref'],
    [S_text_domain   => 'No Text Domain'],
    [S_text_column   => 'No Text Column'],
);

for my $entry (@unset_fields) {
    my ($field, $label) = @{$entry};
    ok(!$meta->{$field}, $label);
}
|  | 42 |  | 
|  | 43 |  | 
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer that the base tokenization is read from.
my ($token_base_foundry, $token_base_layer) = qw(nkjp Morpho);

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through the JSON serialization so the checks below run
# against the decoded structure rather than the tokenizer object.
my $output = decode_json($tokens->to_json);

# Spot-check single terms in the token stream. Each assertion carries a
# distinct description so a failure can be located from the TAP output.
is($output->{data}->{stream}->[0]->[0], '-:tokens$<i>43',
   'Token 0, term 0 of the stream');
is($output->{data}->{stream}->[0]->[3], 'i:nie',
   'Token 0, term 3 of the stream');
is($output->{data}->{stream}->[1]->[2], 's:zdążyła',
   'Token 1, term 2 of the stream');
|  | 65 |  | 
## Base
# Add the structural and morphological annotation layers. The
# descriptions name the foundry/layer that is actually being added.
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'),
   'Add DeReKo#Structure (base_sentences_paragraphs)');
ok($tokens->add('NKJP', 'Morpho'), 'Add NKJP#Morpho');

$output = $tokens->to_data;

is($output->{data}->{foundries},
   'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho',
   'Foundries');

is($output->{data}->{layerInfos},
   'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens',
   'layerInfos');

# Flatten all terms of stream entry 7 into one string so that each
# expected term can be matched with a single regular expression.
my $token = join('||', @{$output->{data}->{stream}->[7]});

# "\$" stays escaped: an unescaped "$<" inside qr// would interpolate
# Perl's real-UID variable instead of matching the literal text.
like($token, qr{<>:dereko/s:seg\$<b>64}, 'Token 7: dereko/s seg span');
like($token, qr{i:ładu},                 'Token 7: i: term');
like($token, qr{nkjp/l:ład},             'Token 7: nkjp/l term');
like($token, qr{nkjp/m:sg:gen:m3},       'Token 7: nkjp/m term');
like($token, qr{nkjp/p:subst},           'Token 7: nkjp/p term');
like($token, qr{s:ładu},                 'Token 7: s: term');

done_testing;
|  | 87 | __END__ | 
|  | 88 |  |