# Real-corpus test: loads the Gingko sample text ATZ07/JAN/00001, checks its
# sigles and metadata, then tokenizes it and checks the annotated token stream.
#
# NOTE(review): provenance recovered from repository-viewer residue that was
# fused into this file (blob 5edd877459b94a675a0fecadbd5f7350b58a147b; blame
# entry "Akron abb3690 2021-10-11 15:51:06 +0200") - confirm and drop if this
# header is not wanted in the test itself.
use strict;
use warnings;
use Test::More;
use Data::Dumper;
use JSON::XS;

# Real-corpus tests can be switched off through the environment;
# plan skip_all ends the script immediately.
if ($ENV{SKIP_REAL}) {
  plan skip_all => 'Skip real tests';
}

use Benchmark qw/:hireswallclock/;

# Start timestamp; nothing in this file reads it back.
my $t = Benchmark->new;

# The expected titles below contain non-ASCII characters.
use utf8;
use lib 'lib', '../lib';

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

use_ok('KorAP::XML::Krill');

# This will Check Gingko-Files

# New
# ATZ07/JAN/00001
my $path = catdir(dirname(__FILE__), 'corpus','Gingko', 'ATZ07','JAN','00001');

ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles
is($doc->text_sigle, 'ATZ07/JAN/00001', 'Correct text sigle');
is($doc->doc_sigle, 'ATZ07/JAN', 'Correct document sigle');
is($doc->corpus_sigle, 'ATZ07', 'Correct corpus sigle');

# Text level metadata
my $meta = $doc->meta;
is($meta->{T_title}, 'Ein neues Energiemanagement-Konzept für das elektrische Bordnetz', 'Title');
is($meta->{S_pub_place}, 'Wiesbaden', 'PubPlace');
# This checks the publication date; the creation date is checked further down.
is($meta->{D_pub_date}, '20070000', 'Publication Date');
ok(!$meta->{T_sub_title}, 'SubTitle');
is($meta->{T_author}, 'Theuerkauf, Heinz; Schmidt, Matthias', 'Author');

is($meta->{A_publisher}, 'Springer Fachmedien GmbH', 'Publisher');
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_translator}, 'Translator');
is($meta->{S_text_type}, 'Zeitschrift: Fachzeitschrift', 'Correct Text Type');
is($meta->{S_text_type_art}, 'Fachartikel', 'Correct Text Type Art');
is($meta->{S_text_type_ref}, 'Fachzeitschrift', 'Correct Text Type Ref');
ok(!$meta->{S_text_column}, 'Correct Text Column');
ok(!$meta->{S_text_domain}, 'Correct Text Domain');
ok(!$meta->{D_creation_date}, 'Creation Date');

ok(!$meta->{pages}, 'Pages');
ok(!$meta->{A_file_edition_statement}, 'File Ed Statement');
ok(!$meta->{A_bibl_edition_statement}, 'Bibl Ed Statement');
is($meta->{A_reference}, 'ATZ - Automobiltechnische Zeitschrift, Januar 2007, Nr.109, S. 10-15 - Theuerkauf, H.; Schmidt, M.: Ein neues Energiemanagement-Konzept für das elektrische Bordnetz', 'Reference');
is($meta->{S_language}, 'de', 'Language');

# Corpus level metadata
is($meta->{T_corpus_title}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');
ok(!$meta->{T_corpus_author}, 'Correct Corpus author');
is($meta->{A_corpus_editor}, 'Christian Fandrych', 'Correct Corpus editor');

# Document level metadata
is($meta->{T_doc_title}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Doc title');
ok(!$meta->{T_doc_sub_title}, 'Correct Doc Sub title');
ok(!$meta->{T_doc_author}, 'Correct Doc author');
is($meta->{A_doc_editor}, 'Prof. Dr. Christian Fandrych, Leipzig University', 'Correct Doc editor');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

my ($token_base_foundry, $token_base_layer) = (qw/Gingko Morpho/);

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# The decoded structure is replaced below before it is inspected; this call
# only exercises the JSON serialization (decode_json dies on malformed JSON).
my $output = decode_json( $tokens->to_json );

## Base
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add DeReKo structure');
ok($tokens->add('Gingko', 'Morpho'), 'Add Gingko');

$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs gingko gingko/morpho', 'Foundries');

is($output->{data}->{layerInfos}, 'dereko/s=spans gingko/l=tokens gingko/p=tokens', 'layerInfos');

# Each stream entry is flattened into one '||'-separated string for matching.
my $token = join('||', @{$output->{data}->{stream}->[7]});

# NOTE(review): the part-of-speech patterns below are spelled 'ginkgo/p'
# while the foundry, the layerInfos and the lemma patterns use 'gingko'.
# Left unchanged because the annotator's real output is not visible from
# this file - confirm against the Gingko morpho annotation module.

# Unknown
unlike($token, qr!gingko/l!, 'data');
like($token, qr!ginkgo/p:NN!, 'data');

$token = join('||', @{$output->{data}->{stream}->[9]});

like($token, qr!i:heutige!, 'data');
like($token, qr!ginkgo/p:ADJA!, 'data');
like($token, qr!gingko/l:heutig!, 'data');

done_testing;
__END__