# blob: 6895c19756ae43ec8864ef1bb2921f6898a3992e (source-browser header, not part of the test)
# Pragmas first; utf8 is needed because expected strings further down
# contain non-ASCII literals.
use strict;
use warnings;
use utf8;

# Test tooling and helpers.
use Test::More;
use Data::Dumper;
use JSON::XS;
use Benchmark qw/:hireswallclock/;

# Make the library under test loadable from the usual checkout locations.
use lib 'lib', '../lib';

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

# Allow the whole script to be skipped via the environment.
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};

# Timestamp taken at script start; nothing in this script reads it back.
my $t = Benchmark->new;

use_ok('KorAP::XML::Krill');

# This will check Gingko files

# New
# ATZ07/JAN/00001
# The fixture lives in the corpus directory next to this test file.
my $path = catdir(dirname(__FILE__), qw(corpus Gingko ATZ07 JAN 00001));

# Construct the document with meta_type 'Gingko' and parse it.
my $doc = KorAP::XML::Krill->new(
  path      => "$path/",
  meta_type => 'Gingko'
);
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigle accessors, checked in a fixed order: [ method, expected, test name ].
for my $sigle_check (
  [ text_sigle   => 'ATZ07/JAN/00001', 'Correct text sigle' ],
  [ doc_sigle    => 'ATZ07/JAN',       'Correct document sigle' ],
  [ corpus_sigle => 'ATZ07',           'Correct corpus sigle' ],
) {
  my ($accessor, $expected, $label) = @$sigle_check;
  is($doc->$accessor, $expected, $label);
}

# Metadata of the parsed document. Fields expected to be absent for this
# text are asserted with ok(!...).
my $meta = $doc->meta;

# Text level
is($meta->{T_title}, 'Ein neues Energiemanagement-Konzept für das elektrische Bordnetz', 'Title');
is($meta->{S_pub_place}, 'Wiesbaden', 'PubPlace');
is($meta->{D_pub_date}, '20070000', 'Publication Date');
ok(!$meta->{T_sub_title}, 'SubTitle');
is($meta->{T_author}, 'Theuerkauf, Heinz; Schmidt, Matthias', 'Author');

is($meta->{A_publisher}, 'Springer Fachmedien GmbH', 'Publisher');
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_translator}, 'Translator');
is($meta->{S_text_type}, 'Zeitschrift: Fachzeitschrift', 'Correct Text Type');
is($meta->{S_text_type_art}, 'Fachartikel', 'Correct Text Type Art');
is($meta->{S_text_type_ref}, 'Fachzeitschrift', 'Correct Text Type Ref');
ok(!$meta->{S_text_column}, 'Correct Text Column');
ok(!$meta->{S_text_domain}, 'Correct Text Domain');
ok(!$meta->{D_creation_date}, 'Creation Date');

# NOTE(review): 'pages' is the only key in this section without a type
# prefix (A_/S_/T_/D_). If the real key is prefixed, this check passes
# vacuously - confirm the key name.
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{A_file_edition_statement}, 'File Ed Statement');
ok(!$meta->{A_bibl_edition_statement}, 'Bibl Ed Statement');
is($meta->{A_reference}, 'ATZ - Automobiltechnische Zeitschrift, Januar 2007, Nr. 109(1), S. 10-15 - Theuerkauf, H.; Schmidt, M.: Ein neues Energiemanagement-Konzept für das elektrische Bordnetz (DOI:10.1007/BF03221854)', 'Reference');
is($meta->{S_language}, 'de', 'Language');

# Corpus level
is($meta->{T_corpus_title}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus: ATZ - Automobiltechnische Zeitschrift, 2007', 'Correct Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');
ok(!$meta->{T_corpus_author}, 'Correct Corpus author');
is($meta->{A_corpus_editor}, 'Prof. Dr. Christian Fandrych, Leipzig University; Jun.-Prof. Dr. Antje Heine', 'Correct Corpus editor');

# Document level
is($meta->{T_doc_title}, 'ATZ - Automobiltechnische Zeitschrift, Januar 2007', 'Correct Doc title');
ok(!$meta->{T_doc_sub_title}, 'Correct Doc Sub title');
ok(!$meta->{T_doc_author}, 'Correct Doc author');
ok(!$meta->{A_doc_editor}, 'Correct Doc editor');

# Gingko Metadata
# Every assertion carries a name so a failure identifies the field in the
# TAP output.
is($meta->{S_gingko_genre_main}, 'wissenschaftlich', 'Gingko Genre Main');
is($meta->{S_gingko_genre_sub}, 'wissenschaftlich', 'Gingko Genre Sub');
is($meta->{T_gingko_source}, 'ATZ - Automobiltechnische Zeitschrift', 'Gingko Source');
is($meta->{S_gingko_source_short}, 'ATZ', 'Gingko Source Short');
is($meta->{S_gingko_lemma_corr}, 'no', 'Gingko Lemma Corr');
is($meta->{T_gingko_collection}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Gingko Collection');
is($meta->{S_gingko_collection_short}, 'Gingko', 'Gingko Collection Short');
is($meta->{A_gingko_article_DOI}, 'data:application/x.korap-link;title=doi%3A10.1007%2FBF03221854,https%3A%2F%2Fdoi.org%2F10.1007%2FBF03221854', 'Gingko Article DOI');
is($meta->{I_gingko_text_tokens}, '2191', 'Gingko Text Tokens');
is($meta->{A_internal_link}, 'data:application/x.korap-link;title=IDS%20webpage%20on%20Gingko%20in%20the%20DeReKo%20archive,https%3A%2F%2Fwww.ids-mannheim.de%2Fdigspra%2Fkl%2Fprojekte%2Fkorpora%2Farchiv-1%2Fgingko%2F', 'Gingko Internal Link');
is($meta->{A_external_link}, 'data:application/x.korap-link;title=Gingko-Webseite%20an%20der%20Universit%C3%A4t%20Leipzig,http%3A%2F%2Fwww.uni-leipzig.de%2Fgingko%2F', 'Gingko External Link');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer handed to the tokenizer as its base.
my $token_base_foundry = 'Gingko';
my $token_base_layer   = 'Morpho';

# Get tokenization
# The arguments are kept in an array so they reach the constructor in the
# order written here.
my @tokenizer_args = (
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);
my $tokens = KorAP::XML::Tokenizer->new(@tokenizer_args);

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Serialize once before any annotation layer is added. decode_json dies on
# malformed JSON, so this asserts that the plain tokenization already
# produces decodable output.
ok(decode_json( $tokens->to_json ), 'Base tokenization serializes to JSON');

## Base
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add DeReKo structure');
ok($tokens->add('Gingko', 'Morpho'), 'Add Gingko');

my $output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs gingko gingko/morpho', 'Foundries');

is($output->{data}->{layerInfos}, 'dereko/s=spans gingko/l=tokens gingko/p=tokens', 'layerInfos');

# Each stream entry is flattened to one '||'-joined string so single
# annotations can be matched with a regex.
my $token = join('||', @{$output->{data}->{stream}->[7]});

# Unknown
# (presumably: the lemma of this token is unknown, so no gingko/l
# annotation is expected - confirm against the fixture)
unlike($token, qr!gingko/l!, 'stream[7] has no gingko/l annotation');
like($token, qr!gingko/p:NN!, 'stream[7] has gingko/p:NN');

$token = join('||', @{$output->{data}->{stream}->[9]});

like($token, qr!i:heutige!, 'stream[9] has i:heutige');
like($token, qr!gingko/p:ADJA!, 'stream[9] has gingko/p:ADJA');
like($token, qr!gingko/l:heutig!, 'stream[9] has gingko/l:heutig');

Akron8ad06c42022-01-11 17:07:49 +0100129# Check Gingko meta in Koral
Akron41e6c8b2021-10-14 20:22:18 +0200130my $koral = decode_json($tokens->to_json(0.4));
131
132my $test = 0;
133foreach (@{$koral->{fields}}) {
134 if ($_->{key} eq 'gingkoGenreMain') {
135 is($_->{'type'},'type:string');
136 is($_->{'value'},'wissenschaftlich');
137 $test++;
138 }
139 elsif ($_->{key} eq 'gingkoCollection') {
140 is($_->{'type'},'type:text');
141 is($_->{'value'},'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus');
142 $test++;
143 };
144};
145
146is($test,2);
147
Akronabb36902021-10-11 15:51:06 +0200148done_testing;
149__END__
150