blob: 84ae9d7841fa91ae899da21bec5eb4c28bcc87c6 [file] [log] [blame]
Akronabb36902021-10-11 15:51:06 +02001use strict;
2use warnings;
3use Test::More;
4use Data::Dumper;
5use JSON::XS;
6
# These tests need the real corpus fixtures; SKIP_REAL in the environment
# skips the whole file.
plan(skip_all => 'Skip real tests') if $ENV{SKIP_REAL};

use Benchmark qw/:hireswallclock/;

# Wallclock reference taken at start-up. Nothing below reads it back.
my $t = Benchmark->new;
14
15use utf8;
16use lib 'lib', '../lib';
17
18use File::Basename 'dirname';
19use File::Spec::Functions 'catdir';
20
use_ok('KorAP::XML::Krill');

# Check a Gingko corpus text: ATZ07/JAN/00001
my $path = catdir(dirname(__FILE__), qw(corpus Gingko ATZ07 JAN 00001));

# Build the document first, then assert on it and on the parse result.
my $doc = KorAP::XML::Krill->new(
    path      => "$path/",
    meta_type => 'Gingko',
);
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles at text, document and corpus level: [accessor, expected, test name]
for my $sigle_check (
    [ text_sigle   => 'ATZ07/JAN/00001', 'Correct text sigle' ],
    [ doc_sigle    => 'ATZ07/JAN',       'Correct document sigle' ],
    [ corpus_sigle => 'ATZ07',           'Correct corpus sigle' ],
) {
    my ($accessor, $expected, $name) = @{$sigle_check};
    is($doc->$accessor, $expected, $name);
}
38
# Metadata extracted from the document header. Fields expected to be present
# are compared with is(); fields expected to be absent are checked with ok(!...).
my $meta = $doc->meta;

# Bibliographic core
is($meta->{T_title}, 'Ein neues Energiemanagement-Konzept für das elektrische Bordnetz', 'Title');
is($meta->{S_pub_place}, 'Wiesbaden', 'PubPlace');
# Named 'Publication Date' so it cannot be confused with the separate
# D_creation_date check further down.
is($meta->{D_pub_date}, '20070000', 'Publication Date');
ok(!$meta->{T_sub_title}, 'SubTitle');
is($meta->{T_author}, 'Theuerkauf, Heinz; Schmidt, Matthias', 'Author');

# Publisher and text classification
is($meta->{A_publisher}, 'Springer Fachmedien GmbH', 'Publisher');
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_translator}, 'Translator');
is($meta->{S_text_type}, 'Zeitschrift: Fachzeitschrift', 'Correct Text Type');
is($meta->{S_text_type_art}, 'Fachartikel', 'Correct Text Type Art');
is($meta->{S_text_type_ref}, 'Fachzeitschrift', 'Correct Text Type Ref');
ok(!$meta->{S_text_column}, 'Correct Text Column');
ok(!$meta->{S_text_domain}, 'Correct Text Domain');
ok(!$meta->{D_creation_date}, 'Creation Date');

# Edition statements, reference and language
# NOTE(review): 'pages' has no type prefix, unlike every other key checked
# here - confirm this is the intended key.
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{A_file_edition_statement}, 'File Ed Statement');
ok(!$meta->{A_bibl_edition_statement}, 'Bibl Ed Statement');
is($meta->{A_reference}, 'ATZ - Automobiltechnische Zeitschrift, Januar 2007, Nr.109, S. 10-15 - Theuerkauf, H.; Schmidt, M.: Ein neues Energiemanagement-Konzept für das elektrische Bordnetz', 'Reference');
is($meta->{S_language}, 'de', 'Language');

# Corpus-level metadata
is($meta->{T_corpus_title}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');
ok(!$meta->{T_corpus_author}, 'Correct Corpus author');
is($meta->{A_corpus_editor}, 'Christian Fandrych', 'Correct Corpus editor');

# Document-level metadata
is($meta->{T_doc_title}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Doc title');
ok(!$meta->{T_doc_sub_title}, 'Correct Doc Sub title');
ok(!$meta->{T_doc_author}, 'Correct Doc author');
is($meta->{A_doc_editor}, 'Prof. Dr. Christian Fandrych, Leipzig University', 'Correct Doc editor');
71
# Gingko-specific metadata fields. Each assertion has its own description so
# a failure can be identified in the TAP output.
is($meta->{S_gingko_genre_main}, 'wissenschaftlich', 'Gingko genre main');
is($meta->{S_gingko_genre_sub}, 'wissenschaftlich', 'Gingko genre sub');
is($meta->{T_gingko_source}, 'ATZ - Automobiltechnische Zeitschrift', 'Gingko source');
is($meta->{S_gingko_source_short}, 'ATZ', 'Gingko source short');
is($meta->{S_gingko_lemma_corr}, 'no', 'Gingko lemma corr');
is($meta->{T_gingko_collection}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Gingko collection');
is($meta->{S_gingko_collection_short}, 'Gingko', 'Gingko collection short');
80
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer the base token stream is read from
my ($base_foundry, $base_layer) = ('Gingko', 'Morpho');

# Constructor arguments are kept as an ordered list so they are passed in
# the same order as written.
my @tokenizer_args = (
    path    => $doc->path,
    doc     => $doc,
    foundry => $base_foundry,
    layer   => $base_layer,
    name    => 'tokens',
);

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(@tokenizer_args);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');
96
# Serialize once before any annotation layer is added. The decoded value is
# overwritten below; decode_json croaks on malformed JSON, so this line still
# exercises to_json at this stage.
my $output = decode_json( $tokens->to_json );

## Base
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add DeReKo structure');
ok($tokens->add('Gingko', 'Morpho'), 'Add Gingko');

# From here on, work with the plain data structure rather than the JSON string.
$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs gingko gingko/morpho', 'Foundries');

is($output->{data}->{layerInfos}, 'dereko/s=spans gingko/l=tokens gingko/p=tokens', 'layerInfos');
108
# Each stream entry is an array of annotation terms; join them with '||'
# so single terms can be matched with a regex.
my $stream = $output->{data}->{stream};

# Entry 7: the lemma is unknown, so only the part-of-speech term is expected.
my $token = join '||', @{ $stream->[7] };
unlike $token, qr!gingko/l!, 'data';
like   $token, qr!gingko/p:NN!, 'data';

# Entry 9: surface form, part of speech and lemma are all expected.
$token = join '||', @{ $stream->[9] };
like $token, qr!i:heutige!, 'data';
like $token, qr!gingko/p:ADJA!, 'data';
like $token, qr!gingko/l:heutig!, 'data';
120
# Check Gingko meta in Koral
my $koral = decode_json($tokens->to_json(0.4));

# Expected type and value for the Gingko fields, keyed by Koral field key.
my %expected_field = (
    gingkoGenreMain => {
        type  => 'type:string',
        value => 'wissenschaftlich',
    },
    gingkoCollection => {
        type  => 'type:text',
        value => 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus',
    },
);

# Walk all fields in order and verify the ones of interest. The counter
# ensures that both fields were actually present in the output.
my $test = 0;
for my $field (@{ $koral->{fields} }) {
    my $expected = $expected_field{ $field->{key} } or next;

    is($field->{type},  $expected->{type},  "Koral type of $field->{key}");
    is($field->{value}, $expected->{value}, "Koral value of $field->{key}");
    $test++;
}

is($test, 2, 'Both Gingko fields found in Koral');

done_testing;
__END__
142