Support for Gingko metadata
Change-Id: I913444b85000da6be8af05d1e376a5b83e888515
diff --git a/Readme.pod b/Readme.pod
index 89abd5f..3009ba8 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -426,6 +426,26 @@
See the built-in annotation importers as examples.
+=head1 METADATA SUPPORT
+
+L<KorAP::XML::Krill> has built-in importers for some meta data variants
+developed in the KorAP project that are part of the KorAP preprocessing pipeline.
+
+=over 2
+
+=item I5 - Meta data for all I5 files
+
+=item Sgbr - Meta data from the Schreibgebrauch project
+
+=item Gingko - Meta data from the Gingko project in addition to I5
+
+=back
+
+More importers are in preparation.
+New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
+See the built-in meta data importers as examples.
+
+
=head1 About KorAP-XML
KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
diff --git a/lib/KorAP/XML/Meta/Gingko.pm b/lib/KorAP/XML/Meta/Gingko.pm
new file mode 100644
index 0000000..821b0d3
--- /dev/null
+++ b/lib/KorAP/XML/Meta/Gingko.pm
@@ -0,0 +1,60 @@
+package KorAP::XML::Meta::Gingko;
+use KorAP::XML::Meta::Base;
+use KorAP::XML::Meta::I5;
+
+my $squish = \&KorAP::XML::Meta::I5::_squish;
+
+# Parse Gingko-specific meta data on top of the generic I5 meta data.
+# Arguments:
+#   $self - object whose hash entries receive the extracted values
+#   $dom  - DOM node that is queried via at() with CSS selectors
+#   $type - level to parse; only 'text' and 'corpus' add Gingko fields
+#
+# Returns 0 if the I5 parser reports failure, otherwise 1.
+sub parse {
+  my ($self, $dom, $type) = @_;
+
+  # The I5 importer does the base work; give up if it fails
+  return 0 unless KorAP::XML::Meta::I5::parse($self, $dom, $type);
+
+  # Field name => CSS selector, grouped by the node the selector is applied to
+  my (%in_dom, %in_monogr);
+
+  if ($type eq 'text') {
+    %in_dom = (
+      S_gingko_genre_main => 'textClass > classCode[scheme=gingkoGenre.top]',
+      S_gingko_genre_sub  => 'textClass > classCode[scheme=gingkoGenre.sub]',
+      S_gingko_lemma_corr => 'correction'
+    );
+    %in_monogr = (
+      T_gingko_source       => 'h\.title[type=main]',
+      S_gingko_source_short => 'h\.title[type=short]'
+    );
+  }
+  elsif ($type eq 'corpus') {
+    %in_monogr = (
+      T_gingko_collection       => 'biblNote[n=collection]',
+      S_gingko_collection_short => 'biblNote[n=collectionShort]'
+    );
+  };
+
+  # Collect [node, selector map] pairs; monogr is only looked up when needed
+  my @lookups = ([$dom, \%in_dom]);
+  if (%in_monogr && (my $mono = $dom->at('sourceDesc > biblStruct > monogr'))) {
+    push @lookups, [$mono, \%in_monogr];
+  };
+
+  foreach my $lookup (@lookups) {
+    my ($node, $selector_of) = @$lookup;
+    foreach my $field (keys %$selector_of) {
+      my $elem = $node->at($selector_of->{$field}) or next;
+      my $value = $squish->($elem->all_text);
+      $self->{$field} = $value if $value;
+    };
+  };
+
+  # Signal success explicitly instead of leaking the last evaluated value
+  return 1;
+};
+
+1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 2376a5e..b2533bc 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -1431,6 +1431,26 @@
See the built-in annotation importers as examples.
+=head1 METADATA SUPPORT
+
+L<KorAP::XML::Krill> has built-in importers for some meta data variants
+developed in the KorAP project that are part of the KorAP preprocessing pipeline.
+
+=over 2
+
+=item I5 - Meta data for all I5 files
+
+=item Sgbr - Meta data from the Schreibgebrauch project
+
+=item Gingko - Meta data from the Gingko project in addition to I5
+
+=back
+
+More importers are in preparation.
+New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
+See the built-in meta data importers as examples.
+
+
=head1 About KorAP-XML
KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
diff --git a/t/real/gingko.t b/t/real/gingko.t
index 5edd877..35cb8de 100644
--- a/t/real/gingko.t
+++ b/t/real/gingko.t
@@ -26,7 +26,10 @@
# ATZ07/JAN/00001
my $path = catdir(dirname(__FILE__), 'corpus','Gingko', 'ATZ07','JAN','00001');
-ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok(my $doc = KorAP::XML::Krill->new(
+  path      => "$path/",
+  meta_type => 'Gingko',
+), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
is($doc->text_sigle, 'ATZ07/JAN/00001', 'Correct text sigle');
@@ -66,6 +69,15 @@
ok(!$meta->{T_doc_author}, 'Correct Doc author');
is($meta->{A_doc_editor}, 'Prof. Dr. Christian Fandrych, Leipzig University', 'Correct Doc editor');
+# Gingko metadata
+is($meta->{S_gingko_genre_main}, 'wissenschaftlich', 'Correct Gingko main genre');
+is($meta->{S_gingko_genre_sub}, 'wissenschaftlich', 'Correct Gingko sub genre');
+is($meta->{T_gingko_source}, 'ATZ - Automobiltechnische Zeitschrift', 'Correct Gingko source');
+is($meta->{S_gingko_source_short}, 'ATZ', 'Correct Gingko short source');
+is($meta->{S_gingko_lemma_corr}, 'no', 'Correct Gingko lemma correction');
+is($meta->{T_gingko_collection}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Gingko collection');
+is($meta->{S_gingko_collection_short}, 'Gingko', 'Correct Gingko short collection');
+
# Tokenization
use_ok('KorAP::XML::Tokenizer');
@@ -106,6 +118,25 @@
like($token, qr!ginkgo/p:ADJA!, 'data');
like($token, qr!gingko/l:heutig!, 'data');
+# Check Gingko meta in Koral: both fields must occur with the expected type and value
+my $koral = decode_json($tokens->to_json(0.4));
+
+my $test = 0;
+foreach my $field (@{$koral->{fields}}) {
+  if ($field->{key} eq 'gingkoGenreMain') {
+    is($field->{type}, 'type:string', 'Gingko main genre is a string field');
+    is($field->{value}, 'wissenschaftlich', 'Correct Gingko main genre in Koral');
+    $test++;
+  }
+  elsif ($field->{key} eq 'gingkoCollection') {
+    is($field->{type}, 'type:text', 'Gingko collection is a text field');
+    is($field->{value}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Gingko collection in Koral');
+    $test++;
+  };
+};
+
+is($test, 2, 'Both Gingko fields found in Koral');
+
done_testing;
__END__