Started meta parsing for Schreibgebrauch
Change-Id: Ib0f58cfaceff691bc237dfee8a5f957fb3b3391c
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 8aeec55..3e9f82e 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -145,11 +145,12 @@
unshift @header, '/' . catfile(@path, 'header.xml');
pop @path;
};
+
my @type = qw/corpus doc text/;
foreach (@header) {
# Get corpus, doc and text meta data
my $type = shift(@type);
- $self->_parse_meta($_, $type) if -e $_;
+ $self->_parse_meta_i5($_, $type) if -e $_;
};
return 1;
@@ -213,7 +214,7 @@
};
-sub _parse_meta {
+sub _parse_meta_i5 {
my $self = shift;
my $header_xml = shift;
my $type = shift;
@@ -521,88 +522,6 @@
};
-# Don't work that well
-sub _parse_meta_fast {
- my $self = shift;
-
- # my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
- my $file = b($self->path . 'header.xml')->slurp;
-
- my ($meta, $error);
- my $unable = 'Unable to parse document ' . $self->path;
-
- try {
- local $SIG{__WARN__} = sub {
- $error = 1;
- };
- $meta = xml2hash(
- $file,
- text => '#text',
- attr => '-',
- array => ['h.title', 'imprint', 'catRef', 'h.author']
- )->{idsHeader};
- }
- catch {
- $self->log->warn($unable);
- $error = 1;
- };
-
- return if $error;
-
- my $bibl_struct = $meta->{fileDesc}->{sourceDesc}->{biblStruct};
- my $analytic = $bibl_struct->{analytic};
-
- my $titles = $analytic->{'h.title'};
- foreach (@$titles) {
- if ($_->{'-type'} eq 'main') {
- $self->title($_->{'#text'});
- }
- elsif ($_->{'-type'} eq 'sub') {
- $self->sub_title($_->{'#text'});
- };
- };
-
- # Get Author
- if (my $author = $analytic->{'h.author'}) {
- $self->author($author->[0]);
- };
-
- # Get pubDate
- my $date = $bibl_struct->{monogr}->{imprint};
- my ($year, $month, $day) = (0,0,0);
- foreach (@$date) {
- if ($date->{-type} eq 'year') {
- $year = $date->{'#text'};
- }
- elsif ($date->{-type} eq 'month') {
- $month = $date->{'#text'};
- }
- elsif ($date->{-type} eq 'day') {
- $day = $date->{'#text'};
- };
- };
-
- $year = 0 if $year !~ /^\d+$/;
- $month = 0 if $month !~ /^\d+$/;
- $day = 0 if $day !~ /^\d+$/;
-
- $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
- $date .= length($month) == 1 ? '0' . $month : $month;
- $date .= length($day) == 1 ? '0' . $day : $day;
-
- $self->pub_date($date);
-
- # Get textClasses
- my @topic;
- my $textClass = $meta->{profileDesc}->{textClass}->{catRef};
- foreach (@$textClass) {
- my ($ign, @ttopic) = split('\.', $_->{'-target'});
- push(@topic, @ttopic);
- };
- $self->text_class(@topic);
-};
-
-
1;
diff --git a/lib/KorAP/Index/Schreibgebrauch/Lemma.pm b/lib/KorAP/Index/Schreibgebrauch/Lemma.pm
index d3807e2..5bd01d5 100644
--- a/lib/KorAP/Index/Schreibgebrauch/Lemma.pm
+++ b/lib/KorAP/Index/Schreibgebrauch/Lemma.pm
@@ -34,10 +34,14 @@
# warn $found;
unless ($first++) {
- $mtt->add(term => 'sgbr/l:' . $found);
+ $mtt->add(
+ term => 'sgbr/l:' . $found
+ );
}
else {
- $mtt->add(term => 'sgbr/lv:' . $found);
+ $mtt->add(
+ term => 'sgbr/lv:' . $found
+ );
};
};
};
diff --git a/t/index/mate_dependency.t b/t/index/mate_dependency.t
index 1622bad..2228a43 100644
--- a/t/index/mate_dependency.t
+++ b/t/index/mate_dependency.t
@@ -2,7 +2,7 @@
use strict;
use warnings;
use utf8;
-use Test::More; # skip_all => 'Not yet implemented';
+use Test::More skip_all => 'Not yet implemented';
use Scalar::Util qw/weaken/;
use Data::Dumper;
use lib 't/index';
diff --git a/t/sgbr/TEST/BSP/1/sgbr/ana.xml b/t/sgbr/TEST/BSP/1/sgbr/ana.xml
index 987b84d..9a2e798 100644
--- a/t/sgbr/TEST/BSP/1/sgbr/ana.xml
+++ b/t/sgbr/TEST/BSP/1/sgbr/ana.xml
@@ -684,4 +684,4 @@
</fs>
</span>
</spanList>
-</layer>
+</layer>
\ No newline at end of file
diff --git a/t/sgbr/TEST/BSP/1/sgbr/lemma.xml b/t/sgbr/TEST/BSP/1/sgbr/lemma.xml
index 5085f21..fbb28d0 100644
--- a/t/sgbr/TEST/BSP/1/sgbr/lemma.xml
+++ b/t/sgbr/TEST/BSP/1/sgbr/lemma.xml
@@ -491,4 +491,4 @@
</fs>
</span>
</spanList>
-</layer>
+</layer>
\ No newline at end of file
diff --git a/t/sgbr/sgbr_meta.t b/t/sgbr/sgbr_meta.t
new file mode 100644
index 0000000..8ab6414
--- /dev/null
+++ b/t/sgbr/sgbr_meta.t
@@ -0,0 +1,28 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+use Data::Dumper;
+use KorAP::Tokenizer;
+use KorAP::Document;
+use utf8;
+
+my $path = catdir(dirname(__FILE__), 'TEST', 'BSP', 1);
+
+ok(my $doc = KorAP::Document->new(
+ path => $path . '/'
+), 'Create Document');
+
+ok($doc->parse, 'Parse document');
+
+like($doc->path, qr!$path/!, 'Path');
+
+# Metadata
+is($doc->text_sigle, 'TEST_BSP.1', 'ID-text');
+is($doc->doc_sigle, 'TEST_BSP', 'ID-doc');
+is($doc->corpus_sigle, 'TEST', 'ID-corpus');
+
+diag 'TODO: Parse meta';
+
+done_testing;