Improved TEI support and script
Change-Id: I62fc97828aec1a1acec7d22f8892f54ed6d81803
diff --git a/.gitignore b/.gitignore
index 832d443..87e491e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,8 @@
MYMETA*
Makefile
pm_to_blib
+t/sgbr/PRO-DUD*
+t/sgbr/meta_duden.t
*.tar.gz
*~
*.sqlite
diff --git a/Makefile.PL b/Makefile.PL
index 972d207..f278586 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -36,5 +36,6 @@
't/index/*.t ' .
't/sgbr/*.t ' .
't/real/*.t'
- }
+ },
+ EXE_FILES => ['script/korapxml2krill']
);
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index b2af20c..f56ad34 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -296,37 +296,61 @@
my $type = shift;
my $stmt;
- if ($type eq 'text' && ($stmt = $dom->at('titleStmt'))) {
+ if ($type eq 'text') {
- # Title
+ # Publisher
try {
- $stmt->find('title')->each(
- sub {
- my $type = $_->attr('type') || 'main';
- $self->title($_->all_text) if $type eq 'main';
- $self->sub_title($_->all_text) if $type eq 'sub';
- }
- );
+ $self->publisher($dom->at('publisher')->all_text);
};
- # Author
+ # Date of publication
try {
- my $author = $stmt->at('author')->attr('ref');
+ my $date = $dom->at('date')->all_text;
+ if ($date =~ s!^\s*(\d{4})-(\d{2})-(\d{2})!$1$2$3!) {
+ $self->pub_date($date);
+ }
+ else {
+ $self->log->warn('"' . $date . '" is not a compatible pubDate');
+ }
+ };
- $author = $self->{ref_author}->{$author};
+ # Publication place
+ try {
+ $self->pub_place($dom->at('pubPlace')->all_text);
+ };
- if ($author) {
+ if ($stmt = $dom->at('titleStmt')) {
+ # Title
+ try {
+ $stmt->find('title')->each(
+ sub {
+ my $type = $_->attr('type') || 'main';
+ $self->title($_->all_text) if $type eq 'main';
- my $array = $self->keywords;
- $self->author($author->{id});
+ # Only support the first subtitle
+ $self->sub_title($_->all_text) if $type eq 'sub' && !$self->sub_title;
+ }
+ );
+ };
- if ($author->{age}) {
- $self->store('sgbrAuthorAgeClass' => $author->{age});
- push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
- };
- if ($author->{sex}) {
- $self->store('sgbrAuthorSex' => $author->{sex});
- push @$array, 'sgbrAuthorSex:' . $author->{sex};
+ # Author
+ try {
+ my $author = $stmt->at('author')->attr('ref');
+
+ $author = $self->{ref_author}->{$author};
+
+ if ($author) {
+ my $array = $self->keywords;
+ $self->author($author->{name} // $author->{id});
+
+ if ($author->{age}) {
+ $self->store('sgbrAuthorAgeClass' => $author->{age});
+ push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
+ };
+ if ($author->{sex}) {
+ $self->store('sgbrAuthorSex' => $author->{sex});
+ push @$array, 'sgbrAuthorSex:' . $author->{sex};
+ };
};
};
};
@@ -346,11 +370,16 @@
$dom->find('particDesc person')->each(
sub {
- $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
+ my $hash = $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
age => $_->attr('age'),
sex => $_->attr('sex'),
id => $_->attr('xml:id')
- }
+ };
+
+ # Get name
+ if ($_->at('persName')) {
+ $hash->{name} = $_->at('persName')->all_text;
+ };
});
};
@@ -360,11 +389,20 @@
};
try {
- $stmt = $dom->find('titleStmt > title')->each(
+ $self->store('funder', $dom->at('funder > orgName')->all_text);
+ };
+
+ try {
+ $stmt = $dom->find('fileDesc > titleStmt > title')->each(
sub {
my $type = $_->attr('type') || 'main';
$self->doc_title($_->all_text) if $type eq 'main';
- $self->doc_sub_title($_->all_text) if $type eq 'sub';
+ if ($type eq 'sub') {
+ my $sub_title = $self->doc_sub_title;
+ $self->doc_sub_title(
+ ($sub_title ? $sub_title . ', ' : '') . $_->all_text
+ );
+ };
}
);
};
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index fa589eb..1f8ec14 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -86,21 +86,29 @@
$call .= ' -y ' . $pretty if $pretty;
$call .= ' -a ' . $_ foreach @allow;
$call .= ' -s ' . $_ foreach @skip;
- print "Convert $file\n";
+ print $file;
system($call);
+ print "\n";
};
my $it = Directory::Iterator->new($input);
+my @dirs;
my $dir;
while (1) {
if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
- write_file($dir);
+ push @dirs, $dir;
$it->prune;
};
last unless $it->next;
};
+my $count = scalar @dirs; # Number of directories collected by the iterator loop
+for (my $i = 0; $i < $count; $i++) {
+  print 'Convert [' . ($i + 1) . "/$count] "; # 1-based progress, so the last item reads [N/N]
+  write_file($dirs[$i]);
+};
+
__END__
diff --git a/t/sgbr/meta.t b/t/sgbr/meta.t
index a1e2b8e..c5f9b60 100644
--- a/t/sgbr/meta.t
+++ b/t/sgbr/meta.t
@@ -24,12 +24,7 @@
is($doc->corpus_sigle, 'TEST', 'ID-corpus');
is($doc->title, 'Sommerüberraschung', 'title');
-#is($doc->sub_title, 'Beispiel Text Untertitel', 'title');
-#is($doc->pub_date, '20010402', 'Publication date');
-#is($doc->pub_place, 'Mannheim', 'Publication place');
-
is($doc->author, 'TEST.BSP.Autoren.1', 'Author');
-
is($doc->store('sgbrAuthorAgeClass'), 'X', 'AgeClass');
is($doc->store('sgbrAuthorSex'), 'M', 'Sex');
@@ -69,6 +64,7 @@
# Sgbr specific keywords
is($doc->keywords_string, 'sgbrAuthorAgeClass:X sgbrAuthorSex:M sgbrKodex:M');
+
done_testing;