Improved TEI metadata support and conversion scripts

Change-Id: I62fc97828aec1a1acec7d22f8892f54ed6d81803
diff --git a/.gitignore b/.gitignore
index 832d443..87e491e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,8 @@
 MYMETA*
 Makefile
 pm_to_blib
+t/sgbr/PRO-DUD*
+t/sgbr/meta_duden.t
 *.tar.gz
 *~
 *.sqlite
diff --git a/Makefile.PL b/Makefile.PL
index 972d207..f278586 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -36,5 +36,6 @@
 	't/index/*.t ' .
 	  't/sgbr/*.t ' .
 	    't/real/*.t'
-	  }
+	  },
+  EXE_FILES => ['script/korapxml2krill']
 );
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index b2af20c..f56ad34 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -296,37 +296,61 @@
   my $type = shift;
 
   my $stmt;
-  if ($type eq 'text' && ($stmt = $dom->at('titleStmt'))) {
+  if ($type eq 'text') {
 
-    # Title
+    # Publisher
     try {
-      $stmt->find('title')->each(
-	sub {
-	  my $type = $_->attr('type') || 'main';
-	  $self->title($_->all_text) if $type eq 'main';
-	  $self->sub_title($_->all_text) if $type eq 'sub';
-	}
-      );
+      $self->publisher($dom->at('publisher')->all_text);
     };
 
-    # Author
+    # Date of publication
     try {
-      my $author = $stmt->at('author')->attr('ref');
+      my $date = $dom->at('date')->all_text;
+      if ($date =~ s!^\s*(\d{4})-(\d{2})-(\d{2})!$1$2$3!) {
+	$self->pub_date($date);
+      }
+      else {
+	$self->log->warn('"' . $date . '" is not a compatible pubDate');
+      }
+    };
 
-      $author = $self->{ref_author}->{$author};
+    # Publication place
+    try {
+      $self->pub_place($dom->at('pubPlace')->all_text);
+    };
 
-      if ($author) {
+    if ($stmt = $dom->at('titleStmt')) {
+      # Title
+      try {
+	$stmt->find('title')->each(
+	  sub {
+	    my $type = $_->attr('type') || 'main';
+	    $self->title($_->all_text) if $type eq 'main';
 
-	my $array = $self->keywords;
-	$self->author($author->{id});
+	    # Only support the first subtitle
+	    $self->sub_title($_->all_text) if $type eq 'sub' && !$self->sub_title;
+	  }
+	);
+      };
 
-	if ($author->{age}) {
-	  $self->store('sgbrAuthorAgeClass' => $author->{age});
-	  push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
-	};
-	if ($author->{sex}) {
-	  $self->store('sgbrAuthorSex' => $author->{sex});
-	  push @$array, 'sgbrAuthorSex:' . $author->{sex};
+      # Author
+      try {
+	my $author = $stmt->at('author')->attr('ref');
+
+	$author = $self->{ref_author}->{$author};
+
+	if ($author) {
+	  my $array = $self->keywords;
+	  $self->author($author->{name} // $author->{id});
+
+	  if ($author->{age}) {
+	    $self->store('sgbrAuthorAgeClass' => $author->{age});
+	    push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
+	  };
+	  if ($author->{sex}) {
+	    $self->store('sgbrAuthorSex' => $author->{sex});
+	    push @$array, 'sgbrAuthorSex:' . $author->{sex};
+	  };
 	};
       };
     };
@@ -346,11 +370,16 @@
       $dom->find('particDesc person')->each(
 	sub {
 
-	  $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
+	  my $hash = $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
 	    age => $_->attr('age'),
 	    sex => $_->attr('sex'),
 	    id => $_->attr('xml:id')
-	  }
+	  };
+
+	  # Get name
+	  if ($_->at('persName')) {
+	    $hash->{name} = $_->at('persName')->all_text;
+	  };
 	});
     };
 
@@ -360,11 +389,20 @@
     };
 
     try {
-      $stmt = $dom->find('titleStmt > title')->each(
+      $self->store('funder', $dom->at('funder > orgName')->all_text);
+    };
+
+    try {
+      $stmt = $dom->find('fileDesc > titleStmt > title')->each(
 	sub {
 	  my $type = $_->attr('type') || 'main';
 	  $self->doc_title($_->all_text) if $type eq 'main';
-	  $self->doc_sub_title($_->all_text) if $type eq 'sub';
+	  if ($type eq 'sub') {
+	    my $sub_title = $self->doc_sub_title;
+	    $self->doc_sub_title(
+	      ($sub_title ? $sub_title . ', ' : '') . $_->all_text
+	    );
+	  };
 	}
       );
     };
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index fa589eb..1f8ec14 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -86,21 +86,29 @@
   $call .= ' -y ' . $pretty if $pretty;
   $call .= ' -a ' . $_ foreach @allow;
   $call .= ' -s ' . $_ foreach @skip;
-  print "Convert $file\n";
+  print $file;
   system($call);
+  print "\n";
 };
 
 
 my $it = Directory::Iterator->new($input);
+my @dirs;
 my $dir;
 while (1) {
 
     if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
-	write_file($dir);
+	push @dirs, $dir;
 	$it->prune;
     };
   last unless $it->next;
 };
 
+my $count = scalar @dirs;
+for (my $i = 0; $i < $count; $i++) {
+  print "Convert [$i/$count] ";
+  write_file($dirs[$i]);
+};
+
 
 __END__
diff --git a/t/sgbr/meta.t b/t/sgbr/meta.t
index a1e2b8e..c5f9b60 100644
--- a/t/sgbr/meta.t
+++ b/t/sgbr/meta.t
@@ -24,12 +24,7 @@
 is($doc->corpus_sigle, 'TEST', 'ID-corpus');
 
 is($doc->title, 'Sommerüberraschung', 'title');
-#is($doc->sub_title, 'Beispiel Text Untertitel', 'title');
-#is($doc->pub_date, '20010402', 'Publication date');
-#is($doc->pub_place, 'Mannheim', 'Publication place');
-
 is($doc->author, 'TEST.BSP.Autoren.1', 'Author');
-
 is($doc->store('sgbrAuthorAgeClass'), 'X', 'AgeClass');
 
 is($doc->store('sgbrAuthorSex'), 'M', 'Sex');
@@ -69,6 +64,7 @@
 # Sgbr specific keywords
 is($doc->keywords_string, 'sgbrAuthorAgeClass:X sgbrAuthorSex:M sgbrKodex:M');
 
+
 done_testing;