Log warning if metadata value could not be extracted
Change-Id: Ide365750e95a4b4551e7f87e99aec7ca6a75f419
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 61d48eb..e870bf8 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -213,7 +213,7 @@
# $log->debug("found span: $current_id $current_from $current_to");
$token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
if (!defined $token) {
- $log->warn("could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10));
+ $log->warn("$docid: could not retrieve token at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10));
$token = "_";
}
$token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
@@ -237,6 +237,10 @@
printTokenRanges();
if ($extract_metadata) {
for (my $i = 0; $i < @extract_metadata_regex; $i++) {
+ if(!defined($metadata{$docid}[$i])) {
+ $log->warn("$docid: metadata matching /$extract_metadata_regex[$i]/ was not found, using empty string instead");
+ $metadata{$docid}[$i]="";
+ }
print "$metadata{$docid}[$i]\t";
}
}