Add korapxml2conllu option -e <regex> to extract element/attributes
./script/korapxml2conllu -e '(posting/id|div/id)' -p "A0000" t/data/wdf19.zip | head -12
# foundry = base
# filename = WDF19/A0000/10894/base/tokens.xml
# text_id = WDF19_A0000.10894
# start_offsets = 0 0 5 14 23 32 40 48 51 54 60
# end_offsets = 61 4 12 22 31 39 47 50 53 59 61
1 Arts _ _ _ _ _ _ _ _
2 visuels _ _ _ _ _ _ _ _
# div/id = i.10894_1
# posting/id = i.10894_1_1
3 Pourquoi _ _ _ _ _ _ _ _
4 toujours _ _ _ _ _ _ _ _
5 vouloir _ _ _ _ _ _ _ _
Change-Id: I2cedc6580699fab0db6794d0f3225ea4da72b30f
diff --git a/Changes b/Changes
index 2b993fb..3d045db 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,5 @@
0.3.900 unreleased
-
+ - korapxml2conllu option -e <regex> added to extract element/attributes to comments
0.3 2021-02-15
- Provide conllu2korapxml to convert from ConLL-U to KorAP-XML zip
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 707db18..494b490 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -28,6 +28,7 @@
GetOptions(
'sigle-pattern|p=s' => \(my $sigle_pattern = ''),
+ 'extract-attributes-regex|e=s' => \(my $extract_attributes_regex = ''),
'log|l=s' => \(my $log_level = 'warn'),
'help|h' => sub {
@@ -47,14 +48,6 @@
}
);
- ZIPSIGLEPATTERN='-x "*15/FEB*" "*15/MAR*"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
-
- Results will be written to stdout
-EOF
-
-getopts('dhp:', \%opts);
-die $usage if($opts{h} || @ARGV == 0);
-my $debug=($opts{d}? 1 : 0);
# Establish logger
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
@@ -68,6 +61,7 @@
my %processedFilenames;
my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
my $baseOnly;
+my %extras;
my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
@@ -96,19 +90,22 @@
die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);
my $first=1;
- my $pattern = (defined($opts{p})? $opts{p} : '');
my @conll = ("_") x 10;
my $filename;
$baseOnly = $morpho_zip eq $data_zip;
my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
- if(!$baseOnly) {
- $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |";
+ if (!$baseOnly) {
+ $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern |";
+ if ($extract_attributes_regex) {
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
+ } else {
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/data.xml' $zipsiglepattern |";
+ }
} else {
$foundry = "base";
- $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/tokens.xml' $zipsiglepattern |";
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
+ $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/tokens.xml' $zipsiglepattern |";
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
}
open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
@@ -163,9 +160,18 @@
$conll[$MISC_idx] = $2;
}
} elsif (/<span /) {
+ my $last_from = $current_from // -1;
($current_id) = /id="[^0-9]*([^\"]*)"/;
($current_from) = /from="([^\"]*)"/;
($current_to) = /to="([^\"]*)"/;
+ if($extract_attributes_regex) {
+ for (my $i = $last_from + 1; $i <= $current_from; $i++) {
+ if ($extras{$docid}{$i}) {
+ $current .= $extras{$docid}{$i};
+ undef $extras{$docid}{$i};
+ }
+ }
+ }
$log->debug("found span: $current_id $current_from $current_to");
$token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
if (!defined $token) {
@@ -251,6 +257,7 @@
my ($target_id) = @_;
my $docid;
my $text_started=0;
+ my $text_count = 0;
my ($current_id, $current_from, $current_to);
if($plain_texts{$target_id} && (!$baseOnly || $sentence_ends{$target_id}{-1})) {
@@ -271,6 +278,19 @@
} elsif(m@<f\s[^>]*>s</f>@) {
$log->debug("Found sentence end for $docid \@$current_to");
$sentence_ends{$docid}{$current_to}=1;
+ } elsif($extract_attributes_regex && m@<f\sname="name"[^>]*>([^<]+)</f>@) {
+ my $current_element = $1;
+ while(<PLAINTEXTPIPE>) {
+ last if(m@</fs>@);
+ if(m@<f\sname="([^"]+)"[^>]*>([^<]+)</f>@) {
+ my $current_node = "$current_element/$1";
+ my $value = $2;
+ if ($current_node =~ /$extract_attributes_regex/) {
+# print "# $docid $current_from-$current_to :: $current_node = $value\n";
+ $extras{$docid}{$current_from} .= "# $current_node = $value\n";
+ }
+ }
+ }
} elsif (m@<text>(.*)</text>@) {
$_= decode("utf-8", $1, Encode::FB_DEFAULT);
s/</</go;
@@ -278,7 +298,7 @@
s/&/&/go;
tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
$plain_texts{$docid} = $_;
- last if($docid eq $target_id);
+ last if($text_count++ > 1 && $plain_texts{$target_id});
} elsif (m@<text>(.*)@) {
$_= decode("utf-8", $1, Encode::FB_DEFAULT);
s/</</go;
@@ -295,7 +315,7 @@
tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
$plain_texts{$docid} .= $_;
$text_started=0;
- last if($docid eq $target_id);
+ last if($text_count++ > 1 && $plain_texts{$target_id});
} elsif ($text_started) {
chomp;
$_ = decode("utf-8", $_, Encode::FB_DEFAULT) . ' ';
@@ -346,6 +366,11 @@
=item B<--sigle-pattern|-p>
Convert only texts from the KorAP XML zip files with folder names (i.e. sigles) matching the glob pattern.
+
+=item B<--extract-attribute-pattern|-e>
+
+Extract element/attribute regular expressions to comments.
+
=item B<--help|-h>
Print help information.
@@ -362,6 +387,9 @@
=back
=head1 EXAMPLES
+
+ korapxml2conllu -e '(posting/id|div/id)' t/data/wdf19.zip
+
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
diff --git a/t/data/wdf19.zip b/t/data/wdf19.zip
new file mode 100644
index 0000000..61a8bdf
--- /dev/null
+++ b/t/data/wdf19.zip
Binary files differ
diff --git a/t/test.t b/t/test.t
index 22d261a..48bd2b1 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 10;
+use Test::More tests => 19;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -75,4 +75,15 @@
script_runs([ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
"Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U");
script_stdout_is $expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly";
+
+script_runs([ 'script/korapxml2conllu', '-e', 'div/type', "t/data/goe.tree_tagger.zip" ], "Runs korapxml2conllu with morpho input and attribute extraction");
+script_stdout_like "\n# div/type = Autobiographie\n", "Extracts attributes from morpho zips";
+script_stdout_like "\n# div/type = section\n", "Extracts attributes from morpho zips";
+
+script_runs([ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.zip" ], "Runs korapxml2conllu with base input and regex attribute extraction");
+script_stdout_like "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from base zips (1)";
+script_stdout_like "\n# div/id = i.13075_14", "Extracts multiple attributes from base zips (2)";
+script_stdout_like "\n# posting/id = i.14548_9_1\n3\tbonjour", "Extracts attributes in the right place";
+script_stdout_like "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from base zips (1)";
+script_stdout_like "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from base zips (2)";
done_testing;