#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use POSIX;
use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Encode;

# A sentence is closed as soon as the known+unknown counter reaches this value.
my $MAX_SENTENCE_LENGTH = 10000;

# Delimiters wrapped around every metadata comment line written to the output.
my $COMMENT_START = '#';
my $COMMENT_END   = '';

# When $test is true, the main loop stops after a handful of texts;
# $text_no counts the texts seen so far for that purpose.
my ($test, $text_no) = (0, 0);

my %opts;
my %plain_texts;      # docid => normalized primary text
my %sentence_ends;    # docid => { end offset => 1 }

our $VERSION     = '0.4.1.9000';
our $VERSION_MSG = "\nkorapxml2conllu - v$VERSION\n";

# Set KORAPXMLCONLLU_DEBUG to 1 for minimal more debug output
# (no need to be parametrized).
use constant DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0;

# Parse the command line.  Each option is bound to a lexical that the rest of
# the script reads directly.  GetOptions returns false when it met an unknown
# or malformed option; in that case print the synopsis and exit with status 2
# instead of continuing with a half-parsed command line.
GetOptions(
  'sigle-pattern|p=s'            => \(my $sigle_pattern = ''),
  'extract-attributes-regex|e=s' => \(my $extract_attributes_regex = ''),
  's-bounds-from-morpho'         => \(my $s_bounds_from_morpho = 0),
  'log|l=s'                      => \(my $log_level = 'warn'),
  'columns|c=n'                  => \(my $columns = 10),

  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
) or pod2usage(-exitval => 2, -verbose => 0);

# Route Log::Any messages to STDERR (as UTF-8) at the requested level.
binmode STDERR, ':encoding(UTF-8)';
Log::Any::Adapter->set('Stderr', log_level => $log_level);
if (DEBUG) {
  $log->notice('Debugging is activated');
}

# State shared between the main loop and the helper subs below.
my $docid = "";
my ($current_id, $current_from, $current_to, $token);
my $current;                 # CoNLL text of the sentence being assembled
my $unknown = 0;
my $known   = 0;
my @current_lines;           # [from, to] pair of every token in $current
my %processedFilenames;

# Additional member patterns handed to unzip, taken from the environment.
my $zipsiglepattern = $ENV{ZIPSIGLEPATTERN} // "";
my $baseOnly;
my %extras;                  # docid => { start offset => comment lines }

# Positions of the ten columns within a CoNLL row.
my (
  $ID_idx,   $FORM_idx, $LEMMA_idx,  $UPOS_idx, $XPOS_idx,
  $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx
) = (0 .. 9);

# Locate the unzip executable through the shell.  The archives are read by
# piping from this program, so the script cannot continue without it.
# (The previous `return 0` here was outside of any subroutine, which Perl
# rejects at run time with "Can't return outside a subroutine".)
my $UNZIP = `sh -c 'command -v unzip'` // '';
chomp $UNZIP;

if ($UNZIP eq '') {
  die "No unzip executable found in PATH.\n";
}

# Quote a single word for safe interpolation into a /bin/sh command line.
sub _shell_quote {
  my ($word) = @_;
  $word =~ s/'/'\\''/g;
  return "'$word'";
}

# Main loop: every command line argument is either a CoNLL(-U) file, which is
# copied to STDOUT unchanged, or a KorAP-XML zip.  For a zip named
# "<name>.<foundry>.zip" the primary data is read from "<name>.zip"; a zip
# without a foundry part is treated as a base zip.
foreach my $morpho_zip (@ARGV) {
  die "cannot open $morpho_zip" if (!-r $morpho_zip);
  my $data_zip = $morpho_zip;

  if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
    open(my $conll_fh, '<', $data_zip) or die "cannot open $data_zip: $!";
    while (my $line = <$conll_fh>) {
      print $line;
    }
    close($conll_fh);
    next;
  }

  # Only trust $1 when the substitution actually matched.
  my $foundry;
  if ($data_zip =~ s/\.([^.]+)\.zip$/.zip/) {
    $foundry = $1;
  }
  die "cannot open data file $data_zip corresponding to $morpho_zip" if (!-r $data_zip);

  my $first = 1;
  my @conll = ("_") x 10;
  my $filename;

  # Build "unzip <mode> <zip> '*/<sigle>*/<members>' <extra patterns>".
  # The archive and unzip paths are quoted so that blanks or shell
  # metacharacters in them cannot break the command.  $zipsiglepattern is
  # deliberately left unquoted, as it may hold several patterns.
  my $unzip_command = sub {
    my ($mode, $zip, $members) = @_;
    return join(' ',
      _shell_quote($UNZIP), $mode, _shell_quote($zip),
      "'*/${sigle_pattern}*/$members'", $zipsiglepattern);
  };

  $baseOnly = $morpho_zip eq $data_zip;
  my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
  if (!$baseOnly) {
    $morphoOrTokenCommand = $unzip_command->('-c', $morpho_zip, '*/*/morpho.xml');
    if ($extract_attributes_regex || !$s_bounds_from_morpho) {
      $plaintextAndStructureCommand = $unzip_command->('-c', $data_zip, '*/[sd][ta]*.xml');
    } else {
      $log->debug("Not reading structure information.");
      $plaintextAndStructureCommand = $unzip_command->('-c', $data_zip, '*/data.xml');
    }
  } else {
    $foundry = "base";
    # A zip without foundry part may still contain morpho.xml files; fall
    # back to tokens.xml only when the listing shows none.  The command is
    # rebuilt from its parts rather than patched with s///, which would
    # rewrite the first "-l" or "morpho.xml" even inside a path.
    my $listing     = $unzip_command->('-l', $morpho_zip, '*/*/morpho.xml');
    my $layer_file  = 'morpho.xml';
    if (`$listing` !~ /morpho\.xml/) {
      $layer_file = 'tokens.xml';
    } else {
      $baseOnly = 0;
    }
    $morphoOrTokenCommand = $unzip_command->('-c', $morpho_zip, "*/*/$layer_file");
    $plaintextAndStructureCommand = $unzip_command->('-c', $data_zip, '*/[sd][ta]*.xml');
  }

  # The bareword handles are kept because fetch_plaintext reads PLAINTEXTPIPE.
  open(MORPHO_OR_TOKENPIPE, '-|', $morphoOrTokenCommand) or die "cannot unzip $morpho_zip: $!";
  open(PLAINTEXTPIPE, '-|', $plaintextAndStructureCommand) or die "cannot unzip $data_zip: $!";
  print "$COMMENT_START foundry = $foundry$COMMENT_END\n";
  while (<MORPHO_OR_TOKENPIPE>) {
    if (/^\s+inflating: (.*)/) {    # unzip announces the next archive member
      $filename = $1;
      while ($processedFilenames{$filename} && !eof(MORPHO_OR_TOKENPIPE)) {
        $log->warn("$filename already processed");
        while (<MORPHO_OR_TOKENPIPE>) {
          last if (/\s+inflating:\s+(.*)/);
        }
        $filename = $1 if (!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
      }
    } elsif (m@^\s*<layer\s+.*docid="([^"]+)"@) {
      my $next_docid = $1;
      last if ($test && $text_no++ > 3);
      if (!$first) {
        closeDoc(0);
      }
      $first = 0;
      $processedFilenames{$filename} = 1;
      $docid = $next_docid;
      @current_lines = ();
      $known = $unknown = 0;
      $current = "";
      # Forget the span offsets of the previous document.  Otherwise the
      # attribute loop below would start after the previous document's last
      # token and skip the attributes at the beginning of this one.
      ($current_id, $current_from, $current_to) = ();
      if (!fetch_plaintext($docid)) {    # skip this text
        while (<MORPHO_OR_TOKENPIPE>) {
          last if (m@</layer>@);
        }
      }
      print STDOUT "$COMMENT_START filename = $filename$COMMENT_END\n$COMMENT_START text_id = $docid$COMMENT_END\n";
      $log->debug("Analyzing $docid");
    } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
      my ($feature, $value) = ($1, $2);
      if ($feature eq "lemma") {
        $value =~ s/[\t\n\r]//g;    # make sure that lemmas never contain tabs or newlines
        if ($value eq 'UNKNOWN') {
          $conll[$LEMMA_idx] = "--";
          $unknown++;
        } else {
          $conll[$LEMMA_idx] = $value;
          $known++;
        }
      } elsif ($feature eq 'pos' || $feature eq "ctag") {
        $unknown++;
        $conll[$XPOS_idx] = $conll[$UPOS_idx] = $value;
      } elsif ($feature eq 'msd') {
        $conll[$FEATS_idx] = $value;
      } elsif ($feature eq 'certainty') {
        $conll[$MISC_idx] = $value;
      }
    } elsif (/<span /) {
      my $last_from = $current_from // -1;
      ($current_id)   = /id="[^0-9]*([^\"]*)"/;
      ($current_from) = /from="([^\"]*)"/;
      ($current_to)   = /to="([^\"]*)"/;
      if ($extract_attributes_regex) {
        # Emit the attribute comments collected for every offset between the
        # previous token start and this one.
        for my $i ($last_from + 1 .. $current_from) {
          if ($extras{$docid}{$i}) {
            $current .= $extras{$docid}{$i};
            undef $extras{$docid}{$i};
          }
        }
      }
      $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
      if (!defined $token) {
        $log->warn("could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10));
        $token = "_";
      }
      $token =~ s/[\t\n\r]//g;    # make sure that tokens never contain tabs or newlines
      @conll = ("_") x 10;
      $conll[$FORM_idx] = encode("utf-8", $token);
      if ($baseOnly) {
        # Token-only input has no <fs> blocks, so the row is written here.
        my @vals = ($current_from, $current_to);
        push @current_lines, \@vals;
        $known++;
        $conll[$ID_idx] = $#current_lines + 1;
        if ($columns == 1) {
          $current .= "$conll[$FORM_idx]\n";
        } else {
          $current .= join("\t", @conll[0 .. $columns - 1]) . "\n";    # conll columns
        }
        fetch_plaintext($docid);
        if ($sentence_ends{$docid}{$current_to}) {
          $log->debug("Using sentence end for $docid \@$current_to");
          $current .= "\n";
          printTokenRanges();
          print STDOUT $current;
          $current = "";
          $known = 0;
          $unknown = 0;
          @current_lines = ();
        }
      }
    } elsif (m@^\s*</fs>@) {
      my @vals = ($current_from, $current_to);
      push @current_lines, \@vals;
      # convert gathered information to CONLL
      $conll[$ID_idx] = $#current_lines + 1;
      if ($columns == 1) {
        $current .= "$conll[$FORM_idx]\n";
      } else {
        $current .= join("\t", @conll[0 .. $columns - 1]) . "\n";    # conll columns
      }
      # NOTE(review): the grouping below reproduces the original operator
      # precedence, i.e. a 'SENT' tag on "." ends a sentence even without
      # --s-bounds-from-morpho.  Confirm whether that is intended.
      if ($sentence_ends{$docid}{$current_to}
        || (($s_bounds_from_morpho && $conll[$XPOS_idx] eq '$.')
          || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.'))
        || $known + $unknown >= $MAX_SENTENCE_LENGTH)
      {
        $log->debug("Using sentence end for $docid \@$current_to");
        $current .= "\n";
        if ($known + $unknown > 0) {    # only print sentence if it contains some words
          printTokenRanges();
          print STDOUT $current;
        }
        $current = "";
        $known   = 0;
        $unknown = 0;
        @current_lines = ();
      }
      while (<MORPHO_OR_TOKENPIPE>) {
        last if (m@</span>@);    # only consider first interpretation
      }
    }
  }
  $current .= "\n";
  closeDoc(1);
  # Start the next archive with an empty buffer, so that an archive that
  # yields no document cannot flush this archive's last sentence again.
  $current = "";
  $known = $unknown = 0;
  @current_lines = ();
  close(MORPHO_OR_TOKENPIPE);
  close(PLAINTEXTPIPE);
}
exit;

# Print the "start_offsets" and "end_offsets" comment lines for the tokens
# buffered in @current_lines (each entry is a [from, to] pair).
# The first value of start_offsets is the start of the first token and the
# first value of end_offsets is the end of the last token; the individual
# token offsets follow.
# Prints nothing when no token is buffered: indexing the empty array would
# autovivify an element and emit lines made of uninitialized values.
sub printTokenRanges {
  return unless @current_lines;

  my @starts = map { $_->[0] } @current_lines;
  my @ends   = map { $_->[1] } @current_lines;

  print STDOUT "$COMMENT_START start_offsets = ", join(' ', $starts[0], @starts), "$COMMENT_END\n";
  print STDOUT "$COMMENT_START end_offsets = ", join(' ', $ends[-1], @ends), "$COMMENT_END\n";
  return;
}

# Flush the sentence that is still buffered when a document ends.
# Callers pass a flag (0 between documents, 1 at the end of an archive); it
# is not evaluated here.
# The buffer and counters are cleared afterwards, so a second call without
# new tokens in between cannot print the same sentence again.
sub closeDoc {
  $log->debug("closing doc");
  if ($known + $unknown > 0) {    # only parse a sentence if it has some words
    # Strip up to three trailing newlines, then end with exactly one blank line.
    chomp $current for 1 .. 3;
    $current .= "\n\n";
    printTokenRanges();
    print STDOUT $current;
  }
  $current = "";
  $known = $unknown = 0;
  @current_lines = ();
  return;
}

# Decode one chunk of raw UTF-8 text, resolve the XML entities &lt; &gt; &amp;
# and map typographic ellipsis/bullet, quote and dash characters to ASCII.
# Every mapping is one character to one character.
# The tr/// lists contain non-ASCII literals and therefore rely on the
# file-level `use utf8` pragma; without it they are read as single bytes and
# do not match the decoded text.
sub _normalize_plaintext {
  my ($raw) = @_;
  my $text = decode("utf-8", $raw, Encode::FB_DEFAULT);
  $text =~ s/&lt;/</g;
  $text =~ s/&gt;/>/g;
  $text =~ s/&amp;/&/g;
  $text =~ tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
  return $text;
}

# read data.xml to figure out the tokens
# (ideally tokens should also be in in morpho.xml, but they are not)
#
# Reads on from PLAINTEXTPIPE and records what it finds:
#   %plain_texts   - the normalized content of <text> elements, per docid
#   %sentence_ends - end offsets of spans carrying an <f>s</f> feature
#                    (not recorded with --s-bounds-from-morpho)
#   %extras        - comment lines for attributes matching
#                    --extract-attributes-regex, keyed by span start offset
# Without --extract-attributes-regex reading stops a few texts after the
# text of $target_id is available; otherwise it continues to the end of the
# pipe.
#
# Returns 1, or undef when PLAINTEXTFILTER is set and the text does not match.
sub fetch_plaintext {
  my ($target_id) = @_;
  my $docid;
  my $text_started = 0;
  my $text_count   = 0;
  my ($current_from, $current_to);

  if ($plain_texts{$target_id} && ($s_bounds_from_morpho || $sentence_ends{$target_id})) {
    $log->debug("Already got $target_id");
    return 1;
  }
  while (my $line = <PLAINTEXTPIPE>) {
    if ($line =~ /<raw_text[^>]+docid="([^"]*)/) {
      $docid = $1;
      $log->debug("Getting plain text for $docid");
      $text_started = 0;
    } elsif ($line =~ /<layer[^>]+docid="([^"]*)/) {
      $docid = $1;
    } elsif ($line =~ m@<span @) {
      ($current_from) = $line =~ /from="([^\"]*)"/;
      ($current_to)   = $line =~ /to="([^\"]*)"/;
    } elsif ($line =~ m@<f\s[^>]*>s</f>@) {
      if ($s_bounds_from_morpho) {
        $log->debug("Ignoring sentence end for $docid \@$current_to because of --s-bounds-from-morpho");
      } else {
        $log->debug("Found sentence end for $docid \@$current_to");
        $sentence_ends{$docid}{$current_to} = 1;
      }
    } elsif ($extract_attributes_regex && $line =~ m@<f\sname="name"[^>]*>([^<]+)</f>@) {
      my $current_element = $1;
      $log->debug("Looking for matching attributes in $docid");
      # Scan the remaining features of this element up to the closing </fs>.
      while (my $attr_line = <PLAINTEXTPIPE>) {
        last if ($attr_line =~ m@</fs>@);
        if ($attr_line =~ m@<f\sname="([^"]+)"[^>]*>([^<]+)</f>@) {
          my $current_node = "$current_element/$1";
          my $value        = $2;
          if ($current_node =~ /$extract_attributes_regex/) {
            $log->debug("Found matching attribute: $docid - $current_node = $value");
            $extras{$docid}{$current_from} .= "$COMMENT_START $current_node = $value$COMMENT_END\n";
          }
        }
      }
    } elsif ($line =~ m@<text>(.*)</text>@) {
      # complete text on a single line
      $plain_texts{$docid} = _normalize_plaintext($1);
      last if (!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id}));
    } elsif ($line =~ m@<text>(.*)@) {
      # first line of a multi-line text; each line break becomes one blank
      $plain_texts{$docid} = _normalize_plaintext($1) . ' ';
      $text_started = 1;
    } elsif ($text_started && $line =~ m@(.*)</text>@) {
      $plain_texts{$docid} .= _normalize_plaintext($1);
      $text_started = 0;
      last if (!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id}));
    } elsif ($text_started) {
      chomp $line;
      $plain_texts{$docid} .= _normalize_plaintext($line) . ' ';
    }
  }

  # The pipe may already have been exhausted by an earlier call, in which case
  # no document header was seen.  Use the empty string, which is what undef
  # would stringify to, to avoid "uninitialized value" warnings below.
  $docid //= '';
  $log->debug("Got plain text for $docid");

  # NOTE(review): the filter is applied to the document read last, which is
  # not necessarily $target_id when texts were read ahead - confirm intent.
  if (defined($ENV{PLAINTEXTFILTER})) {
    if ($plain_texts{$docid} !~ $ENV{PLAINTEXTFILTER}) {
      $plain_texts{$docid} = undef;
      $log->info("Skipping $docid");
      return undef;
    } else {
      $log->debug("Using $docid");
    }
  }
  return 1;
}

=pod

=encoding utf8

=head1 NAME

korapxml2conllu - Conversion of KorAP-XML zips to CoNLL-U

=head1 SYNOPSIS

  korapxml2conllu zca15.tree_tagger.zip > zca15.conllu

=head1 DESCRIPTION

C<korapxml2conllu> is a script to convert L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or morpho zips to CoNLL(-U) format,
with all information necessary for reconstruction in comment lines.

=head1 INSTALLATION

  $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git

=head1 OPTIONS

=over 2

=item B<--sigle-pattern|-p>

Convert only texts from the KorAP XML zip files with folder names (i.e. sigles) matching the glob pattern.

=item B<--extract-attributes-regex|-e>

Extract the values of attributes whose C<element/attribute> path matches the regular expression and print them as comment lines.

=item B<--columns>=I<int> | B<-c> I<int>

Print n columns (default: 10). If n=1, only the token itself is printed.

=item B<--s-bounds-from-morpho>

Get sentence boundary information from tagger output rather than from s annotation in structure.xml files.

=item B<--help|-h>

Print help information.

=item B<--version|-v>

Print version information.

=item B<--log|-l>

Loglevel for I<Log::Any>. Defaults to C<warn>.

=back

=head1 EXAMPLES

  korapxml2conllu -e '(posting/id|div/id)' t/data/wdf19.zip

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>

Author: Marc Kupietz

Contributors: Nils Diewald

L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.