Blame - script/korapxml2conllu - KorAP/KorAP-XML-CoNLL-U

blob: aca61c3ac18a3b2236995e7304da15566a0a7be1 [file] [log] [blame]

Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
				4	use POSIX;
				5	use Getopt::Std;
				6	use Encode;
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	7
				8	my $MAX_SENTENCE_LENGTH=10000;
				9	my $COMMENT_START="#";
				10
				11	my $test=0;
				12	my $text_no=0;
				13	my %opts;
				14	my %plain_texts;
				15	my $usage=<<EOF;
				16	Usage: $0 [options] ZIPFILE [ZIPFILE...]
				17
				18	Options:
				19	-p pattern
				20
				21	Description:
				22	Convert KorAP-XML morpho zip to CoNLL(-U) format with all information necessary
				23	for reconstruction in comment lines.
				24
				25	Examples:
				26	$0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
				27
				28	ZIPSIGLEPATTERN='-x "15/FEB" "15/MAR"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
				29
				30	Results will be written to stdout
				31	EOF
				32
				33	getopts('dhp:', \%opts);
				34	die $usage if($opts{h} \|\| @ARGV == 0);
				35	my $debug=($opts{d}? 1 : 0);
				36
				37	my $docid="";
				38	my ($current_id, $current_from, $current_to, $token);
				39	my $current;
				40	my ($unknown, $known) = (0, 0);
				41	my @current_lines;
				42	my %processedFilenames;
				43	my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
				44
				45	my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
				46
# Locate the unzip executable through the shell; the result is empty when
# none is available in PATH.
my $UNZIP = `sh -c 'command -v unzip'`;
chomp $UNZIP;

if ($UNZIP eq '') {
  warn('No unzip executable found in PATH.');
  # This is mainline code: "return" is a fatal error outside of a subroutine,
  # so terminate the script with a failure status instead.
  exit 1;
}
				55
# Quote a string so that a POSIX shell treats it as one literal word.
my $shell_quote = sub {
  my ($word) = @_;
  $word =~ s/'/'\\''/g;
  return "'$word'";
};

# Main loop: every argument is either a CoNLL(-U) file, which is copied to
# stdout unchanged, or a morpho zip archive whose morpho.xml members are
# converted to CoNLL-U. The token forms are cut out of the data.xml texts of
# the corresponding base archive (same name without the foundry part).
foreach my $morpho_zip (@ARGV) {
  die "cannot open $morpho_zip" if(! -r $morpho_zip);
  my $data_zip = $morpho_zip;

  # Pass existing CoNLL(-U) input through verbatim.
  if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
    open(my $conll_fh, '<', $data_zip) or die "cannot open $data_zip";
    while (my $line = <$conll_fh>) {
      print $line;
    }
    close($conll_fh);
    next;
  }

  # "name.FOUNDRY.zip" -> data archive "name.zip". $1 is only valid if the
  # substitution matched; otherwise the foundry is reported as empty.
  my $foundry = ($data_zip =~ s/\.([^.]+)\.zip$/.zip/) ? $1 : "";
  die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);

  my $first=1;
  my $pattern = (defined($opts{p})? $opts{p} : '');
  my @conll = ("_") x 10;
  my $filename;

  # The commands have to be run by the shell because $zipsiglepattern holds
  # further (quoted) unzip arguments; the archive names are quoted explicitly.
  my $morphocommand = "$UNZIP -c " . $shell_quote->($morpho_zip) . " '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern";
  my $plaintextcommand = "$UNZIP -c " . $shell_quote->($data_zip) . " '*/${pattern}*/*/data.xml' $zipsiglepattern";
  # print STDERR $morphocommand, "\n";
  open (MORPHOPIPE, '-|', $morphocommand) or die "cannot unzip $morpho_zip";
  open (PLAINTEXTPIPE, '-|', $plaintextcommand) or die "cannot unzip $data_zip";
  print "$COMMENT_START foundry = $foundry\n";

  while (<MORPHOPIPE>) {
    if (/^\s+inflating:\s+(.*)/) {
      # unzip announces every archive member before printing its content.
      $filename=$1;
      # Skip members that were converted before: read on to the next
      # announcement until an unprocessed member or the end of input is reached.
      while($processedFilenames{$filename} && !eof(MORPHOPIPE)) {
        print STDERR "WARNING: $filename already processed\n";
        my $next_filename;
        while (<MORPHOPIPE>) {
          if (/\s+inflating:\s+(.*)/) {
            $next_filename = $1;
            last;
          }
        }
        $filename = $next_filename if (defined $next_filename);
      }
    } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
      my $new_docid = $1; # keep the capture before anything else can match
      last if($test && $text_no++ > 3);
      # A new document starts: flush the pending sentence of the previous one.
      closeDoc(0) if(!$first);
      $first = 0;
      $processedFilenames{$filename}=1 if (defined $filename);
      $docid=$new_docid;
      @current_lines=();
      $known=$unknown=0;
      $current="";
      if(!fetch_plaintext($docid)) { # skip this text
        while (<MORPHOPIPE>) {
          last if(m@</layer>@);
        }
      }
      print STDOUT "$COMMENT_START filename = ", (defined $filename ? $filename : ""), "\n$COMMENT_START text_id = $docid\n";
      print STDERR "Analyzing $docid\n" if ($debug);
    } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
      # One feature of the current token.
      my ($name, $value) = ($1, $2);
      if ($name eq "lemma") {
        $value =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
        if($value eq 'UNKNOWN') {
          $conll[$LEMMA_idx] = "--";
          $unknown++;
        } else {
          $conll[$LEMMA_idx] = $value;
          $known++;
        }
      } elsif ($name eq 'pos' || $name eq "ctag") {
        $unknown++;
        $conll[$XPOS_idx] = $conll[$UPOS_idx] = $value;
      } elsif ($name eq 'msd') {
        $conll[$FEATS_idx] = $value;
      } elsif ($name eq 'certainty') {
        $conll[$MISC_idx] = $value;
      }
    } elsif (/<span /) {
      # A new token: remember its offsets and cut its form out of the plain text.
      ($current_id) = /id="[^0-9]*([^\"]*)"/;
      ($current_from) = /from="([^\"]*)"/;
      ($current_to) = /to="([^\"]*)"/;
      print STDERR "found span: $current_id $current_from $current_to\n" if($debug);
      my $text = $plain_texts{$docid};
      $token = (defined $text ? substr($text, $current_from, $current_to - $current_from) : undef);
      if (!defined $token) {
        my $text_length = (defined $text ? length($text) : 0);
        my $text_end = ($text_length > 10 ? substr($text, $text_length - 10) : (defined $text ? $text : ""));
        print STDERR "WARNING: could not retrieve token for $docid at $current_from-$current_to/", $text_length, " - ending with: ", $text_end, "\n";
        $token = "_";
      }
      $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
      @conll = ("_") x 10;
      $conll[$FORM_idx] = encode("utf-8", $token);
    } elsif (m@^\s*</fs>@) {
      # End of the (first) interpretation of the token: emit its CoNLL row.
      my @vals = ($current_from, $current_to);
      print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
      push @current_lines, \@vals;
      # convert gathered information to CONLL
      $conll[$ID_idx] = $#current_lines+1;
      $current .= join("\t", @conll) . "\n"; # conll columns
      # Sentence boundary: sentence-final tag or length limit reached.
      if($conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
        $current .= "\n";
        if($known + $unknown > 0) { # only print sentence if it contains some words
          printTokenRanges();
          print STDOUT $current;
        }
        $current=""; $known=0; $unknown=0;
        @current_lines = ();
      }
      while (<MORPHOPIPE>) {
        last if (m@</span>@); # only consider first interpretation
      }
    }
  }
  # Flush whatever is left of the last document of this archive.
  $current .= "\n";
  closeDoc(1);
  close(MORPHOPIPE);
  close(PLAINTEXTPIPE);
}
exit;
				169
				170	sub printTokenRanges {
				171	print "$COMMENT_START start_offsets = ", $current_lines[0]->[0];
				172	foreach my $t (@current_lines) {
				173	print STDOUT " $t->[0]";
				174	}
				175	print "\n$COMMENT_START end_offsets = ", $current_lines[$#current_lines]->[1];
				176	foreach my $t (@current_lines) {
				177	print STDOUT " $t->[1]";
				178	}
				179	print "\n";
				180	}
				181
				182	sub closeDoc {
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	183	print STDERR "closing doc\n" if($debug);
				184	if($known + $unknown > 0) { # only parse a sentence if it has some words
				185	chomp $current;
				186	chomp $current;
				187	chomp $current;
				188	$current .= "\n\n";
				189	printTokenRanges();
				190	print STDOUT $current;
				191	}
				192	}
				193
				194	# read data.xml to figure out the tokens
				195	# (ideally tokens should also be in in morpho.xml, but they are not)
				196	sub fetch_plaintext {
				197	my ($target_id) = @_;
				198	my $docid;
				199	my $text_started=0;
				200
				201	if($plain_texts{$target_id}) {
				202	# print STDERR "already got $target_id\n";
				203	return;
				204	}
				205	while(<PLAINTEXTPIPE>) {
				206	if(/<raw_text[^>]+docid="([^"]*)/) {
				207	$docid=$1;
				208	$text_started=0;
				209	} elsif (m@<text>(.*)</text>@) {
				210	$_= decode("utf-8", $1, Encode::FB_DEFAULT);
				211	s/</</go;
				212	s/>/>/go;
				213	s/&/&/go;
				214	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				215	$plain_texts{$docid} = $_;
				216	last if($docid eq $target_id);
				217	} elsif (m@<text>(.*)@) {
				218	$_= decode("utf-8", $1, Encode::FB_DEFAULT);
				219	s/</</go;
				220	s/>/>/go;
				221	s/&/&/go;
				222	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				223	$plain_texts{$docid} = "$_ ";
				224	$text_started=1;
				225	} elsif ($text_started && m@(.*)</text>@) {
				226	$_= decode("utf-8", $1, Encode::FB_DEFAULT);
				227	s/</</go;
				228	s/>/>/go;
				229	s/&/&/go;
				230	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				231	$plain_texts{$docid} .= $_;
				232	$text_started=0;
				233	last if($docid eq $target_id);
				234	} elsif ($text_started) {
				235	chomp;
				236	$_ = decode("utf-8", $_, Encode::FB_DEFAULT) . ' ';
				237	s/</</go;
				238	s/>/>/go;
				239	s/&/&/go;
				240	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				241	$plain_texts{$docid} .= $_;
				242	}
				243	}
				244	if(defined($ENV{PLAINTEXTFILTER})) {
				245	if ($plain_texts{$docid} !~ $ENV{PLAINTEXTFILTER}) {
				246	$plain_texts{$docid} = undef;
				247	print STDERR "Skipping $docid\n";
				248	return(undef);
				249	} else {
				250	print STDERR "Using $docid\n";
				251	}
				252	}
				253	return(1);
				254	}