Blame - script/korapxml2conllu - KorAP/KorAP-XML-CoNLL-U

blob: d3a292cb2e25d6105a25bd1b601b9e3c18d5821e [file] [log] [blame]

Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame^]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
				4	use POSIX;
				5	use Getopt::Std;
				6	use Encode;
				7	use List::Util qw[min max];
				8
				# Global configuration and state shared by the main loop and the helper subs.
				# Upper bound used by the main loop: a sentence is force-closed as soon as
				# $known + $unknown reaches this value, even without sentence-final punctuation.
				9	my $MAX_SENTENCE_LENGTH=10000;
				# Prefix of every metadata/comment line written to the output.
				10	my $COMMENT_START="#";
				11
				# When $test is true, the main loop stops after a few documents
				# ($text_no counts the documents seen so far).
				12	my $test=0;
				13	my $text_no=0;
				# Command line options as filled in by getopts() below.
				14	my %opts;
				# docid => primary text, filled by fetch_plaintext().
				15	my %plain_texts;
				16	my $usage=<<EOF;
				17	Usage: $0 [options] ZIPFILE [ZIPFILE...]
				18
				19	Options:
				20	-p pattern
				21
				22	Description:
				23	Convert KorAP-XML morpho zip to CoNLL(-U) format with all information necessary
				24	for reconstruction in comment lines.
				25
				26	Examples:
				27	$0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
				28
				29	ZIPSIGLEPATTERN='-x "15/FEB" "15/MAR"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
				30
				31	Results will be written to stdout
				32	EOF
				33
				# Accepted switches: -d (debug output on STDERR), -h (print usage and exit),
				# -p PATTERN (inserted into the zip member globs in the main loop).
				34	getopts('dhp:', \%opts);
				# NOTE(review): "\|\|" looks like an extraction-escaped "||" - confirm against the repository.
				35	die $usage if($opts{h} \|\| @ARGV == 0);
				36	my $debug=($opts{d}? 1 : 0);
				37
				# Per-document / per-token parsing state.
				38	my $docid="";
				# Values taken from the most recent <span> line, plus the token text cut
				# out of the primary text for that span.
				39	my ($current_id, $current_from, $current_to, $token);
				# CoNLL text of the sentence currently being assembled.
				40	my $current;
				# Counters updated while reading lemma and pos/ctag features; their sum decides
				# whether a sentence is printed and when it has to be force-closed.
				41	my ($unknown, $known) = (0, 0);
				# One [from, to] pair per token of the sentence currently being assembled.
				42	my @current_lines;
				# Zip member names that were already converted (used to skip duplicates).
				43	my %processedFilenames;
				# Extra arguments appended verbatim to both unzip command lines.
				44	my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
				45
				# Positions of the ten columns within a CoNLL-U row (@conll in the main loop).
				46	my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
				47
				# Main loop: handle every file named on the command line.
				# A file that already looks like CoNLL(-U) is copied to stdout unchanged.
				# Anything else is treated as a morpho zip: its morpho.xml members are
				# streamed through "unzip -c" and converted token by token, while the token
				# strings are cut out of the primary text read from the matching data zip.
				48	foreach my $morpho_zip (@ARGV) {
				49	die "cannot open $morpho_zip" if(! -r $morpho_zip);
				50	my $data_zip = $morpho_zip;
				# Pass-through for inputs whose name contains .conll/.conllu but not .zip.
				51	if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
				52	open(CONLL, "<$data_zip") or die "cannot open $data_zip";
				53	while(<CONLL>) {
				54	print;
				55	}
				56	close(CONLL);
				57	next;
				58	}
				# Drop the last dot-separated infix before ".zip" to get the data zip name
				# (e.g. zca15.tree_tagger.zip -> zca15.zip); the removed infix is the foundry.
				# NOTE(review): if the substitution does not match, $1 is not set by this
				# statement and $foundry may be undefined or stale.
				59	$data_zip =~ s/\.([^.]+)\.zip$/.zip/;
				60	my $foundry = $1;
				61	die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);
				62
				# $first is true until the first document of this zip has been started.
				63	my $first=1;
				64	my $pattern = (defined($opts{p})? $opts{p} : '');
				# The CoNLL row under construction; "_" marks an empty column.
				65	my @conll = ("_") x 10;
				66	my $filename;
				67
				# NOTE(review): the member globs on the next lines look damaged by extraction
				# (wildcards such as "*" seem to be missing, and "\|" is presumably a plain
				# "|" that turns the string into a read-from-command pipe) - confirm against
				# the repository before relying on them.
				68	my $morphocommand = "unzip -c $morpho_zip '/${pattern}///morpho.xml' $zipsiglepattern \|";
				69	print STDERR $morphocommand, "\n";
				70	open (MORPHOPIPE, $morphocommand) or die "cannot unzip $morpho_zip";
				71	open (PLAINTEXTPIPE, "unzip -c $data_zip '/${pattern}/*/data.xml' $zipsiglepattern \|") or die "cannot unzip $data_zip";
				72	print "$COMMENT_START foundry = $foundry\n";
				73	while (<MORPHOPIPE>) {
				# An "inflating: NAME" line in the unzip output introduces the next zip member.
				74	if (/\s+inflating:\s+(.*)/) {
				75	$filename=$1;
				# Skip members that were already converted: read ahead to the next
				# "inflating:" line until an unprocessed member or the end of the pipe is reached.
				76	while($processedFilenames{$filename} && !eof(MORPHOPIPE)) {
				77	print STDERR "WARNING: $filename already processed\n";
				78	while (<MORPHOPIPE>) {
				79	last if(/\s+inflating:\s+(.*)/);
				80	}
				81	$filename=$1 if(!eof(MORPHOPIPE) && /\s+inflating:\s+(.*)/);
				82	}
				# A <layer ... docid="..."> line starts a new document.
				83	} elsif(m@<layer\s+.*docid="([^"]+)"@) {
				# In test mode stop after a few documents.
				84	last if($test && $text_no++ > 3);
				# Flush whatever is left of the previous document before resetting the state.
				85	if(!$first) {
				86	closeDoc(0);
				87	}
				88	$processedFilenames{$filename}=1;
				89	$docid=$1;
				90	@current_lines=();
				91	$known=$unknown=0;
				92	$current="";
				93	if ($first) {
				94	$first = 0;
				95	}
				# If no primary text is reported for this document, skip its annotations
				# up to the closing </layer>.
				# NOTE(review): the filename/text_id comment lines below are printed even
				# for a skipped document - confirm that this is intended.
				96	if(!fetch_plaintext($docid)) { # skip this text
				97	while (<MORPHOPIPE>) {
				98	last if(m@</layer>@);
				99	}
				100	}
				101	print STDOUT "$COMMENT_START filename = $filename\n$COMMENT_START text_id = $docid\n";
				102	print STDERR "Analyzing $docid\n" if ($debug);
				# A feature line <f name="...">value</f> fills one column of the current row.
				103	} elsif (m@<f\s+.*name="([^"]+)">([^<]+)</f>@) {
				104	if ($1 eq "lemma") {
				105	$conll[$LEMMA_idx] = $2;
				106	$conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
				# The lemma value UNKNOWN is written as "--" and counted separately.
				107	if($conll[$LEMMA_idx] eq 'UNKNOWN') {
				108	$conll[$LEMMA_idx] = "--";
				109	$unknown++;
				110	} else {
				111	$known++;
				112	}
				# pos/ctag values go into both the UPOS and the XPOS column.
				# NOTE(review): $unknown is also incremented here, so $known + $unknown
				# counts lemma and pos/ctag features, not tokens - confirm that this is intended.
				113	} elsif ($1 eq 'pos' \|\| $1 eq "ctag") {
				114	$unknown++;
				115	$conll[$XPOS_idx] = $conll[$UPOS_idx] = $2;
				116	} elsif ($1 eq 'msd') {
				117	$conll[$FEATS_idx] = $2;
				118	} elsif ($1 eq 'certainty') {
				119	$conll[$MISC_idx] = $2;
				120	}
				# A <span ...> line starts a new token: read its id and character offsets.
				121	} elsif (/<span /) {
				# NOTE(review): as written this pattern only matches an id that consists of
				# exactly one non-digit followed by one character; quantifiers were
				# presumably lost in extraction - confirm against the repository.
				122	($current_id) = /id="[^0-9]([^\"])"/;
				123	($current_from) = /from="([^\"]*)"/;
				124	($current_to) = /to="([^\"]*)"/;
				125	print STDERR "found span: $current_id $current_from $current_to\n" if($debug);
				# Cut the token out of the primary text; at most 32 characters are taken.
				126	$token = substr($plain_texts{$docid}, $current_from, min($current_to - $current_from, 32));
				127	if (!defined $token) {
				128	print STDERR "WARNING: could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10), "\n";
				129	$token = "_";
				130	}
				131	$token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
				# Start a fresh row for this token; the form is re-encoded as UTF-8 for output.
				132	@conll = ("_") x 10;
				133	$conll[$FORM_idx] = encode("utf-8", $token);
				# The first </fs> after a span completes the token: append its row to the
				# current sentence.
				134	} elsif (m@</fs>@) {
				135	my @vals = ($current_from, $current_to);
				136	print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
				137	push @current_lines, \@vals;
				138	# convert gathered information to CONLL
				# The token id is the 1-based position within the current sentence.
				139	$conll[$ID_idx] = $#current_lines+1;
				140	$current .= join("\t", @conll) . "\n"; # conll columns
				# A sentence ends at the tag "$.", at the tag "SENT" on a "." token, or
				# when the counter sum reaches $MAX_SENTENCE_LENGTH.
				141	if($conll[$XPOS_idx] eq '$.' \|\| ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') \|\| $known + $unknown >= $MAX_SENTENCE_LENGTH) {
				142	$current .= "\n";
				143	if($known + $unknown > 0) { # only print sentence if it contains some words
				144	printTokenRanges();
				145	print STDOUT $current;
				146	}
				147	$current=""; $known=0; $unknown=0;
				148	@current_lines = ();
				149	}
				# Skip the remaining lines of this span, so that only its first
				# feature structure is used.
				150	while (<MORPHOPIPE>) {
				151	last if (m@</span>@); # only consider first interpretation
				152	}
				153	}
				154	}
				# End of the morpho stream: flush the last, possibly unterminated sentence.
				155	$current .= "\n";
				156	closeDoc(1);
				157	close(MORPHOPIPE);
				158	close(PLAINTEXTPIPE);
				159	}
				160	exit;
				161
				# Print the two offset comment lines that precede a sentence.
				#
				# Reads the file-level @current_lines (one [from, to] pair per token) and
				# $COMMENT_START. Each line carries the sentence-level offset first (start of
				# the first token / end of the last token), followed by the offsets of all
				# tokens, separated by single blanks.
				sub printTokenRanges {
				    # Sentence start first, then the start of every token.
				    my @starts = ($current_lines[0]->[0], map { $_->[0] } @current_lines);
				    print "$COMMENT_START start_offsets = ", shift(@starts);
				    print STDOUT " $_" for @starts;

				    # Sentence end first, then the end of every token.
				    my @ends = ($current_lines[$#current_lines]->[1], map { $_->[1] } @current_lines);
				    print "\n$COMMENT_START end_offsets = ", shift(@ends);
				    print STDOUT " $_" for @ends;

				    print "\n";
				}
				173
				# Flush the sentence that is still pending when a document ends.
				#
				# Takes one argument (the callers pass 1 after the last document of a zip
				# and 0 otherwise); the value is currently not used. Nothing is printed
				# unless the file-level counters $known + $unknown show that the pending
				# sentence has content.
				sub closeDoc {
				    my ($is_last) = @_;
				    print STDERR "closing doc\n" if($debug);

				    # only parse a sentence if it has some words
				    return unless ($known + $unknown > 0);

				    # Strip up to three trailing newlines, then terminate the sentence
				    # with exactly one blank line.
				    chomp $current for (1 .. 3);
				    $current .= "\n\n";
				    printTokenRanges();
				    print STDOUT $current;
				}
				186
				# Decode one chunk of raw data.xml bytes and normalise it.
				#
				# The chunk is decoded from UTF-8, the XML entities &lt; &gt; &amp; are
				# resolved, and typographic dots, quotes, apostrophes and dashes are replaced
				# one-for-one by their ASCII counterparts. Returns the resulting character
				# string.
				sub _normalize_plaintext {
				    my ($raw) = @_;
				    # Lexically scoped: the tr/// lists below are meant as characters. Without
				    # this pragma they would be read as single bytes and would not line up with
				    # the decoded text.
				    use utf8;
				    my $text = decode("utf-8", $raw, Encode::FB_DEFAULT);
				    $text =~ s/&lt;/</g;
				    $text =~ s/&gt;/>/g;
				    $text =~ s/&amp;/&/g;    # last, so that "&amp;lt;" ends up as "&lt;"
				    $text =~ tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				    return $text;
				}

				# Read data.xml to figure out the tokens
				# (ideally the tokens would also be in morpho.xml, but they are not).
				#
				# Reads PLAINTEXTPIPE forward until the text of $target_id has been read and
				# stores every text met on the way in %plain_texts (docid => text), so that
				# documents arriving in a different order than in the morpho zip are still
				# available later. Line breaks inside a <text> element become single blanks.
				#
				# If the environment variable PLAINTEXTFILTER is set, it is used as a regular
				# expression: a text that does not match it is discarded and reported as
				# "Skipping".
				#
				# Returns 1 if the document should be processed, and a false value if it was
				# filtered out.
				sub fetch_plaintext {
				    my ($target_id) = @_;
				    my $docid;
				    my $text_started = 0;

				    # Only scan the pipe if the text was not collected during an earlier call.
				    # A lexical line variable is used so that the caller's $_ stays untouched.
				    if (!$plain_texts{$target_id}) {
				        while (my $line = <PLAINTEXTPIPE>) {
				            if ($line =~ /<raw_text[^>]+docid="([^"]*)/) {
				                $docid = $1;
				                $text_started = 0;
				            }
				            elsif ($line =~ m@<text>(.*)</text>@) {
				                # the whole text is on one line
				                $plain_texts{$docid} = _normalize_plaintext($1);
				                last if ($docid eq $target_id);
				            }
				            elsif ($line =~ m@<text>(.*)@) {
				                # first line of a text that spans several lines
				                $plain_texts{$docid} = _normalize_plaintext($1) . ' ';
				                $text_started = 1;
				            }
				            elsif ($text_started && $line =~ m@(.*)</text>@) {
				                # last line of a multi-line text
				                $plain_texts{$docid} .= _normalize_plaintext($1);
				                $text_started = 0;
				                last if ($docid eq $target_id);
				            }
				            elsif ($text_started) {
				                # inner line of a multi-line text
				                chomp $line;
				                $plain_texts{$docid} .= _normalize_plaintext($line) . ' ';
				            }
				        }
				    }

				    # The filter is applied to the requested document, whether it was just
				    # read or had been collected before.
				    if (defined($ENV{PLAINTEXTFILTER})) {
				        my $text = $plain_texts{$target_id};
				        if (!defined($text) || $text !~ $ENV{PLAINTEXTFILTER}) {
				            $plain_texts{$target_id} = undef;
				            print STDERR "Skipping $target_id\n";
				            return;
				        }
				        print STDERR "Using $target_id\n";
				    }
				    return(1);
				}