Blame - script/korapxml2conllu - KorAP/KorAP-XML-CoNLL-U

blob: 9dcec4211aee9a2e5b66f49e6a0b1e5364c99602 [file] [log] [blame]

Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
				4	use POSIX;
				5	use Getopt::Std;
				6	use Encode;
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	7
				8	my $MAX_SENTENCE_LENGTH=10000;
				9	my $COMMENT_START="#";
				10
				11	my $test=0;
				12	my $text_no=0;
				13	my %opts;
				14	my %plain_texts;
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	15	my %sentence_ends;
				16
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	17	my $usage=<<EOF;
				18	Usage: $0 [options] ZIPFILE [ZIPFILE...]
				19
				20	Options:
				21	-p pattern
				22
				23	Description:
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	24	Convert KorAP-XML base or morpho zips to CoNLL(-U) format with all information necessary
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	25	for reconstruction in comment lines.
				26
				27	Examples:
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	28	$0 /vol/corpora/DeReKo/current/KorAP/zip/zca20.zip
				29
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	30	$0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
				31
				32	ZIPSIGLEPATTERN='-x "15/FEB" "15/MAR"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
				33
				34	Results will be written to stdout
				35	EOF
				36
				37	getopts('dhp:', \%opts);
				38	die $usage if($opts{h} \|\| @ARGV == 0);
				39	my $debug=($opts{d}? 1 : 0);
				40
				41	my $docid="";
				42	my ($current_id, $current_from, $current_to, $token);
				43	my $current;
				44	my ($unknown, $known) = (0, 0);
				45	my @current_lines;
				46	my %processedFilenames;
				47	my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	48	my $baseOnly;
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	49
				50	my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
				51
# Locate the unzip executable via the shell; all archive access below
# goes through it.
my $UNZIP = `sh -c 'command -v unzip'`;
$UNZIP = '' unless defined $UNZIP;  # backticks yield undef if sh cannot be run
chomp $UNZIP;

# A bare "return" is not allowed at file scope (it is a fatal
# "Can't return outside a subroutine"), so abort explicitly instead.
if ($UNZIP eq '') {
    die "No unzip executable found in PATH.\n";
}
				60
# Quote a string so it survives interpolation into a POSIX shell command
# line as a single word (file names may contain spaces or quotes).
my $shell_quote = sub {
    my ($arg) = @_;
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
};

# Convert every input archive in turn. For a morpho zip NAME.FOUNDRY.zip the
# primary text is read from the sibling base zip NAME.zip; for a base zip both
# tokens and text/structure come from the same archive.
foreach my $morpho_zip (@ARGV) {
    die "cannot open $morpho_zip" if (! -r $morpho_zip);
    my $data_zip = $morpho_zip;

    # Inputs that already are CoNLL(-U) files are passed through unchanged.
    if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
        open(my $conll_fh, '<', $data_zip) or die "cannot open $data_zip";
        while (my $conll_line = <$conll_fh>) {
            print $conll_line;
        }
        close($conll_fh);
        next;
    }

    # Strip the foundry component to get the base zip; only trust $1 when the
    # substitution actually matched.
    my $foundry = ($data_zip =~ s/\.([^.]+)\.zip$/.zip/) ? $1 : "base";
    die "cannot open data file $data_zip corresponding to $morpho_zip" if (! -r $data_zip);

    my $first = 1;
    my $pattern = (defined($opts{p}) ? $opts{p} : '');
    my @conll = ("_") x 10;
    my $filename;

    $baseOnly = $morpho_zip eq $data_zip;
    my $quoted_morpho_zip = $shell_quote->($morpho_zip);
    my $quoted_data_zip = $shell_quote->($data_zip);
    my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
    if (!$baseOnly) {
        $morphoOrTokenCommand = "$UNZIP -c $quoted_morpho_zip "
            . $shell_quote->("*/${pattern}*/*/*/morpho.xml") . " $zipsiglepattern";
        $plaintextAndStructureCommand = "$UNZIP -c $quoted_data_zip "
            . $shell_quote->("*/${pattern}*/*/data.xml") . " $zipsiglepattern";
    } else {
        $foundry = "base";
        $morphoOrTokenCommand = "$UNZIP -c $quoted_morpho_zip "
            . $shell_quote->("*/${pattern}*/*/*/tokens.xml") . " $zipsiglepattern";
        # [sd][ta]*.xml selects both data.xml and structure.xml.
        $plaintextAndStructureCommand = "$UNZIP -c $quoted_data_zip "
            . $shell_quote->("*/${pattern}*/*/[sd][ta]*.xml") . " $zipsiglepattern";
    }

    # $zipsiglepattern is deliberately shell syntax, so the commands are run
    # through the shell; PLAINTEXTPIPE is read by fetch_plaintext().
    open(MORPHO_OR_TOKENPIPE, '-|', $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
    open(PLAINTEXTPIPE, '-|', $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
    print "$COMMENT_START foundry = $foundry\n";
    while (<MORPHO_OR_TOKENPIPE>) {
        if (/^\s+inflating:\s+(.*)/) {
            # unzip announces each extracted member; skip members seen before.
            $filename = $1;
            while ($processedFilenames{$filename} && !eof(MORPHO_OR_TOKENPIPE)) {
                print STDERR "WARNING: $filename already processed\n";
                while (<MORPHO_OR_TOKENPIPE>) {
                    last if (/\s+inflating:\s+(.*)/);
                }
                $filename = $1 if (!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
            }
        } elsif (m@^\s*<layer\s+.*docid="([^"]+)"@) {
            # Start of a new document: flush the previous one and reset state.
            my $layer_docid = $1;
            last if ($test && $text_no++ > 3);
            closeDoc(0) if (!$first);
            $processedFilenames{$filename} = 1;
            $docid = $layer_docid;
            @current_lines = ();
            $known = $unknown = 0;
            $current = "";
            $first = 0;
            if (!fetch_plaintext($docid)) { # skip this text
                while (<MORPHO_OR_TOKENPIPE>) {
                    last if (m@</layer>@);
                }
            }
            print STDOUT "$COMMENT_START filename = $filename\n$COMMENT_START text_id = $docid\n";
            print STDERR "Analyzing $docid\n" if ($debug);
        } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
            # A single annotation feature of the current token.
            my ($feature, $value) = ($1, $2);
            if ($feature eq "lemma") {
                $conll[$LEMMA_idx] = $value;
                $conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
                if ($conll[$LEMMA_idx] eq 'UNKNOWN') {
                    $conll[$LEMMA_idx] = "--";
                    $unknown++;
                } else {
                    $known++;
                }
            } elsif ($feature eq 'pos' || $feature eq "ctag") {
                $unknown++;
                $conll[$XPOS_idx] = $conll[$UPOS_idx] = $value;
            } elsif ($feature eq 'msd') {
                $conll[$FEATS_idx] = $value;
            } elsif ($feature eq 'certainty') {
                $conll[$MISC_idx] = $value;
            }
        } elsif (/<span /) {
            # A token span: cut its surface form out of the primary text.
            ($current_id) = /id="[^0-9]*([^\"]*)"/;
            ($current_from) = /from="([^\"]*)"/;
            ($current_to) = /to="([^\"]*)"/;
            print STDERR "found span: $current_id $current_from $current_to\n" if ($debug);
            $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
            if (!defined $token) {
                print STDERR "WARNING: could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10), "\n";
                $token = "_";
            }
            $token =~ s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
            @conll = ("_") x 10;
            $conll[$FORM_idx] = encode("utf-8", $token);
            if ($baseOnly) {
                # Base zips carry no <fs> elements, so the token line is
                # emitted here and sentences end where the structure layer says.
                my @vals = ($current_from, $current_to);
                print STDERR "joining : ", join(" ", @vals), "\n" if ($debug);
                push @current_lines, \@vals;
                $known++;
                $conll[$ID_idx] = $#current_lines+1;
                $current .= join("\t", @conll) . "\n"; # conll columns
                fetch_plaintext($docid);
                if ($sentence_ends{$docid}{$current_to}) {
                    $current .= "\n";
                    printTokenRanges();
                    print STDOUT $current;
                    $current = "";
                    $known = 0;
                    $unknown = 0;
                    @current_lines = ();
                }
            }
        } elsif (m@^\s*</fs>@) {
            my @vals = ($current_from, $current_to);
            print STDERR "joining : ", join(" ", @vals), "\n" if ($debug);
            push @current_lines, \@vals;
            # convert gathered information to CONLL
            $conll[$ID_idx] = $#current_lines+1;
            $current .= join("\t", @conll) . "\n"; # conll columns
            # Sentence boundary: sentence-final punctuation tag or length cap.
            if ($conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
                $current .= "\n";
                if ($known + $unknown > 0) { # only print sentence if it contains some words
                    printTokenRanges();
                    print STDOUT $current;
                }
                $current = ""; $known = 0; $unknown = 0;
                @current_lines = ();
            }
            while (<MORPHO_OR_TOKENPIPE>) {
                last if (m@</span>@); # only consider first interpretation
            }
        }
    }
    $current .= "\n";
    closeDoc(1);
    close(MORPHO_OR_TOKENPIPE);
    close(PLAINTEXTPIPE);
}
exit;
				201
				202	sub printTokenRanges {
				203	print "$COMMENT_START start_offsets = ", $current_lines[0]->[0];
				204	foreach my $t (@current_lines) {
				205	print STDOUT " $t->[0]";
				206	}
				207	print "\n$COMMENT_START end_offsets = ", $current_lines[$#current_lines]->[1];
				208	foreach my $t (@current_lines) {
				209	print STDOUT " $t->[1]";
				210	}
				211	print "\n";
				212	}
				213
				214	sub closeDoc {
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	215	print STDERR "closing doc\n" if($debug);
				216	if($known + $unknown > 0) { # only parse a sentence if it has some words
				217	chomp $current;
				218	chomp $current;
				219	chomp $current;
				220	$current .= "\n\n";
				221	printTokenRanges();
				222	print STDOUT $current;
				223	}
				224	}
				225
				226	# read data.xml to figure out the tokens
				227	# (ideally tokens should also be in in morpho.xml, but they are not)
				228	sub fetch_plaintext {
				229	my ($target_id) = @_;
				230	my $docid;
				231	my $text_started=0;
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	232	my ($current_id, $current_from, $current_to);
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	233
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	234	if($plain_texts{$target_id} && (!$baseOnly \|\| $sentence_ends{$target_id}{-1})) {
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	235	# print STDERR "already got $target_id\n";
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	236	return 1;
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	237	}
				238	while(<PLAINTEXTPIPE>) {
				239	if(/<raw_text[^>]+docid="([^"]*)/) {
				240	$docid=$1;
				241	$text_started=0;
Marc Kupietz	d845583	2021-02-11 17:30:29 +0100	[diff] [blame]	242	} elsif(/<layer[^>]+docid="([^"]*)/) {
				243	$docid=$1;
				244	$sentence_ends{$docid}{-1}=1;
				245	} elsif(m@<span @) {
				246	($current_id) = /id="[^0-9]([^\"])"/;
				247	($current_from) = /from="([^\"]*)"/;
				248	($current_to) = /to="([^\"]*)"/;
				249	} elsif(m@<f\s[^>]*>s</f>@) {
				250	print STDERR "Found sentence end for $docid \@$current_to\n" if($debug);
				251	$sentence_ends{$docid}{$current_to}=1;
Marc Kupietz	5e7f20a	2020-02-17 18:17:11 +0100	[diff] [blame]	252	} elsif (m@<text>(.*)</text>@) {
				253	$_= decode("utf-8", $1, Encode::FB_DEFAULT);
				254	s/</</go;
				255	s/>/>/go;
				256	s/&/&/go;
				257	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				258	$plain_texts{$docid} = $_;
				259	last if($docid eq $target_id);
				260	} elsif (m@<text>(.*)@) {
				261	$_= decode("utf-8", $1, Encode::FB_DEFAULT);
				262	s/</</go;
				263	s/>/>/go;
				264	s/&/&/go;
				265	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				266	$plain_texts{$docid} = "$_ ";
				267	$text_started=1;
				268	} elsif ($text_started && m@(.*)</text>@) {
				269	$_= decode("utf-8", $1, Encode::FB_DEFAULT);
				270	s/</</go;
				271	s/>/>/go;
				272	s/&/&/go;
				273	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				274	$plain_texts{$docid} .= $_;
				275	$text_started=0;
				276	last if($docid eq $target_id);
				277	} elsif ($text_started) {
				278	chomp;
				279	$_ = decode("utf-8", $_, Encode::FB_DEFAULT) . ' ';
				280	s/</</go;
				281	s/>/>/go;
				282	s/&/&/go;
				283	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
				284	$plain_texts{$docid} .= $_;
				285	}
				286	}
				287	if(defined($ENV{PLAINTEXTFILTER})) {
				288	if ($plain_texts{$docid} !~ $ENV{PLAINTEXTFILTER}) {
				289	$plain_texts{$docid} = undef;
				290	print STDERR "Skipping $docid\n";
				291	return(undef);
				292	} else {
				293	print STDERR "Using $docid\n";
				294	}
				295	}
				296	return(1);
				297	}