Blame - tools/scramble_korapxml.pl - KorAP/KorAP-XML-Krill

blob: 48d6396a10041c807f764af655d72fe5cebabd92 [file] [log] [blame]

Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame^]	1	#!/usr/bin/env perl
				2	use Mojo::Base -strict;
				3	use Mojo::DOM;
				4	use Mojo::File qw'path';
				5	use Mojo::ByteStream 'b';
				6	use String::Random;
				7	use Pod::Usage;
				8	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
				9
				10	#############################################################
				11	# This helper tool iterates over a single KorAP-XML files #
				12	# and randomizes all word strings occurring following #
				13	# several rules. This is useful to create example files #
				14	# based on corpora that can't be published. #
				15	# (c) IDS Mannheim #
				16	#############################################################
				17
				18	my %ERROR_HASH = (
				19	-sections => 'NAME\|SYNOPSIS',
				20	-verbose => 99,
				21	-output => '-',
				22	-exit => 1
				23	);
				24
				25	my ($orig_folder, $scr_folder);
				26	GetOptions(
				27	'input\|i=s' => \$orig_folder,
				28	'output\|o=s' => \$scr_folder,
				29	'help\|h' => sub {
				30	pod2usage(
				31	-sections => 'NAME\|SYNOPSIS\|DESCRIPTION\|ARGUMENTS\|OPTIONS',
				32	-verbose => 99,
				33	-output => '-'
				34	);
				35	}
				36	);
				37
				38	unless ($orig_folder \|\| $scr_folder) {
				39	pod2usage(%ERROR_HASH);
				40	};
				41
				42	my $string_gen = String::Random->new;
				43
				44	# Remember all generated pairs orig -> random
				45	my %replacements = ();
				46	my $offset = 0;
				47	my @offsets = ();
				48
				49	# Turn a word into a random word with similar characteristics
				50	sub get_rnd_word {
				51	my $o_word = shift;
				52	return $o_word unless $o_word =~ /[a-z]/i;
				53
				54	# Return the old replacement
				55	if ($replacements{$o_word}) {
				56	return $replacements{$o_word};
				57	};
				58
				59	my $word = $o_word;
				60
				61	# Turn the word into a pattern for String::Random
				62	# c: Any Latin lowercase character [a-z]
				63	# C: Any Latin uppercase character [A-Z]
				64	# n: Any digit [0-9]
				65	# !: A punctuation character
				66	$word =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]\|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/;
				67	$word =~ s/[^Ccn!]/n/g;
				68	$replacements{$o_word} = $string_gen->randpattern($word);
				69	}
				70
				71	# 1. Load data.xml
				72	# replace all surface forms of /[a-z]/
				73	# with character strings of the same length, randomly created.
				74	# Create an array, accessible by offsets.
				75	my $data_file = $orig_folder . '/data.xml';
				76	# Process the data file and replace all surface words with random words
				77	my $data = Mojo::File->new($data_file)->slurp;
				78	my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
				79	my $new_text = b($dom->at('text')->text)->split(
				80	" "
				81	)->map(
				82	sub {
				83	my $token = get_rnd_word($_);
				84	$offsets[$offset] = $token;
				85	# print $offset, ':', $_, ':', $token,"\n";
				86	$offset += length($token);
				87	$offset++; # space
				88
				89	# exit if $offset > 300;
				90	return $token;
				91	}
				92	)->join(
				93	" "
				94	);
				95	$dom->at('text')->content($new_text);
				96
				97	# Create folder
				98	path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode);
				99
				100
				101	# 2. Take some css selectors and rename attributes,
				102	# either according to the surface form ("=") or
				103	# somehow derived ("^"), or random as well ("~"),
				104	# based on the given content, that can be randomized and
				105	# stuffed in a hash as well.
				106	# If no CSS rules are parsed, the file will just be copied.
				107
				108	scramble('dgd/annot.xml' => [
				109	["f[name=trans]", "="],
				110	["f[name=lemma]", "^"],
				111	["f[name=pos]", "~"]
				112	] => 'dgd/annot.xml');
				113
				114	scramble('struct/structure.xml');
				115	scramble('header.xml');
				116
				117	# Scramble an annotation file
				118	sub scramble {
				119	my ($input, $rules, $output) = @_;
				120	my $data_file = path($orig_folder)->child($input);
				121
				122	unless (-f $data_file) {
				123	warn "$data_file does not exist";
				124	return;
				125	};
				126
				127	my $data = $data_file->slurp;
				128
				129	# Only transfer if rules exist
				130	if ($rules) {
				131	my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
				132
				133	foreach (@$rules) {
				134	transform($dom, $_->[0], $_->[1]);
				135	};
				136
				137	$data = b($dom->to_string)->encode;
				138	}
				139
				140	else {
				141
				142	# Just copy the data
				143	$output = $input;
				144	};
				145
				146	my $file = Mojo::File->new($scr_folder)->child($output);
				147	path($file->dirname)->make_path;
				148	$file->spurt($data);
				149	};
				150
				151
				152	# Iterate over an annotation document and scramble
				153	# all textual content based on CSS rules
				154	sub transform {
				155	my ($dom, $selector, $rule) = @_;
				156
				157	$dom->find("spanList > span")->each(
				158	sub {
				159	my $from = $_->attr("from");
				160	my $to = $_->attr("to");
				161	$_->find($selector)->each(
				162	sub {
				163	my $word = $_->text;
				164
				165	unless ($offsets[$from]) {
				166	# warn '!!! Unknown word at ' . $from . '!';
				167	$_->content('UNKN');
				168	return;
				169	};
				170
				171	# The derive rule means that the original
				172	# word is taken and appended the string 'ui'
				173	if ($rule eq '^') {
				174	my $deriv = $offsets[$from];
				175	chop($deriv);
				176	chop($deriv);
				177	$_->content($deriv . 'ui');
				178
				179	}
				180
				181	# The random rule means the word is replaced by
				182	# with a random word with the same characterisms.
				183	elsif ($rule eq '~') {
				184	$_->content(get_rnd_word($word));
				185	}
				186
				187	# Any other rule means, that the original word
				188	# from the character data is taken.
				189	else {
				190	$_->content($offsets[$from])
				191	}
				192	}
				193	)
				194	}
				195	)
				196	};
				197
				198	__END__
				199
				200	# Config data:
				201	{
				202	'/dgd/annot.xml' => [
				203	["f[name=norm]", "="],
				204	["f[name=lemma]", "^"],
				205	["f[name=pos]", "~"]
				206	],
				207	'/dgd/morpho.xml' => [
				208	["f[name=norm]", "="],
				209	["f[name=lemma]", "^"],
				210	["f[name=pos]", "~"]
				211	],
				212	'/dgd/nospeech.xml' => []
				213	}
				214
				215
				216	__END__
				217
				218	=pod
				219
				220	=encoding utf8
				221
				222	=head1 NAME
				223
				224	scramble_korapxml.pl - Merge KorAP-XML data and create Krill documents
				225
				226
				227	=head1 SYNOPSIS
				228
				229	scramble_korapxml.pl -i <input-directory> -o <output-directory>
				230
				231
				232	=head1 DESCRIPTION
				233
				234	This helper tool iterates over a single KorAP-XML folder
				235	and randomizes all word strings occurring following
				236	several rules. This is useful to create example files
				237	based on corpora that can't be published.
				238
				239	Two identical words in a single run will always be transfered
				240	to the same target word.
				241
				242	The C<data.xml> file will be scrambled automatically.