Blame - tools/scramble_korapxml.pl - KorAP/KorAP-XML-Krill

blob: b20a77f282e85c0fb3bf703b12a983e1bf0b55eb [file] [log] [blame]

Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	1	#!/usr/bin/env perl
				2	use Mojo::Base -strict;
				3	use Mojo::DOM;
				4	use Mojo::File qw'path';
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	5	use Mojo::JSON qw'decode_json';
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	6	use Mojo::ByteStream 'b';
				7	use String::Random;
				8	use Pod::Usage;
				9	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
				10
				11	#############################################################
				12	# This helper tool iterates over a single KorAP-XML files #
				13	# and randomizes all word strings occurring following #
				14	# several rules. This is useful to create example files #
				15	# based on corpora that can't be published. #
				16	# (c) IDS Mannheim #
				17	#############################################################
				18
				19	my %ERROR_HASH = (
				20	-sections => 'NAME\|SYNOPSIS',
				21	-verbose => 99,
				22	-output => '-',
				23	-exit => 1
				24	);
				25
				26	my ($orig_folder, $scr_folder);
				27	GetOptions(
				28	'input\|i=s' => \$orig_folder,
				29	'output\|o=s' => \$scr_folder,
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	30	'rules\|r=s' => \(my $rule_file),
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	31	'help\|h' => sub {
				32	pod2usage(
				33	-sections => 'NAME\|SYNOPSIS\|DESCRIPTION\|ARGUMENTS\|OPTIONS',
				34	-verbose => 99,
				35	-output => '-'
				36	);
				37	}
				38	);
				39
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	40	unless ($orig_folder \|\| $scr_folder \|\| $rule_file) {
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	41	pod2usage(%ERROR_HASH);
				42	};
				43
				44	my $string_gen = String::Random->new;
				45
				46	# Remember all generated pairs orig -> random
				47	my %replacements = ();
				48	my $offset = 0;
				49	my @offsets = ();
				50
				51	# Turn a word into a random word with similar characteristics
				52	sub get_rnd_word {
				53	my $o_word = shift;
				54	return $o_word unless $o_word =~ /[a-z]/i;
				55
				56	# Return the old replacement
				57	if ($replacements{$o_word}) {
				58	return $replacements{$o_word};
				59	};
				60
				61	my $word = $o_word;
				62
				63	# Turn the word into a pattern for String::Random
				64	# c: Any Latin lowercase character [a-z]
				65	# C: Any Latin uppercase character [A-Z]
				66	# n: Any digit [0-9]
				67	# !: A punctuation character
				68	$word =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]\|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/;
				69	$word =~ s/[^Ccn!]/n/g;
				70	$replacements{$o_word} = $string_gen->randpattern($word);
				71	}
				72
				73	# 1. Load data.xml
				74	# replace all surface forms of /[a-z]/
				75	# with character strings of the same length, randomly created.
				76	# Create an array, accessible by offsets.
				77	my $data_file = $orig_folder . '/data.xml';
				78	# Process the data file and replace all surface words with random words
				79	my $data = Mojo::File->new($data_file)->slurp;
				80	my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
				81	my $new_text = b($dom->at('text')->text)->split(
				82	" "
				83	)->map(
				84	sub {
				85	my $token = get_rnd_word($_);
				86	$offsets[$offset] = $token;
				87	# print $offset, ':', $_, ':', $token,"\n";
				88	$offset += length($token);
				89	$offset++; # space
				90
				91	# exit if $offset > 300;
				92	return $token;
				93	}
				94	)->join(
				95	" "
				96	);
				97	$dom->at('text')->content($new_text);
				98
				99	# Create folder
				100	path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode);
				101
				102
				103	# 2. Take some css selectors and rename attributes,
				104	# either according to the surface form ("=") or
				105	# somehow derived ("^"), or random as well ("~"),
				106	# based on the given content, that can be randomized and
				107	# stuffed in a hash as well.
				108	# If no CSS rules are parsed, the file will just be copied.
				109
Akron	b61d6df	2021-02-16 10:52:12 +0100	[diff] [blame^]	110	if ($rule_file) {
				111	$rule_file = Mojo::File->new($rule_file);
				112	if (-e $rule_file) {
				113	my $rules = decode_json $rule_file->slurp;
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	114
Akron	b61d6df	2021-02-16 10:52:12 +0100	[diff] [blame^]	115	foreach my $rule (@$rules) {
				116	scramble(@$rule);
				117	};
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	118	};
				119	};
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	120
				121	# Scramble an annotation file
				122	sub scramble {
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	123	my ($input, $rules) = @_;
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	124	my $data_file = path($orig_folder)->child($input);
				125
				126	unless (-f $data_file) {
				127	warn "$data_file does not exist";
				128	return;
				129	};
				130
				131	my $data = $data_file->slurp;
				132
				133	# Only transfer if rules exist
				134	if ($rules) {
				135	my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
				136
				137	foreach (@$rules) {
Akron	cbf098a	2020-04-27 17:56:42 +0200	[diff] [blame]	138	if ($input =~ /header\.xml$/) {
				139	transform_header($dom, $_->[0]);
				140	} else {
				141	transform($dom, $_->[0], $_->[1]);
				142	};
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	143	};
				144
				145	$data = b($dom->to_string)->encode;
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	146	};
				147
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	148	my $file = Mojo::File->new($scr_folder)->child($input);
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	149	path($file->dirname)->make_path;
				150	$file->spurt($data);
				151	};
				152
				153
				154	# Iterate over an annotation document and scramble
				155	# all textual content based on CSS rules
				156	sub transform {
				157	my ($dom, $selector, $rule) = @_;
				158
				159	$dom->find("spanList > span")->each(
				160	sub {
				161	my $from = $_->attr("from");
				162	my $to = $_->attr("to");
				163	$_->find($selector)->each(
				164	sub {
				165	my $word = $_->text;
				166
				167	unless ($offsets[$from]) {
				168	# warn '!!! Unknown word at ' . $from . '!';
				169	$_->content('UNKN');
				170	return;
				171	};
				172
				173	# The derive rule means that the original
				174	# word is taken and appended the string 'ui'
				175	if ($rule eq '^') {
				176	my $deriv = $offsets[$from];
				177	chop($deriv);
				178	chop($deriv);
				179	$_->content($deriv . 'ui');
				180
				181	}
				182
				183	# The random rule means the word is replaced by
				184	# with a random word with the same characterisms.
				185	elsif ($rule eq '~') {
				186	$_->content(get_rnd_word($word));
				187	}
				188
				189	# Any other rule means, that the original word
				190	# from the character data is taken.
				191	else {
				192	$_->content($offsets[$from])
				193	}
				194	}
				195	)
				196	}
				197	)
				198	};
				199
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	200
Akron	cbf098a	2020-04-27 17:56:42 +0200	[diff] [blame]	201	# Transform header file
				202	sub transform_header {
				203	my ($dom, $selector) = @_;
				204
				205	$dom->find($selector)->each(
				206	sub {
				207	my $word = $_->text;
				208
				209	# The random rule means the word is replaced by
				210	# with a random word with the same characterisms.
				211	$_->content(get_rnd_word($word));
				212	}
				213	)
				214	};
				215
				216
				217
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	218	__END__
				219
				220	=pod
				221
				222	=encoding utf8
				223
				224	=head1 NAME
				225
				226	scramble_korapxml.pl - Merge KorAP-XML data and create Krill documents
				227
				228
				229	=head1 SYNOPSIS
				230
				231	scramble_korapxml.pl -i <input-directory> -o <output-directory>
				232
				233
				234	=head1 DESCRIPTION
				235
				236	This helper tool iterates over a single KorAP-XML folder
				237	and randomizes all word strings occurring following
				238	several rules. This is useful to create example files
				239	based on corpora that can't be published.
				240
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	241
				242	=head1 OPTIONS
				243
				244	=over 2
				245
				246	=item B<--input\|-i> <directory>
				247
				248	The unscrambled KorAP-XML directory.
				249
				250
				251	=item B<--output\|-o> <directory>
				252
				253	The output directory
				254
				255
				256	=item B<--rules\|-r> <file>
				257
				258	The rule file for transformation as a json file.
				259	Example:
				260
				261	[
				262	[
				263	"dgd/annot.xml",
				264	[
				265	["f[name=trans]", "="],
				266	["f[name=lemma]", "^"],
				267	["f[name=pos]", "~"]
				268	]
				269	],
				270	["struct/structure.xml"]
				271	]
				272
				273	All elements of the json list are copied from the input directory to
				274	the output directory.
				275	The C<data.xml> file will be automatically coppied and scrambled.
				276	If the file name is followed by a rule set, these
				277	CSS selector rules followed by a transformation type marker
				278	are used to transform elements of the file.
				279
Akron	cbf098a	2020-04-27 17:56:42 +0200	[diff] [blame]	280	All CSS selectors for annotation files
				281	are nested in C<spanList > span>.
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	282
				283	The following markers are supported:
				284
				285	=over 4
				286
				287	=item B<=>
				288
				289	Take the scrambled surface form from the C<data.xml>.
				290
				291	=item B<^>
				292
				293	Take the scrambled surface form from the C<data.xml> and
				294	modify the term by appending the string C<ui>.
				295
				296	=item B<~>
				297
				298	Create a randomized string, keeping the characteristicts of
				299	the original element content.
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	300	Two identical words in a single run will always be transfered
				301	to the same target word.
				302
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	303	=back
				304
Akron	cbf098a	2020-04-27 17:56:42 +0200	[diff] [blame]	305	For header files, the rules are not nested and only the
				306	randomized marker C<~> is supported.
				307
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	308	=back