Blame - tools/scramble_korapxml.pl - KorAP/KorAP-XML-Krill

blob: 95e727e490e81c20be110da458266f747431f78b [file] [log] [blame]

Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	1	#!/usr/bin/env perl
				2	use Mojo::Base -strict;
				3	use Mojo::DOM;
				4	use Mojo::File qw'path';
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	5	use Mojo::JSON qw'decode_json';
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	6	use Mojo::ByteStream 'b';
				7	use String::Random;
				8	use Pod::Usage;
				9	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
				10
				11	#############################################################
				12	# This helper tool iterates over a single KorAP-XML folder #
				13	# and randomizes all word strings occurring following #
				14	# several rules. This is useful to create example files #
				15	# based on corpora that can't be published. #
				16	# (c) IDS Mannheim #
				17	#############################################################
				18
				# Options handed to pod2usage() when mandatory arguments are missing:
				# print the NAME and SYNOPSIS sections of the POD, with -exit set to 1.
				my %ERROR_HASH = (
				  -sections => 'NAME|SYNOPSIS',
				  -verbose  => 99,
				  -output   => '-',
				  -exit     => 1
				);

				# Command line arguments. $rule_file is declared inline in the
				# option list and stays visible for the rest of the script.
				my ($orig_folder, $scr_folder);
				GetOptions(
				  'input|i=s'  => \$orig_folder,
				  'output|o=s' => \$scr_folder,
				  'rules|r=s'  => \(my $rule_file),
				  'help|h'     => sub {
				    pod2usage(
				      -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS',
				      -verbose  => 99,
				      -output   => '-'
				    );
				  }
				);

				# Both directories are mandatory: data.xml is read from the input
				# directory and the scrambled copy is written to the output directory.
				# Checking them with "||" would let the script continue with one of
				# them undefined, so require both before doing any work.
				unless ($orig_folder && $scr_folder) {
				  pod2usage(%ERROR_HASH);
				}

				# Generator used to create the random replacement strings
				my $string_gen = String::Random->new;

				# Remember all generated pairs orig -> random
				my %replacements = ();

				# Running character offset into the scrambled text, and the
				# scrambled tokens indexed by the offset at which they start
				my $offset = 0;
				my @offsets = ();
				50
				# Replace a word by a random string of the same shape.
				#
				# Takes the original word and returns its replacement. Words that
				# contain no ASCII letter are returned unchanged. Every replacement
				# is stored in %replacements, so the same word is always mapped to
				# the same random string within one run.
				sub get_rnd_word {
				  my ($o_word) = @_;

				  # Nothing to scramble without at least one letter in [a-z]
				  if ($o_word !~ /[a-z]/i) {
				    return $o_word;
				  }

				  # Reuse the replacement generated for an earlier occurrence
				  my $known = $replacements{$o_word};
				  return $known if $known;

				  # Translate the word character by character into a pattern
				  # for String::Random:
				  #   c: Any Latin lowercase character [a-z]
				  #   C: Any Latin uppercase character [A-Z]
				  #   n: Any digit [0-9]
				  #   !: A punctuation character
				  my $pattern = $o_word;
				  $pattern =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]\|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/;

				  # Every character not covered by the translation becomes a digit
				  $pattern =~ s/[^Ccn!]/n/g;

				  # Store and return the freshly generated replacement
				  return $replacements{$o_word} = $string_gen->randpattern($pattern);
				}
				72
				# 1. Load data.xml and replace all whitespace-separated surface forms
				#    containing /[a-z]/ with randomly created character strings of
				#    the same length. The scrambled tokens are remembered in
				#    @offsets, indexed by the character offset at which they start
				#    in the new text.
				my $data_file = $orig_folder . '/data.xml';

				# Process the data file and replace all surface words with random words
				my $data = Mojo::File->new($data_file)->slurp;
				my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
				my $new_text = b($dom->at('text')->text)->split(
				  " "
				)->map(
				  sub {
				    my $token = get_rnd_word($_);

				    # Remember the token at its start offset
				    $offsets[$offset] = $token;

				    # Advance behind the token and the separating space
				    $offset += length($token);
				    $offset++;

				    return $token;
				  }
				)->join(
				  " "
				);

				# text() yields unescaped character data and content() parses its
				# argument as an XML fragment. Tokens may contain "<", ">" or "&"
				# (taken over from the source or generated as random punctuation),
				# so the text has to be escaped before it is inserted. The offsets
				# above are counted on the unescaped text.
				$dom->at('text')->content(b("$new_text")->xml_escape->to_string);

				# Create the output folder and write the scrambled data.xml
				path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode);
				101
				102
				# 2. Process the files listed in the rule file. Each entry names a
				#    file and, optionally, a list of CSS selectors with a marker
				#    that tells how matching elements are rewritten: according to
				#    the surface form ("="), derived from the surface form ("^"),
				#    or randomized ("~").
				#    If an entry has no rules, the file will just be copied.

				# Only touch the rule file if one was passed on the command line;
				# constructing a Mojo::File from an undefined path is an error.
				if (defined $rule_file) {
				  $rule_file = Mojo::File->new($rule_file);

				  if (-e $rule_file) {
				    my $rules = decode_json $rule_file->slurp;

				    # Every entry is [file name, optional rule list]
				    foreach my $rule (@$rules) {
				      scramble(@$rule);
				    }
				  }
				  else {
				    warn "$rule_file does not exist";
				  }
				}
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	119
				# Copy one file from the input folder to the output folder and,
				# if rules are given, rewrite its content on the way.
				#
				# $input - file name relative to the input folder
				# $rules - optional array reference of [selector, marker] pairs
				#
				# Warns and returns early if the input file does not exist.
				sub scramble {
				  my ($input, $rules) = @_;
				  my $data_file = path($orig_folder)->child($input);

				  if (!-f $data_file) {
				    warn "$data_file does not exist";
				    return;
				  }

				  my $data = $data_file->slurp;

				  # Without rules the content is written back untouched
				  if ($rules) {
				    my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);

				    # Header files are handled by transform_header(), which only
				    # takes the selector; all other files go through transform()
				    my $is_header = $input =~ /header\.xml$/;

				    for my $entry (@$rules) {
				      if ($is_header) {
				        transform_header($dom, $entry->[0]);
				      }
				      else {
				        transform($dom, $entry->[0], $entry->[1]);
				      }
				    }

				    $data = b($dom->to_string)->encode;
				  }

				  # Make sure the target directory exists before writing
				  my $file = Mojo::File->new($scr_folder)->child($input);
				  path($file->dirname)->make_path;
				  $file->spurt($data);
				}
				151
				152
				# Iterate over an annotation document and scramble the content of
				# all elements matching a CSS selector.
				#
				# $dom      - Mojo::DOM object of the annotation file
				# $selector - CSS selector, evaluated inside every "spanList > span"
				# $rule     - marker: "^" (derive), "~" (random); any other value
				#             takes the scrambled surface form
				#
				# The scrambled surface form is looked up in @offsets by the "from"
				# attribute of the span. If nothing is stored for that offset, the
				# element content is set to 'UNKN'.
				sub transform {
				  my ($dom, $selector, $rule) = @_;

				  # A missing marker is treated like any other non-special marker
				  $rule //= '';

				  $dom->find("spanList > span")->each(
				    sub {
				      my $from = $_->attr("from");

				      $_->find($selector)->each(
				        sub {

				          # No scrambled token starts at this offset
				          unless (defined $from && defined $offsets[$from]) {
				            $_->content('UNKN');
				            return;
				          }

				          # Work on a plain string copy of the stored token
				          my $surface = "$offsets[$from]";
				          my $new;

				          # The derive rule takes the scrambled surface form and
				          # replaces its last two characters by the string 'ui'
				          if ($rule eq '^') {
				            chop($surface);
				            chop($surface);
				            $new = $surface . 'ui';
				          }

				          # The random rule replaces the element text by a random
				          # word with the same characteristics
				          elsif ($rule eq '~') {
				            $new = get_rnd_word($_->text);
				          }

				          # Any other rule takes the scrambled surface form as is
				          else {
				            $new = $surface;
				          }

				          # content() parses its argument as an XML fragment, so
				          # "<", ">" and "&" in the new value have to be escaped
				          $_->content(b("$new")->xml_escape->to_string);
				        }
				      );
				    }
				  );
				}
				198
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	199
Akron	cbf098a	2020-04-27 17:56:42 +0200	[diff] [blame]	200	# Transform header file
				sub transform_header {
				  # $dom      - Mojo::DOM object of the header file
				  # $selector - CSS selector of the elements to scramble
				  #
				  # The text of every matching element is replaced by a random
				  # string with the same characteristics.
				  my ($dom, $selector) = @_;

				  $dom->find($selector)->each(
				    sub {
				      my $scrambled = get_rnd_word($_->text);

				      # content() parses its argument as an XML fragment, so
				      # "<", ">" and "&" in the new value have to be escaped
				      $_->content(b("$scrambled")->xml_escape->to_string);
				    }
				  );
				}
				214
				215
				216
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	217	__END__
				218
				219	=pod
				220
				221	=encoding utf8
				222
				223	=head1 NAME
				224
				225	scramble_korapxml.pl - Randomize the word strings of a KorAP-XML folder
				226
				227
				228	=head1 SYNOPSIS
				229
				230	scramble_korapxml.pl -i <input-directory> -o <output-directory>
				231
				232
				233	=head1 DESCRIPTION
				234
				235	This helper tool iterates over a single KorAP-XML folder
				236	and randomizes all word strings occurring following
				237	several rules. This is useful to create example files
				238	based on corpora that can't be published.
				239
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	240
				241	=head1 OPTIONS
				242
				243	=over 2
				244
				245	=item B<--input|-i> <directory>
				246
				247	The unscrambled KorAP-XML directory.
				248
				249
				250	=item B<--output|-o> <directory>
				251
				252	The output directory
				253
				254
				255	=item B<--rules|-r> <file>
				256
				257	The rule file for transformation as a json file.
				258	Example:
				259
				260	[
				261	[
				262	"dgd/annot.xml",
				263	[
				264	["f[name=trans]", "="],
				265	["f[name=lemma]", "^"],
				266	["f[name=pos]", "~"]
				267	]
				268	],
				269	["struct/structure.xml"]
				270	]
				271
				272	All elements of the json list are copied from the input directory to
				273	the output directory.
				274	The C<data.xml> file will be automatically copied and scrambled.
				275	If the file name is followed by a rule set, these
				276	CSS selector rules followed by a transformation type marker
				277	are used to transform elements of the file.
				278
Akron	cbf098a	2020-04-27 17:56:42 +0200	[diff] [blame]	279	All CSS selectors for annotation files
				280	are nested in C<< spanList > span >>.
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	281
				282	The following markers are supported:
				283
				284	=over 4
				285
				286	=item B<=>
				287
				288	Take the scrambled surface form from the C<data.xml>.
				289
				290	=item B<^>
				291
				292	Take the scrambled surface form from the C<data.xml> and
				293	modify the term by replacing its last two characters with the string C<ui>.
				294
				295	=item B<~>
				296
				297	Create a randomized string, keeping the characteristics of
				298	the original element content.
Akron	53bc81d	2020-04-27 16:24:35 +0200	[diff] [blame]	299	Two identical words in a single run will always be transferred
				300	to the same target word.
				301
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	302	=back
				303
Akron	cbf098a	2020-04-27 17:56:42 +0200	[diff] [blame]	304	For header files, the rules are not nested and only the
				305	randomized marker C<~> is supported.
				306
Akron	59a0e4b	2020-04-27 17:43:29 +0200	[diff] [blame]	307	=back