script/strip_commercial_annotations - KorAP/KorAP-XML-Krill - Gitiles

 #!/usr/bin/env perl
 use Mojo::Base -strict;
 use Mojo::JSON 'j';
 use Mojo::Util qw/slurp spurt/;
 use Pod::Usage;

 ####
 # Remove xip and cnx (connexor) foundries from legacy and new index files.
 # This needs the installation of the Mojolicious package:
 # $ cpan install Mojolicious
 ####

 # File::Spec (core) is used below to split and re-join the input path.
 use File::Spec;

 # Matches the names of the foundries whose annotations are stripped.
 my $COMM_FOUNDRIES = qr!(?:xip|cnx|connexor)!;

 # Get file info from command line.
 # pod2usage() prints the usage message and exits on its own.
 my $file = $ARGV[0] or pod2usage(1);

 # Default output: the input's file name prefixed with "clean_", placed in
 # the same directory as the input. The prefix goes in front of the file
 # name rather than in front of the whole path, so "dir/a.json" becomes
 # "dir/clean_a.json" instead of "clean_dir/a.json".
 my $out_file = $ARGV[1];
 unless ($out_file) {
   my ($volume, $dir, $name) = File::Spec->splitpath($file);
   $out_file = File::Spec->catpath($volume, $dir, 'clean_' . $name);
 };

 # Load file and jsonify.
 # j() yields undef when decoding fails; stop here instead of letting a
 # later hash dereference turn that undef into an empty document.
 my $j = j(slurp $file);
 die "Unable to read a JSON object from '$file'\n" unless ref($j) eq 'HASH';

 # Read fields
 my ($tokens, $stream);

 # Clean tokens
 #
 # Takes one space-separated list as a string and returns it with every
 # entry matching $COMM_FOUNDRIES removed, re-joined with single spaces.
 # The ($) prototype lets callers write `_clean $value` without parentheses.
 sub _clean ($) {
   my ($list) = @_;
   $list = '' unless defined $list;

   # Split the argument, not the implicit $_: the callers invoke this sub
   # from inside `foreach (...)` loops, where $_ holds the current hash key
   # rather than the value to be cleaned.
   return join ' ', grep { $_ !~ $COMM_FOUNDRIES } split / /, $list;
 };

 # Locate the token field and strip its annotation info.
 # Two layouts are handled:
 #   legacy: the second entry of the top-level "fields" array, with the
 #           token stream stored under "data"
 #   new:    the top-level "data" object, with the token stream stored
 #           under "stream"
 # $stream_key remembers where the cleaned stream has to be written back.
 my $stream_key;

 # Legacy index file
 # The ref check keeps Perl from autovivifying an empty "fields" array in
 # documents that have none (it would otherwise show up in the output).
 if (ref($j->{fields}) eq 'ARRAY' && ($tokens = $j->{fields}->[1])) {

   # Strip annotation info
   foreach (qw/layerInfo foundries/) {
     $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
   };

   $stream_key = 'data';
 }

 # New index file
 elsif ($tokens = $j->{data}) {

   # Strip annotation info
   foreach (qw/layerInfos foundries/) {
     $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
   };

   $stream_key = 'stream';
 }

 # Neither layout: fail instead of writing an unmodified copy
 else {
   die "No token data found in '$file'\n";
 };

 # Read data
 $stream = $tokens->{$stream_key};

 # Clean data from xip and cnx
 # A term is dropped when a commercial foundry name starts the term,
 # optionally preceded by one of the markers "<>:", "<:", ">:", "@:" or
 # "-:". The pattern is compiled once, outside of the loops.
 if (ref($stream) eq 'ARRAY') {
   my $comm_term = qr/^(?:(?:<>|<|>|@|-):)?$COMM_FOUNDRIES/;

   my @clean_data;
   foreach my $token (@$stream) {
     push @clean_data, [ grep { $_ !~ $comm_term } @$token ];
   };

   # Put the cleaned stream back under the key it was read from
   $tokens->{$stream_key} = \@clean_data;
 };

 # Write file
 spurt j($j), $out_file;

 __END__

 =pod

 =head1 NAME

 strip_commercial_annotations

 =head1 SYNOPSIS

 perl strip_commercial_annotations my_file.json [my_clean_file.json]

 =cut
	#!/usr/bin/env perl
	use Mojo::Base -strict;
	use Mojo::JSON 'j';
	use Mojo::Util qw/slurp spurt/;
	use Pod::Usage;

	####
	# Remove xip and cnx (connexor) foundries from legacy and new index files.
	# This needs the installation of the Mojolicious package:
	# $ cpan install Mojolicious
	####

	# File::Spec (core) is used below to split and re-join the input path.
	use File::Spec;

	# Matches the names of the foundries whose annotations are stripped.
	# The alternation bars must be unescaped; an escaped bar would only
	# match a literal "|" character.
	my $COMM_FOUNDRIES = qr!(?:xip|cnx|connexor)!;

	# Get file info from command line.
	# pod2usage() prints the usage message and exits on its own.
	my $file = $ARGV[0] or pod2usage(1);

	# Default output: the input's file name prefixed with "clean_", placed in
	# the same directory as the input. The prefix goes in front of the file
	# name rather than in front of the whole path, so "dir/a.json" becomes
	# "dir/clean_a.json" instead of "clean_dir/a.json".
	my $out_file = $ARGV[1];
	unless ($out_file) {
	  my ($volume, $dir, $name) = File::Spec->splitpath($file);
	  $out_file = File::Spec->catpath($volume, $dir, 'clean_' . $name);
	};

	# Load file and jsonify.
	# j() yields undef when decoding fails; stop here instead of letting a
	# later hash dereference turn that undef into an empty document.
	my $j = j(slurp $file);
	die "Unable to read a JSON object from '$file'\n" unless ref($j) eq 'HASH';

	# Read fields
	my ($tokens, $stream);

	# Clean tokens
	#
	# Takes one space-separated list as a string and returns it with every
	# entry matching $COMM_FOUNDRIES removed, re-joined with single spaces.
	# The ($) prototype lets callers write `_clean $value` without parentheses.
	sub _clean ($) {
	  my ($list) = @_;
	  $list = '' unless defined $list;

	  # Split the argument, not the implicit $_: the callers invoke this sub
	  # from inside `foreach (...)` loops, where $_ holds the current hash key
	  # rather than the value to be cleaned.
	  return join ' ', grep { $_ !~ $COMM_FOUNDRIES } split / /, $list;
	};

	# Locate the token field and strip its annotation info.
	# Two layouts are handled:
	#   legacy: the second entry of the top-level "fields" array, with the
	#           token stream stored under "data"
	#   new:    the top-level "data" object, with the token stream stored
	#           under "stream"
	# $stream_key remembers where the cleaned stream has to be written back.
	my $stream_key;

	# Legacy index file
	# The ref check keeps Perl from autovivifying an empty "fields" array in
	# documents that have none (it would otherwise show up in the output).
	if (ref($j->{fields}) eq 'ARRAY' && ($tokens = $j->{fields}->[1])) {

	  # Strip annotation info
	  foreach (qw/layerInfo foundries/) {
	    $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
	  };

	  $stream_key = 'data';
	}

	# New index file
	elsif ($tokens = $j->{data}) {

	  # Strip annotation info
	  foreach (qw/layerInfos foundries/) {
	    $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
	  };

	  $stream_key = 'stream';
	}

	# Neither layout: fail instead of writing an unmodified copy
	else {
	  die "No token data found in '$file'\n";
	};

	# Read data
	$stream = $tokens->{$stream_key};

	# Clean data from xip and cnx
	# A term is dropped when a commercial foundry name starts the term,
	# optionally preceded by one of the markers "<>:", "<:", ">:", "@:" or
	# "-:". The alternation bars are unescaped so that each marker is a
	# separate alternative. The pattern is compiled once, outside the loops.
	if (ref($stream) eq 'ARRAY') {
	  my $comm_term = qr/^(?:(?:<>|<|>|@|-):)?$COMM_FOUNDRIES/;

	  my @clean_data;
	  foreach my $token (@$stream) {
	    push @clean_data, [ grep { $_ !~ $comm_term } @$token ];
	  };

	  # Put the cleaned stream back under the key it was read from
	  $tokens->{$stream_key} = \@clean_data;
	};

	# Write file
	spurt j($j), $out_file;

	__END__

	=pod

	=head1 NAME

	strip_commercial_annotations

	=head1 SYNOPSIS

	perl strip_commercial_annotations my_file.json [my_clean_file.json]

	=cut