Perl Script for processing Konvents2021 data
Change-Id: I3a8e6d202872ff8caaf8fbd3de6575a312b1b4dc
diff --git a/perl/extractbench.pl b/perl/extractbench.pl
new file mode 100644
index 0000000..cf21012
--- /dev/null
+++ b/perl/extractbench.pl
@@ -0,0 +1,79 @@
+use strict;
+use warnings;
+use utf8;
+
+# Extracts the bold-marked n-gram and its word context from Konvents TSV data.
+#
+# usage: perl extractbench.pl konventsin.tsv konventsout.tsv
+#
+# Each input line is read as ID <tab> NGRAMNORM <tab> CLASS <tab> DATA, where
+# DATA marks tokens with <b>...</b>. Output columns:
+#   NGRAM1   - every token from the first <b> to the last </b>
+#   CONTEXT1 - up to $contextsize words before and after that span
+#   NGRAM2   - only the tokens that were wrapped in <b>...</b>
+#   CONTEXT2 - CONTEXT1 plus the unmarked tokens from inside the span
+
+@ARGV >= 2 or die("usage: perl extractbench.pl konventsin.tsv konventsout.tsv\n");
+my ( $input, $output ) = @ARGV;
+
+# Lexical handles; $! reports why an open failed.
+open( my $in, '<:encoding(UTF-8)', $input ) or die("can't open $input: $!");
+open( my $out, '>:encoding(UTF-8)', $output ) or die("can't open $output: $!");
+
+my $splitbysent = 0;    # set to 1 to search each [.:]-separated piece of DATA
+my $contextsize = 5;    # number of context words kept on each side
+
+print $out "ID\tNGRAMNORM\tCLASS\tNGRAM1\tCONTEXT1\tNGRAM2\tCONTEXT2\n";
+
+while ( defined( my $line = <$in> ) ) {
+    chomp($line);
+
+    # Pad short lines so the four expected columns are always defined.
+    my @fields = split( /\t/, $line );
+    push( @fields, "" ) while @fields < 4;
+    my ( $id, $ngramnorm, $class, $data ) = @fields;
+
+    my @sents = $splitbysent == 1 ? split( /\s*[.:]\s*/, $data ) : ($data);
+
+    my ( $left, $right, $inner, $ngram1, $ngram2 ) = ( "", "", "", "", "" );
+    foreach my $sent (@sents) {
+        # Non-greedy prefix, greedy middle: the span runs from the first <b>
+        # to the last </b>; text outside it is the left/right context.
+        next unless $sent =~ /(.*?)(<b>.+<\/b>)(.*)/;
+        ( $left, my $innerall, $right ) = ( $1, $2, $3 );
+
+        foreach my $innertoken ( split( /\s+/, $innerall ) ) {
+            if ( $innertoken =~ /<b>(.+?)<\/b>/ ) {
+                $ngram1 .= " " . $1;
+                $ngram2 .= " " . $1;
+            }
+            else {
+                $ngram1 .= " " . $innertoken;
+                $inner  .= " " . $innertoken;
+            }
+        }
+    }
+
+    # Context words are the maximal \w runs. Matching them directly (rather
+    # than splitting on \W+) never yields an empty leading field, so the
+    # first real word on either side is always kept.
+    my @lefts  = $left  =~ /\w+/g;
+    my @rights = $right =~ /\w+/g;
+    splice( @lefts, 0, @lefts - $contextsize ) if @lefts > $contextsize;
+    splice( @rights, $contextsize ) if @rights > $contextsize;
+    $left  = join( "", map { " " . $_ } @lefts );
+    $right = join( "", map { " " . $_ } @rights );
+
+    # Punctuation inside the n-grams becomes a space; contexts are collapsed
+    # to single spaces.
+    $ngram1 =~ s/[^\w\s]+/ /g;
+    $ngram2 =~ s/[^\w\s]+/ /g;
+    my $context1 = $left . " " . $right;
+    my $context2 = $left . " " . $inner . " " . $right;
+    $context1 =~ s/\s+/ /g;
+    $context2 =~ s/\s+/ /g;
+    print $out join( "\t", $id, $ngramnorm, $class, $ngram1, $context1, $ngram2, $context2 ), "\n";
+}
+
+close($in);
+close($out) or die("can't close $output: $!");