Perl Script for processing Konvents2021 data

Change-Id: I3a8e6d202872ff8caaf8fbd3de6575a312b1b4dc
diff --git a/perl/extractbench.pl b/perl/extractbench.pl
new file mode 100644
index 0000000..cf21012
--- /dev/null
+++ b/perl/extractbench.pl
@@ -0,0 +1,87 @@
+use strict;
+use warnings;
+use utf8;
+
+# usage: perl extractbench.pl konventsin.tsv konventsout.tsv
+#
+# Reads a tab-separated file whose first four columns are ID, NGRAMNORM,
+# CLASS and a text snippet in which the n-gram tokens are wrapped in
+# <b>...</b>, and writes one tab-separated line per input line:
+#   NGRAM1   - marked tokens plus the unmarked tokens between them
+#   NGRAM2   - marked tokens only
+#   CONTEXT1 - up to $contextsize words left and right of the n-gram
+#   CONTEXT2 - like CONTEXT1, with the unmarked inner tokens in between
+
+die("usage: perl extractbench.pl konventsin.tsv konventsout.tsv\n")
+  unless @ARGV >= 2;
+
+my $input  = $ARGV[0];
+my $output = $ARGV[1];
+
+open( my $in,  '<:encoding(UTF-8)', $input )  or die("can't open $input: $!");
+open( my $out, '>:encoding(UTF-8)', $output ) or die("can't open $output: $!");
+
+# Set to 1 to split the snippet at '.' and ':' before looking for the n-gram.
+my $splitbysent = 0;
+# Maximum number of context words kept on each side of the n-gram.
+my $contextsize = 5;
+
+print {$out} "ID\tNGRAMNORM\tCLASS\tNGRAM1\tCONTEXT1\tNGRAM2\tCONTEXT2\n";
+
+while ( defined( my $line = <$in> ) ) {
+	chomp($line);
+	# Lines with fewer than four columns yield undef fields; use "" instead.
+	my @fields = split( /\t/, $line );
+	my ( $id, $ngramnorm, $class, $data ) =
+	  map { defined($_) ? $_ : "" } @fields[ 0 .. 3 ];
+	my @sents = ($data);
+	if ( $splitbysent == 1 ) {
+		@sents = split( /\s*[.:]\s*/, $data );
+	}
+	my $left   = "";
+	my $right  = "";
+	my $inner  = "";
+	my $ngram1 = "";
+	my $ngram2 = "";
+	foreach my $sent (@sents) {
+		# Greedy .+ spans from the first <b> to the last </b> of the sentence.
+		if ( $sent =~ /(.*?)(<b>.+<\/b>)(.*)/ ) {
+			$left = $1;
+			my $innerall = $2;
+			$right = $3;
+			foreach my $innertoken ( split( /\s+/, $innerall ) ) {
+				if ( $innertoken =~ /<b>(.+?)<\/b>/ ) {
+					$ngram1 .= " " . $1;
+					$ngram2 .= " " . $1;
+				}
+				else {
+					$ngram1 .= " " . $innertoken;
+					$inner  .= " " . $innertoken;
+				}
+			}
+		}
+	}
+	# split() returns an empty first field when the string starts with a
+	# separator; drop empty fields rather than unconditionally shifting, which
+	# would lose a real word whenever the context starts with a word character.
+	my @lefts  = grep { length($_) } split( /\W+/, $left );
+	my @rights = grep { length($_) } split( /\W+/, $right );
+	# Keep the last $contextsize words on the left (including index 0) ...
+	my $first = @lefts > $contextsize ? @lefts - $contextsize : 0;
+	$left = join( "", map { " " . $_ } @lefts[ $first .. $#lefts ] );
+	# ... and the first $contextsize words on the right.
+	my $last = $#rights < $contextsize - 1 ? $#rights : $contextsize - 1;
+	$right = join( "", map { " " . $_ } @rights[ 0 .. $last ] );
+	$ngram1 =~ s/[^\w\s]+/ /g;
+	$ngram2 =~ s/[^\w\s]+/ /g;
+	my $context1 = $left . " " . $right;
+	my $context2 = $left . " " . $inner . " " . $right;
+	$context1 =~ s/\s+/ /g;
+	$context2 =~ s/\s+/ /g;
+	print {$out} join( "\t",
+		$id, $ngramnorm, $class, $ngram1, $context1, $ngram2, $context2 ),
+	  "\n";
+}
+
+close($in);
+close($out) or die("can't close $output: $!");