# blob: cf2101230e98f823e492aa90cc777feff9365618 [file] [log] [blame]
use strict;
use warnings;
use utf8;
# usage: perl extractbench.pl konventsin.tsv konventsout.tsv
#
# Reads a UTF-8 tab-separated file whose rows are
#     ID <TAB> NGRAMNORM <TAB> CLASS <TAB> DATA
# where DATA is a text snippet in which tokens are marked with <b>...</b>.
# For every input row one output row is written with the columns
#     ID, NGRAMNORM, CLASS  - copied from the input
#     NGRAM1    - every token between the first <b> and the last </b>
#     CONTEXT1  - up to $contextsize words before and after that span
#     NGRAM2    - only the <b>...</b> tokens of the span
#     CONTEXT2  - CONTEXT1 plus the non-bold tokens found inside the span

# When set to 1, DATA is split on '.' / ':' and every piece is scanned
# separately: n-gram tokens accumulate over all pieces, while the left and
# right context come from the last piece that contains a <b> span.
my $splitbysent = 0;

# Maximum number of context words kept on each side of the span.
my $contextsize = 5;

die("usage: perl $0 input.tsv output.tsv\n") unless @ARGV >= 2;
my ( $input, $output ) = @ARGV;

open( my $in, '<:encoding(UTF-8)', $input )
    or die("can't open $input: $!");
open( my $out, '>:encoding(UTF-8)', $output )
    or die("can't open $output: $!");

print {$out} "ID\tNGRAMNORM\tCLASS\tNGRAM1\tCONTEXT1\tNGRAM2\tCONTEXT2\n";

while ( defined( my $line = <$in> ) ) {
    chomp($line);

    # Rows with fewer than four columns get empty strings for the missing
    # ones instead of undef.
    my @fields = split( /\t/, $line );
    my ( $id, $ngramnorm, $class, $data ) =
        map { defined $fields[$_] ? $fields[$_] : '' } 0 .. 3;

    my @sents = $splitbysent == 1 ? split( /\s*[.:]\s*/, $data ) : ($data);

    my $left   = "";
    my $right  = "";
    my $inner  = "";
    my $ngram1 = "";
    my $ngram2 = "";

    foreach my $sent (@sents) {

        # Lazy prefix + greedy middle: the span runs from the first <b> to
        # the last </b> of the piece.
        my ( $before, $span, $after ) = $sent =~ /(.*?)(<b>.+<\/b>)(.*)/
            or next;
        $left  = $before;
        $right = $after;

        # Only whitespace-delimited tokens that contain a complete
        # <b>...</b> pair count as bold; everything else in the span is
        # treated as an intervening (non-bold) token.
        foreach my $innertoken ( split( /\s+/, $span ) ) {
            if ( $innertoken =~ /<b>(.+?)<\/b>/ ) {
                my $bold = $1;
                $ngram1 .= " " . $bold;
                $ngram2 .= " " . $bold;
            }
            else {
                $ngram1 .= " " . $innertoken;
                $inner  .= " " . $innertoken;
            }
        }
    }

    # split() yields an empty leading field whenever the string starts with
    # a separator (e.g. the blank right after </b>). Filter empty fields
    # rather than blindly shifting one off, so a side that starts directly
    # with a word does not lose that word.
    my @lefts  = grep { length $_ } split( /\W+/, $left );
    my @rights = grep { length $_ } split( /\W+/, $right );

    # Last $contextsize words on the left. Index 0 is a valid word and must
    # be included when there are no more than $contextsize words.
    my $from = @lefts > $contextsize ? @lefts - $contextsize : 0;
    $left = join( '', map { " " . $_ } @lefts[ $from .. $#lefts ] );

    # First $contextsize words on the right.
    my $to = $#rights < $contextsize - 1 ? $#rights : $contextsize - 1;
    $right = join( '', map { " " . $_ } @rights[ 0 .. $to ] );

    # Replace punctuation and leftover markup characters by blanks.
    $ngram1 =~ s/[^\w\s]+/ /g;
    $ngram2 =~ s/[^\w\s]+/ /g;

    my $context1 = $left . " " . $right;
    my $context2 = $left . " " . $inner . " " . $right;
    $context1 =~ s/\s+/ /g;
    $context2 =~ s/\s+/ /g;

    print {$out} join( "\t",
        $id, $ngramnorm, $class, $ngram1, $context1, $ngram2, $context2 )
        . "\n";
}

close($in);

# Buffered write errors only surface when the handle is closed.
close($out) or die("can't close $output: $!");