use strict;
use warnings;
use utf8;

# usage: perl extractbench.pl konvensin.tsv konvensout.tsv
#
# Reads a tab-separated file with the columns ID, NGRAMNORM, CLASS and DATA,
# where DATA is text in which tokens are marked up as <b>...</b>. Every input
# line produces one output line with the columns
#
#   ID, NGRAMNORM, CLASS   copied from the input
#   NGRAM1     all tokens from the first <b> to the last </b>, tags stripped
#   CONTEXT1   up to $contextsize words before and after that span
#   NGRAM2     only the <b>-marked tokens of the span
#   CONTEXT2   CONTEXT1 plus the unmarked tokens found inside the span
#
# The file only runs main() when executed as a script, so the helpers can be
# loaded with require and exercised on their own.

my $splitbysent = 0;    # 1: split DATA on '.' / ':' and scan every piece
my $contextsize = 5;    # number of context words kept on each side

# Splits $text on runs of non-word characters and returns the words.
# split() yields an empty leading field whenever the text starts with a
# separator (e.g. the blank right after "</b>"); those empty fields are
# filtered out here instead of blindly dropping the first element.
sub context_words {
    my ($text) = @_;
    return grep { length $_ } split( /\W+/, $text );
}

# Turns one chomped input line into the list of the seven output fields.
# Columns missing from a short line are treated as empty strings.
sub extract_row {
    my ($line) = @_;

    my @fields = split( /\t/, $line );
    my ( $id, $ngramnorm, $class, $data ) =
      map { defined $fields[$_] ? $fields[$_] : '' } 0 .. 3;

    my @sents = ($data);
    if ( $splitbysent == 1 ) {
        @sents = split( /\s*[.:]\s*/, $data );
    }

    my $left   = "";
    my $right  = "";
    my $inner  = "";
    my $ngram1 = "";
    my $ngram2 = "";
    foreach my $sent (@sents) {
        # Lazy prefix + greedy middle: the span runs from the first <b> to
        # the last </b> of the sentence.
        next unless $sent =~ /(.*?)(<b>.+<\/b>)(.*)/;
        my $innerall;
        ( $left, $innerall, $right ) = ( $1, $2, $3 );

        # NOTE(review): tokens are whitespace-separated, so a single <b>
        # element that itself contains whitespace is not recognised as
        # marked and ends up in the "unmarked" branch below.
        foreach my $innertoken ( split( /\s+/, $innerall ) ) {
            if ( $innertoken =~ /<b>(.+?)<\/b>/ ) {
                $ngram1 .= " " . $1;
                $ngram2 .= " " . $1;
            }
            else {
                $ngram1 .= " " . $innertoken;
                $inner  .= " " . $innertoken;
            }
        }
    }

    my @lefts  = context_words($left);
    my @rights = context_words($right);

    # Last $contextsize words on the left, including index 0 when fewer
    # words than that are available.
    my $first_left = @lefts > $contextsize ? @lefts - $contextsize : 0;
    my $leftctx = join( ' ', '', @lefts[ $first_left .. $#lefts ] );

    # First $contextsize words on the right.
    my $last_right = @rights > $contextsize ? $contextsize - 1 : $#rights;
    my $rightctx = join( ' ', '', @rights[ 0 .. $last_right ] );

    # Punctuation inside the n-grams becomes a blank.
    $ngram1 =~ s/[^\w\s]+/ /g;
    $ngram2 =~ s/[^\w\s]+/ /g;

    my $context1 = $leftctx . " " . $rightctx;
    my $context2 = $leftctx . " " . $inner . " " . $rightctx;
    $context1 =~ s/\s+/ /g;
    $context2 =~ s/\s+/ /g;

    return ( $id, $ngramnorm, $class, $ngram1, $context1, $ngram2, $context2 );
}

# Script entry point: converts the file named by the first argument into the
# file named by the second one. Dies on missing arguments or I/O failures.
sub main {
    my ( $input, $output ) = @_;

    # Report missing arguments up front rather than handing undef to open().
    die "usage: perl extractbench.pl konvensin.tsv konvensout.tsv\n"
      unless defined $input && defined $output;

    open( my $in, '<:encoding(UTF-8)', $input )
      or die("can't open $input: $!");
    open( my $out, '>:encoding(UTF-8)', $output )
      or die("can't open $output: $!");

    print {$out} "ID\tNGRAMNORM\tCLASS\tNGRAM1\tCONTEXT1\tNGRAM2\tCONTEXT2\n";

    while ( defined( my $line = <$in> ) ) {
        chomp($line);
        print {$out} join( "\t", extract_row($line) ), "\n";
    }

    close($in);

    # Buffered write errors only surface when the handle is closed.
    close($out) or die("can't write $output: $!");
    return;
}

main(@ARGV) unless caller();

1;