use strict;
use warnings;
use utf8;

# usage: perl extractbench.pl konventsin.tsv konventsout.tsv
#
# Reads a UTF-8 tab-separated file whose first four columns are
#   ID, NGRAMNORM, CLASS, DATA
# where DATA is a text snippet in which the words of the n-gram are marked
# up as <b>word</b>.  For every input row one output row is written with:
#   NGRAM1   - all tokens from the first <b> to the last </b> (tags removed)
#   CONTEXT1 - up to $contextsize words left and right of that span
#   NGRAM2   - only the tokens that were wrapped in <b>...</b>
#   CONTEXT2 - like CONTEXT1, plus the unmarked tokens inside the span
# NGRAM*/CONTEXT* values keep their historical leading space.

# Set to 1 to split DATA on '.' / ':' and scan each piece separately.
my $splitbysent = 0;

# Maximum number of context words kept on each side of the n-gram.
my $contextsize = 5;

# Splits $text on runs of non-word characters and returns the word tokens.
# split() yields an empty leading field when $text starts with a separator;
# such empty tokens are filtered out here, so callers never need to guess
# whether the first element is real.
sub word_tokens {
    my ($text) = @_;
    return grep { length } split( /\W+/, $text );
}

# Converts one (already chomped) input line into one output row.
# Returns the seven tab-joined output columns without a trailing newline.
# Columns missing from the input are treated as empty strings.
sub extract_row {
    my ($line) = @_;

    my @fields = split( /\t/, $line );
    my ( $id, $ngramnorm, $class, $data ) =
        map { defined $fields[$_] ? $fields[$_] : '' } 0 .. 3;

    my @sents = $splitbysent == 1 ? split( /\s*[.:]\s*/, $data ) : ($data);

    my $left   = "";
    my $right  = "";
    my $inner  = "";
    my $ngram1 = "";
    my $ngram2 = "";

    foreach my $sent (@sents) {
        # Greedy .+ on purpose: the span runs from the first <b> to the
        # last </b>, so unmarked words between marked ones are captured.
        next unless $sent =~ m{(.*?)(<b>.+</b>)(.*)};
        $left = $1;
        my $innerall = $2;
        $right = $3;

        foreach my $innertoken ( split( /\s+/, $innerall ) ) {
            if ( $innertoken =~ m{<b>(.+?)</b>} ) {
                $ngram1 .= " " . $1;
                $ngram2 .= " " . $1;
            }
            else {
                # Unmarked token inside the span: part of the long n-gram
                # and of the extended context, but not of the short n-gram.
                $ngram1 .= " " . $innertoken;
                $inner  .= " " . $innertoken;
            }
        }
    }

    # Keep the last $contextsize words on the left and the first
    # $contextsize words on the right.
    my @lefts  = word_tokens($left);
    my @rights = word_tokens($right);
    splice( @lefts, 0, @lefts - $contextsize ) if @lefts > $contextsize;
    splice( @rights, $contextsize ) if @rights > $contextsize;

    my $leftctx  = join( "", map { " " . $_ } @lefts );
    my $rightctx = join( "", map { " " . $_ } @rights );

    $ngram1 =~ s/[^\w\s]+/ /g;
    $ngram2 =~ s/[^\w\s]+/ /g;

    my $context1 = $leftctx . " " . $rightctx;
    my $context2 = $leftctx . " " . $inner . " " . $rightctx;
    $context1 =~ s/\s+/ /g;
    $context2 =~ s/\s+/ /g;

    return join( "\t",
        $id, $ngramnorm, $class, $ngram1, $context1, $ngram2, $context2 );
}

# Entry point: converts the file named by the first argument into the file
# named by the second.  Dies with a message on missing arguments or I/O
# errors.
sub main {
    my ( $input, $output ) = @_;

    die("usage: perl $0 konventsin.tsv konventsout.tsv\n")
        unless defined $input && defined $output;

    open( my $in, '<:encoding(UTF-8)', $input )
        or die("can't open $input: $!");
    open( my $out, '>:encoding(UTF-8)', $output )
        or die("can't open $output: $!");

    print {$out} "ID\tNGRAMNORM\tCLASS\tNGRAM1\tCONTEXT1\tNGRAM2\tCONTEXT2\n";

    while ( defined( my $line = <$in> ) ) {
        chomp($line);
        print {$out} extract_row($line) . "\n";
    }

    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out) or die("can't close $output: $!");
    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main(@ARGV) unless caller;