blob: cf2101230e98f823e492aa90cc777feff9365618 [file] [log] [blame]
PeterFankhauserIDS54c18ee2021-05-15 17:41:49 +02001use strict;
2use utf8;
3
# usage: perl extractbench.pl konventsin.tsv konventsout.tsv
#
# Reads a tab-separated input file whose first four columns are
# ID, NGRAMNORM, CLASS and a text snippet in which the n-gram tokens are
# wrapped in <b>...</b>. For every input line one output line is written with
# the columns ID, NGRAMNORM, CLASS, NGRAM1, CONTEXT1, NGRAM2, CONTEXT2.

use warnings;

# 1: split the snippet at '.' / ':' and look for the markup in each part;
# anything else: treat the whole snippet as one unit.
my $splitbysent = 0;

# Maximum number of word tokens kept on each side of the marked-up span.
my $contextsize = 5;

# Turn one input line into the list of seven output fields.
#
# Parameters:
#   $line         raw input line (a trailing newline is removed)
#   $size         optional context size, defaults to $contextsize
#   $by_sentence  optional sentence-splitting flag, defaults to $splitbysent
#
# Returns the list (ID, NGRAMNORM, CLASS, NGRAM1, CONTEXT1, NGRAM2, CONTEXT2):
#   NGRAM1    all tokens between the first <b> and the last </b>
#   NGRAM2    only the tokens that are themselves wrapped in <b>...</b>
#   CONTEXT1  left context followed by right context
#   CONTEXT2  left context, the non-bold tokens inside the span, right context
# Every appended token is preceded by a single space, so non-empty values
# start with a space; this matches the established output format.
sub extract_row {
    my ( $line, $size, $by_sentence ) = @_;
    $size        = $contextsize unless defined $size;
    $by_sentence = $splitbysent unless defined $by_sentence;

    chomp($line);
    my @fields = split( /\t/, $line );

    # Short rows yield undef columns; treat them as empty strings.
    my ( $id, $ngramnorm, $class, $data ) =
      map { defined $_ ? $_ : "" } @fields[ 0 .. 3 ];

    my @sents =
      $by_sentence == 1 ? split( /\s*[.:]\s*/, $data ) : ($data);

    my $left   = "";
    my $right  = "";
    my $inner  = "";
    my $ngram1 = "";
    my $ngram2 = "";
    foreach my $sent (@sents) {

        # The greedy middle group spans from the first <b> to the last </b>.
        if ( $sent =~ /(.*?)(<b>.+<\/b>)(.*)/ ) {
            $left = $1;
            my $innerall = $2;
            $right = $3;
            foreach my $innertoken ( split( /\s+/, $innerall ) ) {
                if ( $innertoken =~ /<b>(.+?)<\/b>/ ) {
                    $ngram1 .= " " . $1;
                    $ngram2 .= " " . $1;
                }
                else {
                    $ngram1 .= " " . $innertoken;
                    $inner  .= " " . $innertoken;
                }
            }
        }
    }

    # split() returns a leading empty field when the text starts with a
    # non-word character (the usual case for the right context, which begins
    # directly after </b>). Dropping empty fields handles that case without
    # discarding a real token when the text starts with a word character.
    my @lefts  = grep { length } split( /\W+/, $left );
    my @rights = grep { length } split( /\W+/, $right );

    # Last $size tokens on the left (including index 0 when there are fewer
    # than $size tokens), first $size tokens on the right.
    my $first = @lefts > $size  ? @lefts - $size : 0;
    my $last  = $size < @rights ? $size - 1      : $#rights;
    $left  = join( "", map { " $_" } @lefts[ $first .. $#lefts ] );
    $right = join( "", map { " $_" } @rights[ 0 .. $last ] );

    # Replace punctuation inside the n-grams by blanks.
    $ngram1 =~ s/[^\w\s]+/ /g;
    $ngram2 =~ s/[^\w\s]+/ /g;

    my $context1 = $left . " " . $right;
    my $context2 = $left . " " . $inner . " " . $right;
    $context1 =~ s/\s+/ /g;
    $context2 =~ s/\s+/ /g;

    return ( $id, $ngramnorm, $class, $ngram1, $context1, $ngram2, $context2 );
}

# Convert the file $input into the file $output (both UTF-8 encoded).
# Dies when an argument is missing or a file cannot be opened or closed.
sub main {
    my ( $input, $output ) = @_;
    die "usage: perl extractbench.pl konventsin.tsv konventsout.tsv\n"
      unless defined $input && defined $output;

    open( my $in, '<:encoding(UTF-8)', $input )
      or die("can't open $input: $!");
    open( my $out, '>:encoding(UTF-8)', $output )
      or die("can't open $output: $!");

    print {$out} "ID\tNGRAMNORM\tCLASS\tNGRAM1\tCONTEXT1\tNGRAM2\tCONTEXT2\n";

    while ( defined( my $line = <$in> ) ) {
        print {$out}
          join( "\t", extract_row( $line, $contextsize, $splitbysent ) ),
          "\n";
    }

    close($in);

    # Buffered write errors only surface when the handle is closed.
    close($out) or die("can't close $output: $!");
    return;
}

# Run the conversion only when executed as a script, not when loaded.
main(@ARGV) unless caller;