PeterFankhauserIDS | 54c18ee | 2021-05-15 17:41:49 +0200 | [diff] [blame] | 1 | use strict; |
| 2 | use utf8; |
| 3 | |
Marc Kupietz | 3080f31 | 2023-08-30 15:58:07 +0200 | [diff] [blame^] | 4 | # usage: perl extractbench.pl konvensin.tsv konvensout.tsv |
PeterFankhauserIDS | 54c18ee | 2021-05-15 17:41:49 +0200 | [diff] [blame] | 5 | |
| 6 | my $input = $ARGV[0]; |
| 7 | my $output = $ARGV[1]; |
| 8 | |
| 9 | open( IN, '<:encoding(UTF-8)', $input ) or die("can't open $input"); |
| 10 | open( OUT, '>:encoding(UTF-8)', $output ) or die("can't open $output"); |
| 11 | |
| 12 | my $splitbysent = 0; |
| 13 | my $contextsize = 5; |
| 14 | |
| 15 | print OUT "ID\tNGRAMNORM\tCLASS\tNGRAM1\tCONTEXT1\tNGRAM2\tCONTEXT2\n"; |
| 16 | |
| 17 | while ( defined( my $line = <IN> ) ) { |
| 18 | chomp($line); |
| 19 | my @fields = split(/\t/,$line); |
| 20 | my $id = $fields[0]; |
| 21 | my $ngramnorm = $fields[1]; |
| 22 | my $class = $fields[2]; |
| 23 | my $data = $fields[3]; |
| 24 | my @sents; |
| 25 | $sents[0] = $data; |
| 26 | if ($splitbysent==1) { |
| 27 | @sents = split(/\s*[.:]\s*/,$data); |
| 28 | } |
| 29 | my $left = ""; |
| 30 | my $right = ""; |
| 31 | my $inner = ""; |
| 32 | my $ngram1 = ""; |
| 33 | my $ngram2 = ""; |
| 34 | foreach my $sent (@sents) { |
| 35 | if ($sent=~/(.*?)(<b>.+<\/b>)(.*)/) { |
| 36 | $left = $1; |
| 37 | my $innerall = $2; |
| 38 | $right = $3; |
| 39 | my @innertokens = split(/\s+/,$innerall); |
| 40 | foreach my $innertoken (@innertokens) { |
| 41 | if ($innertoken=~/<b>(.+?)<\/b>/) { |
| 42 | $ngram1 .= " " . $1; |
| 43 | $ngram2 .= " " . $1; |
| 44 | } |
| 45 | else { |
| 46 | $ngram1 .= " " . $innertoken; |
| 47 | $inner .= " " . $innertoken; |
| 48 | } |
| 49 | } |
| 50 | } |
| 51 | } |
| 52 | my @lefts = split(/[\W]+/,$left); |
| 53 | my @rights = split(/[\W]+/,$right); |
| 54 | # for some reason we have a spurious first element in @rights. |
| 55 | shift(@rights); |
| 56 | $left = ""; |
| 57 | for (my $i=$contextsize; $i >0; $i--) { |
| 58 | my $j = scalar(@lefts)-$i; |
| 59 | if ($j>0) { |
| 60 | $left .= " " . $lefts[$j]; |
| 61 | } |
| 62 | } |
| 63 | $right = ""; |
| 64 | for (my $i=0; $i<$contextsize;$i++) { |
| 65 | if ($i < scalar(@rights)) { |
| 66 | $right .= " " . $rights[$i]; |
| 67 | } |
| 68 | } |
| 69 | $ngram1 =~ s/[^\w\s]+/ /g; |
| 70 | $ngram2 =~s/[^\w\s]+/ /g; |
| 71 | my $context1 = $left . " " . $right; |
| 72 | my $context2 = $left . " " . $inner . " " . $right; |
| 73 | $context1=~s/\s+/ /g; |
| 74 | $context2=~s/\s+/ /g; |
| 75 | print OUT $id . "\t" . $ngramnorm . "\t" . $class . "\t" . $ngram1 . "\t" . $context1 . "\t" . $ngram2 . "\t" . $context2 . "\n"; |
| 76 | } |
| 77 | |
| 78 | close(IN); |
| 79 | close(OUT); |