Add eos evaluation
Change-Id: Ia721ce1df8798fa2771059b4feb12eb56459325b
diff --git a/benchmarks/cleanup/eos.pl b/benchmarks/cleanup/eos.pl
new file mode 100644
index 0000000..02d0a43
--- /dev/null
+++ b/benchmarks/cleanup/eos.pl
@@ -0,0 +1,5 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# TODO: stub - this script is meant to rewrite a file; no logic is implemented yet
diff --git a/benchmarks/cleanup/jtok.pl b/benchmarks/cleanup/jtok.pl
new file mode 100644
index 0000000..664e919
--- /dev/null
+++ b/benchmarks/cleanup/jtok.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Collapse tokenizer output (presumably produced by JTok, going by the file
+# name) into one line per text unit: the quoted text of every
+# `Token: "..."` line is collected, and whenever a "Text Unit Start" marker
+# or the end of input is reached, the collected text is printed with all
+# whitespace removed.
+
+my $seen_unit = 0;     # true once the first "Text Unit Start" was read
+my $unit      = '';    # token text collected for the current text unit
+
+# Stream the input line by line instead of reading it into a list first.
+while (my $line = <>) {
+  if ($line =~ s/\s +Token: \"// && $line =~ s/^(\"?[^\"]*?)\".+?$/$1/g) {
+    $unit .= $line;
+  }
+  elsif ($line =~ m/Text Unit Start/) {
+    if ($seen_unit) {
+      # A new unit begins: emit the one collected so far.
+      $unit =~ s/[\s\n\t]+//g;
+      print $unit, "\n";
+      $unit = '';
+    }
+    else {
+      # First marker: nothing to emit yet. Token text seen before it stays
+      # in the buffer and becomes part of the first unit.
+      $seen_unit = 1;
+    }
+  }
+}
+
+# Emit the last text unit. No further marker follows it, so without this
+# step its tokens would be silently dropped.
+if ($seen_unit || length $unit) {
+  $unit =~ s/[\s\n\t]+//g;
+  print $unit, "\n";
+}
+
+# Trailing empty line, kept so existing consumers see the same file ending.
+print "\n";
diff --git a/benchmarks/cleanup/split_conllu.pl b/benchmarks/cleanup/split_conllu.pl
index 9dfd824..441be36 100644
--- a/benchmarks/cleanup/split_conllu.pl
+++ b/benchmarks/cleanup/split_conllu.pl
@@ -9,6 +9,7 @@
open(X, '<' . $file);
open(RAW, '>' . $file . '.raw');
open(SPLIT, '>' . $file . '.split');
+open(EOS, '>', $file . '.eos') or die "Unable to open ${file}.eos for writing: $!";
my $init;
@@ -21,6 +22,9 @@
print RAW ' ';
};
print RAW $1;
+ my $temp = $1;
+ $temp =~ s/[\s\n\t]+//g;
+ print EOS $temp, "\n";
}
elsif (m/^\d+[\s\t]/) {
if (/^\d+[\s\t]+([^\t\s]+)[\t\s]/) {
@@ -32,4 +36,5 @@
close(X);
close(RAW);
+close(EOS);
close(SPLIT);
diff --git a/benchmarks/cleanup/tokenize_eos.pl b/benchmarks/cleanup/tokenize_eos.pl
new file mode 100644
index 0000000..42f5ae7
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_eos.pl
@@ -0,0 +1,13 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Read all input, cut it at every "</eos>" marker and print each piece on
+# its own line with all whitespace removed.
+my $text = join('', <>);
+
+for my $segment (split("</eos>", $text)) {
+  $segment =~ s/[\s\n\t]+//g;
+  print $segment, "\n";
+}
+
diff --git a/benchmarks/cleanup/tokenize_nn.pl b/benchmarks/cleanup/tokenize_nn.pl
new file mode 100644
index 0000000..3124c6a
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_nn.pl
@@ -0,0 +1,15 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Read all input, drop leading newlines, cut the text at every pair of
+# consecutive newlines and print each piece on its own line with all
+# whitespace removed.
+my $text = join('', <>);
+$text =~ s/^\n+//s;
+
+for my $block (split(/\n\n/, $text)) {
+  $block =~ s/[\s\n\t]+//g;
+  print $block, "\n";
+}
+
diff --git a/benchmarks/cleanup/tokenize_simple.pl b/benchmarks/cleanup/tokenize_simple.pl
new file mode 100644
index 0000000..cad1749
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_simple.pl
@@ -0,0 +1,11 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Print every input line with all whitespace removed, one output line per
+# input line. Lines are streamed one at a time, so the whole input is never
+# held in memory at once.
+while (my $line = <>) {
+  $line =~ s/[\s\n\t]+//g;
+  print $line, "\n";
+}