Add eos evaluation

Change-Id: Ia721ce1df8798fa2771059b4feb12eb56459325b
diff --git a/benchmarks/cleanup/eos.pl b/benchmarks/cleanup/eos.pl
new file mode 100644
index 0000000..02d0a43
--- /dev/null
+++ b/benchmarks/cleanup/eos.pl
@@ -0,0 +1,5 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# This script rewrites a file (TODO: complete this description and add the implementation).
diff --git a/benchmarks/cleanup/jtok.pl b/benchmarks/cleanup/jtok.pl
new file mode 100644
index 0000000..664e919
--- /dev/null
+++ b/benchmarks/cleanup/jtok.pl
@@ -0,0 +1,33 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Prints one line per text unit: the images of all tokens seen since the
+# previous "Text Unit Start" line, concatenated with all whitespace removed.
+
+my $init = 1;
+my $c = '';
+
+# Stream the input line by line instead of loading it into a list first.
+while (<>) {
+  # Remove the 'Token: "' marker together with its preceding whitespace, then
+  # keep only the text in front of the closing quote of the token image.
+  # The line contributes to the unit only if both substitutions match.
+  if (s/\s +Token: \"// && s/^(\"?[^\"]*?)\".+?$/$1/g) {
+    $c .= $_;
+  }
+  elsif (m/Text Unit Start/) {
+    if ($init) {
+      # Nothing is printed for the first marker.
+      $init = 0;
+    } else {
+      $c =~ s/[\s\n\t]+//g;
+      print $c, "\n";
+      $c = '';
+    };
+  };
+};
+
+# Flush the final text unit. No "Text Unit Start" line follows it, so its
+# tokens would otherwise be dropped and an empty line printed instead.
+$c =~ s/[\s\n\t]+//g;
+print $c, "\n";
diff --git a/benchmarks/cleanup/split_conllu.pl b/benchmarks/cleanup/split_conllu.pl
index 9dfd824..441be36 100644
--- a/benchmarks/cleanup/split_conllu.pl
+++ b/benchmarks/cleanup/split_conllu.pl
@@ -9,6 +9,7 @@
 open(X, '<' . $file);
 open(RAW, '>' . $file . '.raw');
 open(SPLIT, '>' . $file . '.split');
+open(EOS, '>' . $file . '.eos');
 
 my $init;
 
@@ -21,6 +22,9 @@
       print RAW ' ';
     };
     print RAW $1;
+    my $temp = $1;
+    $temp =~ s/[\s\n\t]+//g;
+    print EOS $temp, "\n";
   }
   elsif (m/^\d+[\s\t]/) {
     if (/^\d+[\s\t]+([^\t\s]+)[\t\s]/) {
@@ -32,4 +36,5 @@
 
 close(X);
 close(RAW);
+close(EOS);
 close(SPLIT);
diff --git a/benchmarks/cleanup/tokenize_eos.pl b/benchmarks/cleanup/tokenize_eos.pl
new file mode 100644
index 0000000..42f5ae7
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_eos.pl
@@ -0,0 +1,15 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Reads all input, cuts it at every "</eos>" marker and prints each piece
+# on its own line with all whitespace removed.
+
+# Concatenate every input line into a single string.
+my $text = join('', <>);
+
+for my $segment (split("</eos>", $text)) {
+  $segment =~ s/[\s\n\t]+//g;
+  print $segment, "\n";
+}
+
diff --git a/benchmarks/cleanup/tokenize_nn.pl b/benchmarks/cleanup/tokenize_nn.pl
new file mode 100644
index 0000000..3124c6a
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_nn.pl
@@ -0,0 +1,16 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Reads all input, splits it at every pair of consecutive newlines and
+# prints each piece on its own line with all whitespace removed.
+
+my $text = join('', <>);
+
+# Drop newlines at the very start so they do not yield an empty first piece.
+$text =~ s/^\n+//s;
+
+for my $paragraph (split(/\n\n/, $text)) {
+  $paragraph =~ s/[\s\n\t]+//g;
+  print $paragraph, "\n";
+}
+
diff --git a/benchmarks/cleanup/tokenize_simple.pl b/benchmarks/cleanup/tokenize_simple.pl
new file mode 100644
index 0000000..cad1749
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_simple.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Prints every input line with all whitespace removed, one output line per
+# input line.
+
+# Read line by line so the whole input is never held in memory at once.
+while (my $line = <>) {
+  $line =~ s/[\s\n\t]+//g;
+  print $line, "\n";
+}