Add UD evaluation
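
Add a tokenization evaluation against the Universal Dependencies
German GSD treebank (benchmarks/ud_tokens.pl), including a helper
script that splits CoNLL-U files into raw text and a gold token list.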

Change-Id: I87b50f7b46c7f1d111e5e8ad3f925ca5280d74a2
diff --git a/.gitignore b/.gitignore
index e9e408f..0e95a24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 /corpus/empirist_*
+/corpus/de_*-ud-train*
 /sandbox
 /Sandbox
 /corpus/*.txt
diff --git a/Readme.md b/Readme.md
index 2ab1ef5..2fba873 100644
--- a/Readme.md
+++ b/Readme.md
@@ -22,7 +22,7 @@
 To run the benchmark, call
 
 ```shell
-$ docker run --rm -it \
+$ docker run --rm -i \
   -v ${PWD}/benchmarks:/euralex/benchmarks \
   -v ${PWD}/corpus:/euralex/corpus \
   korap/euralex22 benchmarks/[BENCHMARK-SCRIPT]
@@ -38,8 +38,8 @@
 
 ## `empirist.pl`
 
-To run the empirist test suite, you need to download the empirist
-gold standard corpus and tooling first and extract it into
+To run the empirist evaluation suite, you first need to download
+the empirist gold standard corpus and tooling, and extract it into
 the corpus directory.
 
 ```shell
@@ -50,17 +50,6 @@
 $ unzip empirist_gold_web.zip -d corpus
 ```
 
-To run the evaluation using the measurement tools provided by EmpiriST 2015,
-run
-
-```shell
-$ docker run --rm -it \
-  -v ${PWD}/benchmarks:/euralex/benchmarks \
-  -v ${PWD}/corpus:/euralex/corpus \
-  korap/euralex2 benchmarks/empirist.pl
-```
-
-
-Quality measurements based on EmpiriST 2015.
+The benchmark reports quality measurements based on EmpiriST 2015.
 
-To investigate the output, start the benchmark with mounted
+To investigate the output, start the benchmark with a mounted
@@ -71,6 +60,33 @@
 -v ${PWD}/output_web:/euralex/empirist_web
 ```
 
+## `ud_tokens.pl`
+
+To run the evaluation suite against the
+[Universal Dependencies](https://github.com/UniversalDependencies/UD_German-GSD)
+corpus, first install the empirist tooling as explained above,
+and download the corpus.
+
+```shell
+$ wget https://github.com/UniversalDependencies/UD_German-GSD/raw/master/de_gsd-ud-train.conllu \
+  -O corpus/de_gsd-ud-train.conllu
+```
+
+To run the evaluation, call
+
+```shell
+$ docker run --rm -i \
+  -v ${PWD}/benchmarks:/euralex/benchmarks \
+  -v ${PWD}/corpus:/euralex/corpus \
+  korap/euralex22 benchmarks/ud_tokens.pl
+```
+
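+To investigate the output, start the benchmark with a mounted
+output folder
+
+```shell
+-v ${PWD}/output_ud:/euralex/ud_tokens
+```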
 
 # Tools
 
diff --git a/benchmarks/cleanup/split_conllu.pl b/benchmarks/cleanup/split_conllu.pl
new file mode 100644
index 0000000..9dfd824
--- /dev/null
+++ b/benchmarks/cleanup/split_conllu.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Split a CoNLL-U file into a raw text file (.raw) and a gold
+# tokenization file with one token per line (.split).
+
+my $file = $ARGV[0] or die "Usage: $0 <conllu-file>\n";
+
+open(my $in, '<', $file) or die "Unable to open $file";
+open(my $raw, '>', $file . '.raw') or die "Unable to write $file.raw";
+open(my $split, '>', $file . '.split') or die "Unable to write $file.split";
+
+my $init;
+
+while (!eof($in)) {
+  local $_ = <$in>;
+
+  # The sentence comment line carries the untokenized text
+  if (/^# text = (.+?)$/) {
+    if ($init) {
+      print $split "\n";
+      print $raw ' ';
+    };
+    print $raw $1;
+  }
+
+  # Word lines start with a numeric index; the second column
+  # holds the token form. Multiword range lines ("1-2")
+  # don't match and are skipped.
+  elsif (/^\d+[\s\t]/) {
+    if (/^\d+[\s\t]+([^\t\s]+)[\t\s]/) {
+      print $split $1, "\n";
+      $init = 1;
+    }
+  };
+};
+
+close($in);
+close($raw);
+close($split);
diff --git a/benchmarks/empirist.pl b/benchmarks/empirist.pl
index f7a734c..24687ec 100644
--- a/benchmarks/empirist.pl
+++ b/benchmarks/empirist.pl
@@ -60,16 +60,16 @@
   }
 );
 
-# delete $tools{waste};
-# delete $tools{datok};
-# delete $tools{korap_tokenizer};
-# delete $tools{opennlp_simple};
-# delete $tools{opennlp_tokenizer};
-# delete $tools{tree_tagger};
-# delete $tools{jtok};
-# delete $tools{syntok};
-# delete $tools{somajo};
-# delete $tools{stanford};
+#delete $tools{waste};
+#delete $tools{datok};
+#delete $tools{korap_tokenizer};
+#delete $tools{opennlp_simple};
+#delete $tools{opennlp_tokenizer};
+#delete $tools{tree_tagger};
+#delete $tools{jtok};
+#delete $tools{syntok};
+#delete $tools{somajo};
+#delete $tools{stanford};
 
 # Create project folders
 foreach (keys %tools) {
diff --git a/benchmarks/ud_tokens.pl b/benchmarks/ud_tokens.pl
new file mode 100644
index 0000000..6ae8fba
--- /dev/null
+++ b/benchmarks/ud_tokens.pl
@@ -0,0 +1,88 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Path to the EmpiriST comparison tool
+my $cmd = '/euralex/corpus/empirist_gold_cmc/tools/compare_tokenization.perl';
+
+# Output path
+my $ud_path = '/euralex/ud_tokens';
+mkdir $ud_path;
+
+my $base = 'de_gsd-ud-train.conllu';
+
+# Split the corpus into raw text (.raw) and gold tokens (.split)
+chdir '/euralex/corpus/';
+system 'perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base;
+chdir '/euralex';
+
+my $gold = '/euralex/corpus/' . $base . '.split';
+my $raw = '/euralex/corpus/' . $base . '.raw';
+
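+# Tokenizer invocations: each tool reads the raw text and writes its tokens to its project folder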
+my %tools = (
+  waste => sub {
+    system 'cat ' . $raw . ' | waste -N -v0 --rcfile=./Waste/waste.rc > ' . $ud_path . '/waste/' . $base;
+  },
+  datok => sub {
+    system 'cat ' . $raw . ' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - > ' . $ud_path . '/datok/' . $base;
+  },
+  korap_tokenizer => sub {
+    system 'cat ' . $raw . ' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -l de > ' . $ud_path . '/korap_tokenizer/' . $base;
+  },
+  opennlp_simple => sub {
+    system 'cat ' . $raw . ' | ./opennlp/bin/opennlp SimpleTokenizer 2> /dev/null | sed "s/\s/\n/g" > ' . $ud_path . '/opennlp_simple/' . $base;
+  },
+  opennlp_tokenizer => sub {
+    system 'cat ' . $raw . ' | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin 2> /dev/null | sed "s/\s/\n/g" > ' . $ud_path . '/opennlp_tokenizer/' . $base;
+  },
+  tree_tagger => sub {
+    system 'cat ' . $raw . ' | perl ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations 2> /dev/null > ' . $ud_path . '/tree_tagger/' . $base;
+  },
+  jtok => sub {
+    chdir '/euralex/JTok/bin';
+    system 'sh tokenize ' . $raw . ' de | grep "Token: " | perl -CS -pe "s/\s +Token: \"//; s/^(\"?[^\"]*?)\".+?$/\1/g" > ' . $ud_path . '/jtok/' . $base;
+    chdir '/euralex';
+  },
+  syntok => sub {
+    system 'python3 -m syntok.tokenizer ' . $raw . ' | sed "s/\s/\n/g" > ' . $ud_path . '/syntok/' . $base;
+  },
+  somajo => sub {
+    system 'somajo-tokenizer ' . $raw . ' 2> /dev/null > ' . $ud_path . '/somajo/' . $base;
+  },
+  stanford => sub {
+    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
+      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ' . $raw . ' 2> /dev/null';
+    system 'perl /euralex/benchmarks/cleanup/stanford.pl ' . $base . '.raw.out > ' . $ud_path . '/stanford/' . $base;
+    system 'rm ' . $base . '.raw.out';
+  }
+);
+
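+# Uncomment to exclude individual tools from the run: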
+# delete $tools{waste};
+# delete $tools{datok};
+# delete $tools{korap_tokenizer};
+# delete $tools{opennlp_simple};
+# delete $tools{opennlp_tokenizer};
+# delete $tools{tree_tagger};
+# delete $tools{jtok};
+# delete $tools{syntok};
+# delete $tools{somajo};
+# delete $tools{stanford};
+
+# Create project folders
+foreach (keys %tools) {
+  mkdir $ud_path . '/' . $_;
+};
+
+# Run tokenization
+foreach (keys %tools) {
+  $tools{$_}->();
+};
+
+foreach my $tool (keys %tools) {
+  print "\n##########\n";
+  print "##### $tool - UD\n";
+  print "##\n";
+  system $cmd . ' -x ' . $gold . ' ' . $ud_path . '/' . $tool . '/' . $base . ' 2> /dev/null';
+};