Add eos evaluation Change-Id: Ia721ce1df8798fa2771059b4feb12eb56459325b

commit: 049e52606bcd1fb192789d49110607042a55e2f8 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Mar 18 09:59:34 2022 +0100
committer: Akron <nils@diewald-online.de> Fri Mar 18 09:59:34 2022 +0100
tree: 797fee47ae18690d1734547d83c4bff3d2ead858
parent: 54fd31434d861d66feec491b9b1eff0e661b1225 [diff]
diff --git a/Readme.md b/Readme.md
index 00dd987..8f75046 100644
--- a/Readme.md
+++ b/Readme.md

@@ -63,9 +63,9 @@
 -v ${PWD}/output_web:/euralex/empirist_web
 ```
 
-## `ud-tokens.pl`
+## `ud_tokens.pl`
 
-To run the evaluation suite against the 
+To run the token evaluation suite against the 
 [Universal Dependency](https://github.com/UniversalDependencies/UD_German-GSD)
 corpus, first install the empirist tooling as explained above,
 and download the corpus.
@@ -75,13 +75,10 @@
   -O corpus/de_gsd-ud-train.conllu
 ```
 
+## `ud_sentences.pl`
 
-```shell
-$ docker run --rm -it \
-  -v ${PWD}/benchmarks:/euralex/benchmarks \
-  -v ${PWD}/corpus:/euralex/corpus \
-  korap/euralex2 benchmarks/empirist.pl
-```
+To run the sentence evaluation suite, first download the corpus
+as explained above.
 
 
 # Tools

diff --git a/benchmarks/cleanup/eos.pl b/benchmarks/cleanup/eos.pl
new file mode 100644
index 0000000..02d0a43
--- /dev/null
+++ b/benchmarks/cleanup/eos.pl

@@ -0,0 +1,5 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# TODO: Unfinished stub - meant to rewrite a file, but the description ends here and no code follows yet.

diff --git a/benchmarks/cleanup/jtok.pl b/benchmarks/cleanup/jtok.pl
new file mode 100644
index 0000000..664e919
--- /dev/null
+++ b/benchmarks/cleanup/jtok.pl

@@ -0,0 +1,46 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Convert the output of JTok's "tokenize" command (read from STDIN or
+# from the files given as arguments) into one sentence per line, with
+# all whitespace removed from each sentence.
+#
+# Every line containing a 'Token: "..."' entry contributes its token
+# text to the current sentence. A line containing "Text Unit Start"
+# closes the sentence collected so far and starts a new one.
+
+my $first_unit = 1;
+my $sentence   = '';
+
+# Print the collected sentence without any whitespace.
+sub emit_sentence {
+  my $text = shift;
+  $text =~ s/\s+//g;
+  print $text, "\n";
+}
+
+while (my $line = <>) {
+
+  # Reduce a token line to the token text (up to the closing quote)
+  if ($line =~ s/\s +Token: \"// && $line =~ s/^(\"?[^\"]*?)\".+?$/$1/g) {
+    $sentence .= $line;
+  }
+
+  elsif ($line =~ m/Text Unit Start/) {
+
+    # The first marker only opens the first unit - nothing to print yet
+    if ($first_unit) {
+      $first_unit = 0;
+    }
+    else {
+      emit_sentence($sentence);
+      $sentence = '';
+    }
+  }
+}
+
+# There is no closing marker after the last text unit, so the final
+# sentence has to be flushed here - otherwise it would be lost.
+# For empty input this still prints a single empty line.
+emit_sentence($sentence);

diff --git a/benchmarks/cleanup/split_conllu.pl b/benchmarks/cleanup/split_conllu.pl
index 9dfd824..441be36 100644
--- a/benchmarks/cleanup/split_conllu.pl
+++ b/benchmarks/cleanup/split_conllu.pl

@@ -9,6 +9,7 @@
 open(X, '<' . $file);
 open(RAW, '>' . $file . '.raw');
 open(SPLIT, '>' . $file . '.split');
+open(EOS, '>' . $file . '.eos');
 
 my $init;
 
@@ -21,6 +22,9 @@
       print RAW ' ';
     };
     print RAW $1;
+    my $temp = $1;
+    $temp =~ s/[\s\n\t]+//g;
+    print EOS $temp, "\n";
   }
   elsif (m/^\d+[\s\t]/) {
     if (/^\d+[\s\t]+([^\t\s]+)[\t\s]/) {
@@ -32,4 +36,5 @@
 
 close(X);
 close(RAW);
+close(EOS);
 close(SPLIT);

diff --git a/benchmarks/cleanup/tokenize_eos.pl b/benchmarks/cleanup/tokenize_eos.pl
new file mode 100644
index 0000000..42f5ae7
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_eos.pl

@@ -0,0 +1,15 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Read the whole input (STDIN or the files given as arguments), cut it
+# at every "</eos>" marker and print each piece on its own line with
+# all whitespace removed.
+
+my $text = join '', <>;
+
+for my $sentence (split("</eos>", $text)) {
+  $sentence =~ s/[\s\n\t]+//g;
+  print $sentence, "\n";
+}
+

diff --git a/benchmarks/cleanup/tokenize_nn.pl b/benchmarks/cleanup/tokenize_nn.pl
new file mode 100644
index 0000000..3124c6a
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_nn.pl

@@ -0,0 +1,16 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Read the whole input (STDIN or the files given as arguments), drop
+# leading newlines, cut the text at every double newline and print
+# each piece on its own line with all whitespace removed.
+
+my $text = join '', <>;
+$text =~ s/^\n+//s;
+
+for my $sentence (split(/\n\n/, $text)) {
+  $sentence =~ s/[\s\n\t]+//g;
+  print $sentence, "\n";
+}
+

diff --git a/benchmarks/cleanup/tokenize_simple.pl b/benchmarks/cleanup/tokenize_simple.pl
new file mode 100644
index 0000000..cad1749
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_simple.pl

@@ -0,0 +1,11 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Print every input line (STDIN or the files given as arguments) with
+# all whitespace removed, followed by a single newline.
+
+while (my $line = <>) {
+  $line =~ s/[\s\n\t]+//g;
+  print $line, "\n";
+}

diff --git a/benchmarks/ud_sentences.pl b/benchmarks/ud_sentences.pl
new file mode 100644
index 0000000..c5f80c6
--- /dev/null
+++ b/benchmarks/ud_sentences.pl

@@ -0,0 +1,160 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Sentence segmentation benchmark on a Universal Dependencies corpus.
+#
+# Steps:
+#   1. Split the CoNLL-U corpus with cleanup/split_conllu.pl to get
+#      the raw text (.raw) and the gold sentences (.eos).
+#   2. Run every tool registered in %tools on the raw text and store
+#      its output in $ud_path/<tool>/.
+#   3. Compare each output with the gold sentences using $cmd.
+#
+# Paths are hard-coded for the directory layout below /euralex.
+
+# Comparison path
+my $cmd = '/euralex/corpus/empirist_gold_cmc/tools/compare_tokenization.perl';
+# my $cmd = '/euralex/corpus/deep-eos/eval.py';
+
+# Cleanup scripts normalizing the tool output for the comparison.
+# The prefix ends with a slash, so script names are appended directly.
+my $cleanup = 'perl /euralex/benchmarks/cleanup/';
+my $tokenize_eos = $cleanup . 'tokenize_eos.pl';
+my $tokenize_nn = $cleanup . 'tokenize_nn.pl';
+
+# Output path
+my $ud_path = '/euralex/ud_eos';
+
+my $base = 'de_gsd-ud-train.conllu';
+
+# Run a command line through the shell. A non-zero status is reported
+# on STDERR instead of being ignored silently, so a failing tool does
+# not go unnoticed. Returns the status of system().
+sub run_shell {
+  my $command = shift;
+  my $status = system $command;
+  warn "Command failed (status $status): $command\n" if $status != 0;
+  return $status;
+}
+
+# Create a directory unless it already exists (e.g. from an earlier run).
+sub ensure_dir {
+  my $dir = shift;
+  return if -d $dir;
+  mkdir $dir or warn "Unable to create $dir: $!\n";
+  return;
+}
+
+ensure_dir($ud_path);
+
+# Split files
+chdir '/euralex/corpus/';
+run_shell('perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base);
+chdir '/euralex';
+
+my $gold = '/euralex/corpus/' . $base . '.eos';
+my $raw = '/euralex/corpus/' . $base . '.raw';
+
+# Tool name => code running the tool on $raw. The name is also the name
+# of the output folder below $ud_path.
+my %tools = (
+  waste => sub {
+    run_shell('cat ' . $raw . ' | waste -N -v0 --rcfile=./Waste/waste.rc | ' . $tokenize_nn . ' > ' . $ud_path . '/waste/' . $base);
+  },
+  datok => sub {
+    run_shell('cat ' . $raw . ' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - | ' . $tokenize_nn . ' > ' . $ud_path . '/datok/' . $base);
+  },
+  cutter => sub {
+    run_shell('python3 ./cutter/cutter.py sent ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/cutter/' . $base);
+  },
+  korap_tokenizer => sub {
+    run_shell('cat ' . $raw . ' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -s -l de | ' . $tokenize_nn . ' > ' . $ud_path . '/korap_tokenizer/' . $base);
+  },
+  'opennlp_sentence' => sub {
+    # NOTE(review): unlike all other tools, this output is not passed
+    # through a cleanup script and therefore keeps its whitespace -
+    # confirm that the comparison copes with that.
+    run_shell('cat ' . $raw . ' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin 2> /dev/null > ' . $ud_path . '/opennlp_sentence/' . $base);
+  },
+  jtok => sub {
+    chdir '/euralex/JTok/bin';
+    run_shell('sh tokenize ' . $raw . ' de utf8 | ' . $cleanup . 'jtok.pl > ' . $ud_path . '/jtok/' . $base);
+    chdir '/euralex';
+  },
+  syntok => sub {
+    run_shell('python3 -m syntok.segmenter ' . $raw . ' | ' . $cleanup . 'tokenize_simple.pl > ' . $ud_path . '/syntok/' . $base);
+  },
+  somajo => sub {
+    run_shell('somajo-tokenizer --split_sentences ' . $raw . ' 2> /dev/null | ' . $tokenize_nn . ' > ' . $ud_path . '/somajo/' . $base);
+  },
+  stanford => sub {
+    my $out = $base . '.raw.out';
+    run_shell('CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
+      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ' . $raw . ' 2> /dev/null');
+    run_shell('perl /euralex/benchmarks/cleanup/stanford.pl ' . $out . ' | ' . $tokenize_nn . ' > ' . $ud_path . '/stanford/' . $base);
+    unlink $out or warn "Unable to remove $out: $!\n";
+  },
+  nnsplit => sub {
+    run_shell('./nnsplit/nnsplit_bench ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/nnsplit/' . $base);
+  },
+  spacy_dep => sub {
+    run_shell('python3 ./spacy/spacy_sent.py dep ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_dep/' . $base);
+  },
+  spacy_stat => sub {
+    run_shell('python3 ./spacy/spacy_sent.py stat ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_stat/' . $base);
+  },
+  spacy_sentencizer => sub {
+    run_shell('python3 ./spacy/spacy_sent.py sentencizer ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_sentencizer/' . $base);
+  },
+  'deep-eos_bi-lstm-de' => sub {
+    run_shell('python3 ./deep-eos/main.py --input-file ' . $raw . ' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_bi-lstm-de/' . $base);
+  },
+  'deep-eos_cnn-de' => sub {
+    run_shell('python3 ./deep-eos/main.py --input-file ' . $raw . ' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_cnn-de/' . $base);
+  },
+  'deep-eos_lstm-de' => sub {
+    run_shell('python3 ./deep-eos/main.py --input-file ' . $raw . ' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_lstm-de/' . $base);
+  },
+);
+
+# Uncomment a line to exclude the respective tool from the run
+#delete $tools{waste};
+#delete $tools{datok};
+#delete $tools{korap_tokenizer};
+#delete $tools{'opennlp_sentence'};
+#delete $tools{jtok};
+#delete $tools{syntok};
+#delete $tools{somajo};
+#delete $tools{stanford};
+#delete $tools{nnsplit};
+#delete $tools{'deep-eos_bi-lstm-de'};
+#delete $tools{'deep-eos_cnn-de'};
+#delete $tools{'deep-eos_lstm-de'};
+#delete $tools{'spacy_dep'};
+#delete $tools{'spacy_stat'};
+#delete $tools{'spacy_sentencizer'};
+#delete $tools{'cutter'};
+
+# Sorted so that run order and report order are the same on every
+# run, independent of Perl's hash ordering.
+my @tool_names = sort keys %tools;
+
+# Create project folders
+foreach my $tool (@tool_names) {
+  ensure_dir($ud_path . '/' . $tool);
+}
+
+# Run tokenization
+foreach my $tool (@tool_names) {
+  $tools{$tool}->();
+}
+
+# Compare every output with the gold file. The exit status of the
+# comparison script is not inspected.
+foreach my $tool (@tool_names) {
+  print "\n##########\n";
+  print "##### $tool - UD\n";
+  print "##\n";
+  system $cmd . ' -x ' . $gold . ' ' . $ud_path . '/' . $tool . '/' . $base . ' 2> /dev/null';
+}

diff --git a/nnsplit_bench/src/main.rs b/nnsplit_bench/src/main.rs
index f057006..5c3f8eb 100644
--- a/nnsplit_bench/src/main.rs
+++ b/nnsplit_bench/src/main.rs

@@ -15,7 +15,7 @@
     let splits = &splitter.split(&input)[0];
 
     for sentence in splits.iter() {
-        println!("{}", sentence.text());
+        println!("{}</eos>", sentence.text());
     }
 
     Ok(())

diff --git a/spacy/spacy_sent.py b/spacy/spacy_sent.py
index df294c0..b132b07 100644
--- a/spacy/spacy_sent.py
+++ b/spacy/spacy_sent.py

@@ -24,4 +24,4 @@
 
     for sent in doc.sents:
         print(sent.text)
-        print("</eos>")
+        print(" </eos> ")
commit	049e52606bcd1fb192789d49110607042a55e2f8	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Mar 18 09:59:34 2022 +0100
committer	Akron <nils@diewald-online.de>	Fri Mar 18 09:59:34 2022 +0100
tree	797fee47ae18690d1734547d83c4bff3d2ead858
parent	54fd31434d861d66feec491b9b1eff0e661b1225 [diff]