Add eos evaluation
Change-Id: Ia721ce1df8798fa2771059b4feb12eb56459325b
diff --git a/Readme.md b/Readme.md
index 00dd987..8f75046 100644
--- a/Readme.md
+++ b/Readme.md
@@ -63,9 +63,9 @@
-v ${PWD}/output_web:/euralex/empirist_web
```
-## `ud-tokens.pl`
+## `ud_tokens.pl`
-To run the evaluation suite against the
+To run the token evaluation suite against the
[Universal Dependency](https://github.com/UniversalDependencies/UD_German-GSD)
corpus, first install the empirist tooling as explained above,
and download the corpus.
@@ -75,13 +75,10 @@
-O corpus/de_gsd-ud-train.conllu
```
+## `ud_sentences.pl`
-```shell
-$ docker run --rm -it \
- -v ${PWD}/benchmarks:/euralex/benchmarks \
- -v ${PWD}/corpus:/euralex/corpus \
- korap/euralex2 benchmarks/empirist.pl
-```
+To run the sentence evaluation suite, first download the corpus
+as explained above.
# Tools
diff --git a/benchmarks/cleanup/eos.pl b/benchmarks/cleanup/eos.pl
new file mode 100644
index 0000000..02d0a43
--- /dev/null
+++ b/benchmarks/cleanup/eos.pl
@@ -0,0 +1,5 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# This script rewrites a file
diff --git a/benchmarks/cleanup/jtok.pl b/benchmarks/cleanup/jtok.pl
new file mode 100644
index 0000000..664e919
--- /dev/null
+++ b/benchmarks/cleanup/jtok.pl
@@ -0,0 +1,22 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+# Collect the 'Token: "..."' lines of each text unit and print one unit per line
+my $init = 1; # True until the first "Text Unit Start" marker was seen
+my $c = '';   # Accumulated token text of the current text unit
+foreach (<>) {
+  if (s/\s +Token: \"// && s/^(\"?[^\"]*?)\".+?$/$1/g) {
+    $c .= $_;
+  } elsif (m/Text Unit Start/) {
+    if ($init) {
+      $init = 0;
+    } else {
+      $c =~ s/[\s\n\t]+//g;
+      print $c,"\n";
+      $c = '';
+    };
+  };
+};
+# Flush the last text unit, as no further start marker follows it
+$c =~ s/[\s\n\t]+//g;
+print $c, "\n";
diff --git a/benchmarks/cleanup/split_conllu.pl b/benchmarks/cleanup/split_conllu.pl
index 9dfd824..441be36 100644
--- a/benchmarks/cleanup/split_conllu.pl
+++ b/benchmarks/cleanup/split_conllu.pl
@@ -9,6 +9,7 @@
open(X, '<' . $file);
open(RAW, '>' . $file . '.raw');
open(SPLIT, '>' . $file . '.split');
+open(EOS, '>' . $file . '.eos');
my $init;
@@ -21,6 +22,9 @@
print RAW ' ';
};
print RAW $1;
+ my $temp = $1;
+ $temp =~ s/[\s\n\t]+//g;
+ print EOS $temp, "\n";
}
elsif (m/^\d+[\s\t]/) {
if (/^\d+[\s\t]+([^\t\s]+)[\t\s]/) {
@@ -32,4 +36,5 @@
close(X);
close(RAW);
+close(EOS);
close(SPLIT);
diff --git a/benchmarks/cleanup/tokenize_eos.pl b/benchmarks/cleanup/tokenize_eos.pl
new file mode 100644
index 0000000..42f5ae7
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_eos.pl
@@ -0,0 +1,14 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Slurp the whole input; sentences are terminated by "</eos>" markers
+my $text = join '', <>;
+
+# Print every chunk between two markers on a line of its own,
+# with all whitespace (including line breaks) removed
+for my $sentence (split m{</eos>}, $text) {
+  $sentence =~ s/\s+//g;
+  print "$sentence\n";
+}
+
diff --git a/benchmarks/cleanup/tokenize_nn.pl b/benchmarks/cleanup/tokenize_nn.pl
new file mode 100644
index 0000000..3124c6a
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_nn.pl
@@ -0,0 +1,14 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Slurp the whole input and drop leading newlines; sentences are separated by an empty line
+my $text = join '', <>;
+$text =~ s/\A\n+//;
+
+# Print one sentence per line, with all whitespace removed
+for my $sentence (split /\n\n/, $text) {
+  $sentence =~ s/\s+//g;
+  print "$sentence\n";
+}
+
diff --git a/benchmarks/cleanup/tokenize_simple.pl b/benchmarks/cleanup/tokenize_simple.pl
new file mode 100644
index 0000000..cad1749
--- /dev/null
+++ b/benchmarks/cleanup/tokenize_simple.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+# Print every input line with all of its whitespace removed
+while (my $line = <>) {
+  $line =~ s/\s+//g;
+  print "$line\n";
+}
diff --git a/benchmarks/ud_sentences.pl b/benchmarks/ud_sentences.pl
new file mode 100644
index 0000000..c5f80c6
--- /dev/null
+++ b/benchmarks/ud_sentences.pl
@@ -0,0 +1,117 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Comparison script, run once per tool against the gold sentences
+my $cmd = '/euralex/corpus/empirist_gold_cmc/tools/compare_tokenization.perl';
+# my $cmd = '/euralex/corpus/deep-eos/eval.py';
+# Cleanup scripts bringing tool output into the one-sentence-per-line format
+my $cleanup = 'perl /euralex/benchmarks/cleanup/';
+my $tokenize_eos = $cleanup . 'tokenize_eos.pl';
+my $tokenize_nn = $cleanup . 'tokenize_nn.pl';
+
+# Output path (may already exist from a previous run)
+my $ud_path = '/euralex/ud_eos';
+-d $ud_path or mkdir $ud_path or die "Unable to create $ud_path: $!";
+
+my $base = 'de_gsd-ud-train.conllu';
+
+# Split the corpus into the raw text (.raw) and the gold sentences (.eos)
+chdir '/euralex/corpus/' or die "Unable to enter /euralex/corpus/: $!";
+system('perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base) == 0 or die "Splitting $base failed";
+chdir '/euralex' or die "Unable to enter /euralex: $!";
+
+my $gold = '/euralex/corpus/' . $base . '.eos';
+my $raw = '/euralex/corpus/' . $base . '.raw';
+
+# Tool name => sub that writes the tool's sentences (one per line, whitespace removed) to $ud_path/<tool>/$base
+my %tools = (
+  waste => sub {
+    system 'cat ' . $raw . ' | waste -N -v0 --rcfile=./Waste/waste.rc | ' . $tokenize_nn . ' > ' . $ud_path . '/waste/' . $base;
+  },
+  datok => sub {
+    system 'cat ' . $raw . ' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - | ' . $tokenize_nn . ' > ' . $ud_path . '/datok/' . $base;
+  },
+  cutter => sub {
+    system 'python3 ./cutter/cutter.py sent ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/cutter/' . $base;
+  },
+  korap_tokenizer => sub {
+    system 'cat ' . $raw . ' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -s -l de | ' . $tokenize_nn . ' > ' . $ud_path . '/korap_tokenizer/' . $base;
+  },
+  'opennlp_sentence' => sub {
+    # Strip whitespace like every other tool does, as the gold sentences contain none
+    system 'cat ' . $raw . ' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin 2> /dev/null | ' . $cleanup . 'tokenize_simple.pl > ' . $ud_path . '/opennlp_sentence/' . $base;
+  },
+  jtok => sub {
+    chdir '/euralex/JTok/bin';
+    system 'sh tokenize ' . $raw . ' de utf8 | ' . $cleanup . 'jtok.pl > ' . $ud_path . '/jtok/' . $base;
+    chdir '/euralex';
+  },
+  syntok => sub {
+    system 'python3 -m syntok.segmenter ' . $raw . ' | ' . $cleanup . 'tokenize_simple.pl > ' . $ud_path . '/syntok/' . $base;
+  },
+  somajo => sub {
+    system 'somajo-tokenizer --split_sentences ' . $raw . ' 2> /dev/null | ' . $tokenize_nn . ' > ' . $ud_path . '/somajo/' . $base;
+  },
+  stanford => sub {
+    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP -props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ' . $raw . ' 2> /dev/null';
+    system 'perl /euralex/benchmarks/cleanup/stanford.pl ' . $base . '.raw.out | ' . $tokenize_nn . ' > ' . $ud_path . '/stanford/' . $base;
+    system 'rm ' . $base . '.raw.out';
+  },
+  nnsplit => sub {
+    system './nnsplit/nnsplit_bench ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/nnsplit/' . $base;
+  },
+  spacy_dep => sub {
+    system 'python3 ./spacy/spacy_sent.py dep ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_dep/' . $base;
+  },
+  spacy_stat => sub {
+    system 'python3 ./spacy/spacy_sent.py stat ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_stat/' . $base;
+  },
+  spacy_sentencizer => sub {
+    system 'python3 ./spacy/spacy_sent.py sentencizer ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_sentencizer/' . $base;
+  },
+  'deep-eos_bi-lstm-de' => sub {
+    system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_bi-lstm-de/' . $base;
+  },
+  'deep-eos_cnn-de' => sub {
+    system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_cnn-de/' . $base;
+  },
+  'deep-eos_lstm-de' => sub {
+    system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_lstm-de/' . $base;
+  },
+);
+
+#delete $tools{waste};
+#delete $tools{datok};
+#delete $tools{korap_tokenizer};
+#delete $tools{'opennlp_sentence'};
+#delete $tools{jtok};
+#delete $tools{syntok};
+#delete $tools{somajo};
+#delete $tools{stanford};
+#delete $tools{nnsplit};
+#delete $tools{'deep-eos_bi-lstm-de'};
+#delete $tools{'deep-eos_cnn-de'};
+#delete $tools{'deep-eos_lstm-de'};
+#delete $tools{'spacy_dep'};
+#delete $tools{'spacy_stat'};
+#delete $tools{'spacy_sentencizer'};
+#delete $tools{'cutter'};
+
+
+# Create one output folder per tool (already existing folders are fine)
+foreach my $tool (sort keys %tools) {
+  my $dir = $ud_path . '/' . $tool;
+  -d $dir or mkdir $dir or warn "Unable to create $dir: $!\n";
+};
+# Run the sentence splitting of every enabled tool
+foreach my $tool (sort keys %tools) {
+  $tools{$tool}->();
+};
+# Compare each result with the gold sentences; the keys are sorted to keep
+# the report order stable, as Perl's hash order differs from run to run
+foreach my $tool (sort keys %tools) {
+  print "\n##########\n";
+  print "##### $tool - UD\n##\n";
+  system $cmd . ' -x ' . $gold . ' ' . $ud_path . '/' . $tool . '/' . $base . ' 2> /dev/null';
+};
diff --git a/nnsplit_bench/src/main.rs b/nnsplit_bench/src/main.rs
index f057006..5c3f8eb 100644
--- a/nnsplit_bench/src/main.rs
+++ b/nnsplit_bench/src/main.rs
@@ -15,7 +15,7 @@
let splits = &splitter.split(&input)[0];
for sentence in splits.iter() {
- println!("{}", sentence.text());
+ println!("{}</eos>", sentence.text());
}
Ok(())
diff --git a/spacy/spacy_sent.py b/spacy/spacy_sent.py
index df294c0..b132b07 100644
--- a/spacy/spacy_sent.py
+++ b/spacy/spacy_sent.py
@@ -24,4 +24,4 @@
for sent in doc.sents:
print(sent.text)
- print("</eos>")
+ print(" </eos> ")