Clean up output of Stanford tokenizer and sentence splitter
Change-Id: I4d620d319b0546aef21a0f7070c4ab5c5356d646
diff --git a/benchmarks/cleanup/stanford.pl b/benchmarks/cleanup/stanford.pl
new file mode 100644
index 0000000..ced1274
--- /dev/null
+++ b/benchmarks/cleanup/stanford.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Rewrites the text output of the Stanford CoreNLP pipeline
+# (tokenize,ssplit,mwt) named by the first argument: prints one token
+# per line and an empty line in front of every sentence.
+
+my $path = $ARGV[0];
+die "Usage: $0 <corenlp-output-file>\n" unless defined $path;
+
+# Three-argument open: the file name can never be parsed as an open mode.
+open(my $file, '<', $path) or die "Unable to open file '$path': $!\n";
+
+# Stream line by line instead of reading the whole file into a list.
+while (my $line = <$file>) {
+  if ($line =~ s/^\[Text\=(.+?)\s+CharacterOffsetBegin\=\d+\s+CharacterOffsetEnd=\d+\]$/$1/) {
+    print $line;    # token line, reduced to the token; trailing newline kept
+  }
+  elsif ($line =~ m/^Sentence\s+\#\d+\s+\(/) {
+    print "\n";     # sentence header becomes the separator line
+  }
+}
+close($file);
+print STDERR "Done.\n";    # status on STDERR so it stays out of the redirected token output
diff --git a/benchmarks/empirist.pl b/benchmarks/empirist.pl
index ef03fda..f7a734c 100644
--- a/benchmarks/empirist.pl
+++ b/benchmarks/empirist.pl
@@ -50,18 +50,26 @@
somajo => sub {
my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
system 'somajo-tokenizer ' . $raw . ' 2> /dev/null > ' . $empirist_path . $_[1] . '/somajo/' . $_[0];
+ },
+  stanford => sub {    # $_[0]: file name, $_[1]: path segment between $gold_path and '/raw/'
+    my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
+    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
+      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ' . $raw . ' 2> /dev/null';
+    system 'perl /euralex/benchmarks/cleanup/stanford.pl ' . $_[0] . '.out > ' . $empirist_path . $_[1] . '/stanford/' . $_[0];    # reads <file>.out from the cwd
+    unlink($_[0] . '.out') or warn "Unable to remove $_[0].out: $!\n";    # builtin instead of spawning a shell for rm
}
);
-#delete $tools{waste};
-#delete $tools{datok};
-#delete $tools{korap_tokenizer};
-#delete $tools{opennlp_simple};
-#delete $tools{opennlp_tokenizer};
-#delete $tools{tree_tagger};
-#delete $tools{jtok};
-#delete $tools{syntok};
-#delete $tools{somajo};
+# delete $tools{waste};
+# delete $tools{datok};
+# delete $tools{korap_tokenizer};
+# delete $tools{opennlp_simple};
+# delete $tools{opennlp_tokenizer};
+# delete $tools{tree_tagger};
+# delete $tools{jtok};
+# delete $tools{syntok};
+# delete $tools{somajo};
+# delete $tools{stanford};
# Create project folders
foreach (keys %tools) {