Added Stanford CoreNLP Tokenizer
Change-Id: Ib9debb34cb9a66fad041cb584d641e155b46a347
diff --git a/Dockerfile b/Dockerfile
index c734929..fd276af 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -193,6 +193,28 @@
RUN echo "SpaCy" && python3 ./spacy/spacy_tok.py example.txt
+############################
+# Install Stanford CoreNLP #
+############################
+
+# Following https://stanfordnlp.github.io/CoreNLP/index.html
+# The release is pinned to 4.4.0 because the steps below hard-code the stanford-corenlp-4.4.0/ directory.
+# Download, unpack and clean up in a single RUN so the zip archive is not kept in an image layer.
+
+RUN wget https://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip && \
+ unzip stanford-corenlp-4.4.0.zip && \
+ rm stanford-corenlp-4.4.0.zip && \
+ wget "https://search.maven.org/remotecontent?filepath=edu/stanford/nlp/stanford-corenlp/4.4.0/stanford-corenlp-4.4.0-models-german.jar" -O stanford-corenlp-4.4.0/stanford-corenlp-4.4.0-models-german.jar
+
+# Smoke test: tokenize example.txt with -tokenize.language=german.
+# This run passes no -threads option (TODO carried over from the original note: "Run with threads!").
+RUN echo "StanfordNLP" && \
+ CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP \
+ -annotators tokenize \
+ -tokenize.language=german \
+ -file example.txt
+
+
#################
# Install Datok #
#################
diff --git a/benchmarks/benchmark.pl b/benchmarks/benchmark.pl
index 987a0a5..9ad5bd9 100644
--- a/benchmarks/benchmark.pl
+++ b/benchmarks/benchmark.pl
@@ -78,6 +78,14 @@
},
SpaCy => sub {
system 'python3 ./spacy/spacy_tok.py ./corpus/'.$FILE.' > /dev/null'
+ },
+ Stanford => sub { # Stanford CoreNLP (German props; tokenize, ssplit, mwt) without a -threads option
+ system join ' ', 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP',
+ '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file', "./corpus/$FILE"
+ },
+ Stanford_t4 => sub { # Same CoreNLP command plus -threads=4; NOTE(review): verify CoreNLP honours the -key=value spelling
+ system join ' ', 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP',
+ '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=4 -file', "./corpus/$FILE"
}
};
@@ -100,6 +108,8 @@
#delete $models->{'nnsplit'};
#delete $models->{'elephant'};
#delete $models->{'SpaCy'};
+#delete $models->{'Stanford'};
+#delete $models->{'Stanford_t4'};
my $t0 = Benchmark->new;