Add Cutter, additional spaCy pipelines, and a Stanford tokenizer-only benchmark
Change-Id: I6ae4c014298d9c07e62850d39408b77cb145a9cd
diff --git a/Dockerfile b/Dockerfile
index fd276af..2d92c07 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -147,7 +147,7 @@
COPY nnsplit_bench /euralex/nnsplit_bench/
-RUN apt-get install -y cargo
+RUN apt-get update && apt-get install -y cargo
RUN cd ./nnsplit_bench && \
cargo build --release
@@ -192,6 +192,10 @@
RUN echo "SpaCy" && python3 ./spacy/spacy_tok.py example.txt
+# Download spaCy German models for sentence splitting
+RUN python3 -m spacy download de_core_news_sm && \
+ python3 -m spacy download de_dep_news_trf
+
###########################
# Install Stanford parser #
@@ -215,6 +219,17 @@
-file example.txt
+##################
+# Install Cutter #
+##################
+
+RUN pip3 install cutter-ng
+
+COPY cutter /euralex/cutter/
+
+RUN echo "Cutter\n" && python3 ./cutter/cutter.py nosent example.txt
+
+
#################
# Install Datok #
#################
diff --git a/benchmarks/benchmark.pl b/benchmarks/benchmark.pl
index 027901e..9dcc170 100644
--- a/benchmarks/benchmark.pl
+++ b/benchmarks/benchmark.pl
@@ -98,9 +98,21 @@
elephant => sub {
system './elephant-wrapper/bin/tokenize.sh -i ./corpus/'.$FILE.' UD_German > /dev/null'
},
- SpaCy => sub {
+ cutter => sub {
+ system 'python3 ./cutter/cutter.py nosent ./corpus/'.$FILE.' > /dev/null'
+ },
+ spacy_tok => sub {
system 'python3 ./spacy/spacy_tok.py ./corpus/'.$FILE.' > /dev/null'
},
+ spacy_dep => sub {
+ system 'python3 ./spacy/spacy_sent.py dep ./corpus/'.$FILE.' > /dev/null'
+ },
+ spacy_stat => sub {
+ system 'python3 ./spacy/spacy_sent.py stat ./corpus/'.$FILE.' > /dev/null'
+ },
+ spacy_sentencizer => sub {
+ system 'python3 ./spacy/spacy_sent.py sentencizer ./corpus/'.$FILE.' > /dev/null'
+ },
Stanford => sub {
system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
'-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ./corpus/' . $FILE
@@ -116,7 +128,11 @@
Stanford_t8 => sub {
system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
'-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=8 -file ./corpus/' . $FILE
- }
+ },
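+ # Stanford run with only the tokenize annotator, i.e. no sentence splitting or MWT.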
+ Stanford_tokonly => sub {
+ system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
+ '-props german -annotators tokenize -tokenize.language=german -file ./corpus/' . $FILE
+ },
};
#delete $models->{'SoMaJo'};
@@ -139,11 +155,17 @@
#delete $models->{'Waste'};
#delete $models->{'nnsplit'};
#delete $models->{'elephant'};
-#delete $models->{'SpaCy'};
#delete $models->{'Stanford'};
#delete $models->{'Stanford_t2'};
#delete $models->{'Stanford_t4'};
#delete $models->{'Stanford_t8'};
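+# The newly added tools are excluded from this run by default; comment these out to benchmark them.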
+delete $models->{'Stanford_tokonly'};
+delete $models->{'cutter'};
+delete $models->{'spacy_tok'};
+delete $models->{'spacy_sentencizer'};
+delete $models->{'spacy_dep'};
+delete $models->{'spacy_stat'};
+
my $t0 = Benchmark->new;
diff --git a/cutter/cutter.py b/cutter/cutter.py
new file mode 100644
index 0000000..a865a49
--- /dev/null
+++ b/cutter/cutter.py
@@ -0,0 +1,22 @@
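+# Tokenize a text file with Cutter (cutter-ng) and print one token per line.
+# Usage: python3 cutter.py {sent|nosent} FILE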
+import Cutter
+import sys
+
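+# Raise the recursion limit so long input texts can be processed.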
+sys.setrecursionlimit(100000)
+
+cutter = Cutter.Cutter(profile='de')
+
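+# First argument selects the output mode: "sent" also prints sentence boundaries.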
+sent = sys.argv[1]
+
+file = open(sys.argv[2], 'r')
+
+text = file.read()
+
+file.close()
+
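+# cutter.cut() yields (token, tag) pairs; tags starting with "+EOS" mark sentence ends.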
+for token in cutter.cut(text):
+    if token[0]:
+        print(token[0])
+
+    if sent == "sent":
+        if token[1].startswith("+EOS"):
+            print("</eos>")
diff --git a/spacy/spacy_sent.py b/spacy/spacy_sent.py
new file mode 100644
index 0000000..df294c0
--- /dev/null
+++ b/spacy/spacy_sent.py
@@ -0,0 +1,27 @@
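+# Split a text file into sentences with spaCy and print "</eos>" after each sentence.
+# Usage: python3 spacy_sent.py {dep|stat|sentencizer} FILE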
+import spacy
+import sys
+from spacy.lang.de import German
+
+# Slower but more accurate alternative model: "de_dep_news_trf"
+
+model = sys.argv[1]
+
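+# 'dep': sentence boundaries from the dependency parser (default de_core_news_sm pipeline)
+# 'stat': boundaries from the statistical 'senter' component, with the parser excluded
+# 'sentencizer': rule-based splitting on punctuation, no trained model required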
+if model == 'dep':
+ nlp = spacy.load("de_core_news_sm")
+elif model == 'stat':
+ nlp = spacy.load("de_core_news_sm", exclude=["parser"])
+ nlp.enable_pipe("senter")
+elif model == 'sentencizer':
+ nlp = German()
+ nlp.add_pipe("sentencizer")
+
+# Split the input into sentences with the selected pipeline and mark each boundary.
+
+with open(sys.argv[2], 'r') as f:
+    contents = f.read()
+
+    doc = nlp(contents)
+
+    for sent in doc.sents:
+        print(sent.text)
+        print("</eos>")