Add SpaCy
Change-Id: Ic87af300ee2f557db9b065e023f11e4f9bbb9ffa
diff --git a/.gitignore b/.gitignore
index 54163b0..7565a4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
/sandbox
/Sandbox
+/corpus/*.txt
\#*
*~
.*
diff --git a/Dockerfile b/Dockerfile
index 4c421f5..8444470 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -183,6 +183,17 @@
#################
+# Install SpaCy #
+#################
+
+RUN pip3 install -U spacy
+
+COPY spacy /euralex/spacy/
+
+RUN echo "SpaCy" && python3 ./spacy/spacy_tok.py example.txt
+
+
+#################
# Install Datok #
#################
@@ -208,6 +219,17 @@
RUN echo "KorAP-Tokenizer\n" && cat example.txt | java -jar KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s -
+RUN useradd -ms /bin/bash euralex
+
+RUN rm -r ./nnsplit_bench && \
+ rm /euralex/v0.1.zip
+
+RUN chown euralex:euralex -R /euralex/treetagger
+
+USER euralex
+
+WORKDIR /euralex
+
ENTRYPOINT [ "sh" ]
LABEL maintainer="korap@ids-mannheim.de"
diff --git a/spacy/spacy_tok.py b/spacy/spacy_tok.py
new file mode 100644
index 0000000..62f2bff
--- /dev/null
+++ b/spacy/spacy_tok.py
@@ -0,0 +1,17 @@
+import sys
+
+from spacy.lang.de import German
+
+nlp = German()
+
+# Create a tokenizer with the default settings for German
+# including punctuation rules and exceptions
+tokenizer = nlp.tokenizer
+
+with open(sys.argv[1], 'r') as f:
+ contents = f.read()
+
+ tokens = tokenizer(contents)
+
+ for t in tokens:
+ print(t)