Add SpaCy

Change-Id: Ic87af300ee2f557db9b065e023f11e4f9bbb9ffa
diff --git a/.gitignore b/.gitignore
index 54163b0..7565a4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 /sandbox
 /Sandbox
+/corpus/*.txt
 \#*
 *~
 .*
diff --git a/Dockerfile b/Dockerfile
index 4c421f5..8444470 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -183,6 +183,17 @@
 
 
 #################
+# Install SpaCy #
+#################
+
+RUN pip3 install -U spacy
+
+COPY spacy /euralex/spacy/
+
+RUN echo "SpaCy" && python3 ./spacy/spacy_tok.py example.txt
+
+
+#################
 # Install Datok #
 #################
 
@@ -208,6 +219,17 @@
 RUN echo "KorAP-Tokenizer\n" && cat example.txt | java -jar KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s -
 
 
+RUN useradd -ms /bin/bash euralex
+
+RUN rm -r ./nnsplit_bench && \
+    rm /euralex/v0.1.zip
+
+RUN chown euralex:euralex -R /euralex/treetagger
+
+USER euralex
+
+WORKDIR /euralex
+
 ENTRYPOINT [ "sh" ]
 
 LABEL maintainer="korap@ids-mannheim.de"
diff --git a/spacy/spacy_tok.py b/spacy/spacy_tok.py
new file mode 100644
index 0000000..62f2bff
--- /dev/null
+++ b/spacy/spacy_tok.py
@@ -0,0 +1,17 @@
+import sys
+
+from spacy.lang.de import German
+
+nlp = German()
+
+# Create a Tokenizer with the default settings for German,
+# including punctuation rules and exceptions
+tokenizer = nlp.tokenizer
+
+with open(sys.argv[1], 'r') as f:
+    contents = f.read()
+
+    tokens = tokenizer(contents)
+
+    for t in tokens:
+        print(t)