Make spaCy process count, batch size, and chunk size configurable via environment variables
Change-Id: Ic511e217e8c6cfd3e83a162c1dfb7da07ebb4bf4
diff --git a/Dockerfile b/Dockerfile
index 9499173..ceeae9c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -49,9 +49,17 @@
ENV SPACY_USE_GERMALEMMA="True"
ENV SPACY_PARSE_TIMEOUT="30"
ENV SPACY_MAX_SENTENCE_LENGTH="500"
+ENV SPACY_N_PROCESS="10"
+ENV SPACY_BATCH_SIZE="2000"
+ENV SPACY_CHUNK_SIZE="20000"
WORKDIR /app
-RUN mkdir -p "/app/logs"
+RUN mkdir -p "/app/logs" "/app/tmp"
+
+# Set temp directories to use app directory instead of system /tmp
+ENV TMPDIR="/app/tmp"
+ENV TEMP="/app/tmp"
+ENV TMP="/app/tmp"
# Define the entry point
CMD ["python", "/app/systems/parse_spacy_pipe.py"]
\ No newline at end of file
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index cd7d6c5..402d2d6 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -187,9 +187,9 @@
args = parser.parse_args()
file_has_next, chunk_ix = True, 0
- CHUNK_SIZE = 20000
- SPACY_BATCH = 2000
- SPACY_PROC = 10
+ CHUNK_SIZE = int(os.getenv("SPACY_CHUNK_SIZE", "20000"))
+ SPACY_BATCH = int(os.getenv("SPACY_BATCH_SIZE", "2000"))
+ SPACY_PROC = int(os.getenv("SPACY_N_PROCESS", "10"))
# =====================================================================================
# LOGGING INFO ...
@@ -210,6 +210,7 @@
logger.info(f"Using SPACY_USE_GERMALEMMA environment variable: {args.use_germalemma}")
logger.info(f"Chunking {args.corpus_name} Corpus in chunks of {CHUNK_SIZE} Sentences")
+ logger.info(f"Processing configuration: batch_size={SPACY_BATCH}, n_process={SPACY_PROC}")
# =====================================================================================
# POS TAG DOCUMENTS