Initial import
Change-Id: I6315233ee1bfbdf7cc985cb336d0df7a10274189
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..3ec93e5
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,24 @@
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+venv/
+ENV/
+*.log
+.pytest_cache/
+.coverage
+htmlcov/
+dist/
+build/
+*.egg-info/
+.DS_Store
+*.conllu
+*.zip
+models/
+logs/
+tmp/
+.git/
+.gitignore
+README.md
+.github/
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..73a96da
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+venv/
+ENV/
+*.log
+.pytest_cache/
+.coverage
+htmlcov/
+dist/
+build/
+*.egg-info/
+.DS_Store
+*.conllu
+*.zip
+models/
+logs/
+tmp/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3f7be4d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,71 @@
+# Multi-stage Docker build for size optimization
+FROM python:3.12-slim-bookworm AS builder
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+ gcc \
+ g++ \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PIP_CACHE_DIR="/tmp/.cache/pip" \
+    PYTHONPATH="${PYTHONPATH}:."
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Set the working directory and copy requirements
+WORKDIR /app
+COPY requirements.txt /app/requirements.txt
+
+# Install Python dependencies in virtual environment
+RUN python -m venv venv
+RUN venv/bin/pip install --upgrade pip
+RUN venv/bin/pip install -r requirements.txt
+
+# Production stage
+FROM python:3.12-slim-bookworm AS production
+
+# Install minimal runtime dependencies
+RUN apt-get update && apt-get install -y \
+ wget \
+ coreutils \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Copy virtual environment from builder
+COPY --from=builder /app/venv /app/venv
+
+# Copy application code
+COPY lib /app/lib
+COPY systems /app/systems
+COPY my_utils /app/my_utils
+COPY docker-entrypoint.sh /docker-entrypoint.sh
+COPY download_with_progress.py /app/download_with_progress.py
+
+# Set environment variables
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHONPATH="${PYTHONPATH}:."
+
+# spaCy processing configuration
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="True"
+ENV SPACY_PARSE_TIMEOUT="30"
+ENV SPACY_MAX_SENTENCE_LENGTH="500"
+ENV SPACY_N_PROCESS="10"
+ENV SPACY_BATCH_SIZE="2000"
+ENV SPACY_CHUNK_SIZE="20000"
+
+WORKDIR /app
+RUN mkdir -p "/app/logs" "/app/tmp" "/local/models"
+
+# Set temp directories to use app directory instead of system /tmp
+ENV TMPDIR="/app/tmp"
+ENV TEMP="/app/tmp"
+ENV TMP="/app/tmp"
+
+# Make entrypoint executable
+RUN chmod +x /docker-entrypoint.sh
+
+# Define the entry point
+ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/Dockerfile.with-models b/Dockerfile.with-models
new file mode 100644
index 0000000..39c7bd4
--- /dev/null
+++ b/Dockerfile.with-models
@@ -0,0 +1,96 @@
+# Dockerfile with pre-installed models
+# Build: docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
+
+# Multi-stage Docker build for size optimization
+FROM python:3.12-slim-bookworm AS builder
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+ gcc \
+ g++ \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PIP_CACHE_DIR="/tmp/.cache/pip" \
+    PYTHONPATH="${PYTHONPATH}:."
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Set the working directory and copy requirements
+WORKDIR /app
+COPY requirements.txt /app/requirements.txt
+
+# Install Python dependencies in virtual environment
+RUN python -m venv venv
+RUN venv/bin/pip install --upgrade pip
+RUN venv/bin/pip install -r requirements.txt
+
+# Download spaCy models to /local/models
+RUN mkdir -p /local/models
+
+# Download the default model (de_core_news_lg)
+RUN venv/bin/python -m spacy download de_core_news_lg --no-cache-dir
+
+# Move model to /local/models for persistence
+RUN MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_lg')") && \
+ mv "$MODEL_PATH" /local/models/de_core_news_lg
+
+# Optionally download additional models
+# Uncomment to include medium model:
+# RUN venv/bin/python -m spacy download de_core_news_md --no-cache-dir && \
+# MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_md')") && \
+# mv "$MODEL_PATH" /local/models/de_core_news_md
+
+# Uncomment to include small model:
+# RUN venv/bin/python -m spacy download de_core_news_sm --no-cache-dir && \
+# MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_sm')") && \
+# mv "$MODEL_PATH" /local/models/de_core_news_sm
+
+# Production stage
+FROM python:3.12-slim-bookworm AS production
+
+# Install minimal runtime dependencies
+RUN apt-get update && apt-get install -y \
+ wget \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Copy virtual environment from builder
+COPY --from=builder /app/venv /app/venv
+
+# Copy pre-downloaded models
+COPY --from=builder /local/models /local/models
+
+# Copy application code
+COPY lib /app/lib
+COPY systems /app/systems
+COPY my_utils /app/my_utils
+COPY docker-entrypoint.sh /docker-entrypoint.sh
+COPY download_with_progress.py /app/download_with_progress.py
+
+# Set environment variables
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHONPATH="${PYTHONPATH}:."
+
+# spaCy processing configuration
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="True"
+ENV SPACY_PARSE_TIMEOUT="30"
+ENV SPACY_MAX_SENTENCE_LENGTH="500"
+ENV SPACY_N_PROCESS="10"
+ENV SPACY_BATCH_SIZE="2000"
+ENV SPACY_CHUNK_SIZE="20000"
+
+WORKDIR /app
+RUN mkdir -p "/app/logs" "/app/tmp"
+
+# Set temp directories to use app directory instead of system /tmp
+ENV TMPDIR="/app/tmp"
+ENV TEMP="/app/tmp"
+ENV TMP="/app/tmp"
+
+# Make entrypoint executable
+RUN chmod +x /docker-entrypoint.sh
+
+# Define the entry point
+ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..4cb360d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,37 @@
+BSD 2-Clause License
+
+Copyright (c) 2025, IDS Mannheim
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+---
+
+This project includes components from:
+
+spaCy (https://spacy.io/)
+ License: MIT License
+ Copyright (c) 2016-2025 ExplosionAI GmbH
+
+GermaLemma (https://github.com/WZBSocialScienceCenter/germalemma)
+ License: Apache License 2.0
+ Copyright (c) 2017 Markus Konrad
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5d1b22d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,27 @@
+.PHONY: build build-with-models preload-models run test clean
+
+build:
+ docker build -t korap/conllu-spacy:latest .
+
+build-with-models:
+ docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
+
+preload-models:
+ @echo "Preloading default model (de_core_news_lg) to ./models..."
+ ./preload-models.sh
+
+preload-models-all:
+ @echo "Preloading all models to ./models..."
+ ./preload-models.sh de_core_news_lg ./models
+ ./preload-models.sh de_core_news_md ./models
+ ./preload-models.sh de_core_news_sm ./models
+
+run:
+ docker run --rm -i korap/conllu-spacy:latest
+
+test:
+ @echo "Testing with sample input..."
+ @echo "Not implemented yet - add test input file"
+
+clean:
+ docker rmi korap/conllu-spacy:latest
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3ab2b6b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,235 @@
+# spaCy Docker Image with CoNLL-U Support
+
+Docker image for **spaCy** POS tagging, lemmatization and dependency parsing with support for input and output in [CoNLL-U format](https://universaldependencies.org/format.html).
+
+This is a slim, focused implementation extracted from [sota-pos-lemmatizers](https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/sota-pos-lemmatizers), originally developed by José Angel Daza (@angel-daza). It follows the same pattern as [conllu-treetagger-docker](https://github.com/KorAP/conllu-treetagger-docker).
+
+## Features
+
+- **CoNLL-U input/output**: Reads and writes CoNLL-U format
+- **On-demand model fetching**: Models are downloaded on first run and cached in `/local/models`
+- **GermaLemma integration**: Enhanced lemmatization for German (optional)
+- **Morphological features**: Extracts and formats morphological features in CoNLL-U format
+- **Dependency parsing**: Optional dependency relations (HEAD/DEPREL columns)
+- **Flexible configuration**: Environment variables for batch size, chunk size, timeouts, etc.
+
+## Installation
+
+### From source
+
+```shell
+git clone https://github.com/KorAP/conllu-spacy-tagger-docker.git
+cd conllu-spacy-tagger-docker
+docker build -t korap/conllu-spacy .
+```
+
+## Usage
+
+### Basic usage
+
+```shell
+# Default: German model with dependency parsing and GermaLemma
+docker run --rm -i korap/conllu-spacy < input.conllu > output.conllu
+```
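+
+The container expects pre-tokenized CoNLL-U on stdin: ten tab-separated columns per token, a blank line after each sentence, and `#` comment lines, which are passed through to the output. In practice only the FORM column needs real values on input; the remaining columns may be left as `_`:
+
+```
+1	Das	_	_	_	_	_	_	_	_
+2	ist	_	_	_	_	_	_	_	_
+3	ein	_	_	_	_	_	_	_	_
+4	Test	_	_	_	_	_	_	_	_
+5	.	_	_	_	_	_	_	_	_
+```
+
+The tagger fills in the LEMMA, UPOS, XPOS and FEATS columns, plus HEAD and DEPREL when dependency parsing is enabled. The output below is purely illustrative; the actual tags, lemmas and dependency labels depend on the model and its version:
+
+```
+1	Das	der	PRON	PDS	Case=Nom|Gender=Neut|Number=Sing	4	sb	_	_
+2	ist	sein	AUX	VAFIN	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	ROOT	_	_
+3	ein	ein	DET	ART	Case=Nom|Gender=Masc|Number=Sing	4	nk	_	_
+4	Test	Test	NOUN	NN	Case=Nom|Gender=Masc|Number=Sing	2	pd	_	_
+5	.	.	PUNCT	$.	_	2	punct	_	_
+```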
+
+### Faster processing without dependency parsing
+
+```shell
+# Disable dependency parsing for faster processing
+docker run --rm -i korap/conllu-spacy -d < input.conllu > output.conllu
+```
+
+### Using a different spaCy model
+
+```shell
+# Use a different model (will be downloaded if not available)
+docker run --rm -i korap/conllu-spacy -m de_core_news_sm < input.conllu > output.conllu
+```
+
+### Persisting Models
+
+To avoid downloading the language model on every run, mount a local directory to `/local/models`:
+
+```shell
+docker run --rm -i -v /path/to/local/models:/local/models korap/conllu-spacy < input.conllu > output.conllu
+```
+
+The first run will download the model to `/path/to/local/models/`, and subsequent runs will reuse it.
+
+### Preloading Models
+
+There are several ways to preload models before running the container:
+
+#### Option 1: Using the preload script (recommended)
+
+```shell
+# Preload the default model (de_core_news_lg)
+./preload-models.sh
+
+# Preload a specific model
+./preload-models.sh de_core_news_sm
+
+# Preload to a custom directory
+./preload-models.sh de_core_news_lg /path/to/models
+
+# Then run with the preloaded models
+docker run --rm -i -v ./models:/local/models korap/conllu-spacy < input.conllu
+```
+
+#### Option 2: Build image with models included
+
+```shell
+# Build an image with models pre-installed
+docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
+
+# Run without needing to mount volumes
+docker run --rm -i korap/conllu-spacy:with-models < input.conllu > output.conllu
+```
+
+Edit `Dockerfile.with-models` to include additional models (sm, md) by uncommenting the relevant lines.
+
+#### Option 3: Manual download
+
+```shell
+# Create models directory
+mkdir -p ./models
+
+# Download using a temporary container
+docker run --rm -v ./models:/models python:3.12-slim bash -c "
+ pip install -q spacy &&
+ python -m spacy download de_core_news_lg &&
+ python -c 'import spacy, shutil, site;
+ shutil.copytree(site.getsitepackages()[0] + \"/de_core_news_lg\", \"/models/de_core_news_lg\")'
+"
+
+# Use the preloaded model
+docker run --rm -i -v ./models:/local/models korap/conllu-spacy < input.conllu
+```
+
+### Running with korapxmltool
+
+`korapxmltool`, which includes `korapxml2conllu` as a shortcut, can be downloaded from [https://github.com/KorAP/korapxmltool](https://github.com/KorAP/korapxmltool).
+
+```shell
+korapxml2conllu goe.zip | docker run --rm -i korap/conllu-spacy
+```
+
+#### Generate a spaCy-tagged KorAP XML zip directly
+
+```shell
+korapxmltool -A "docker run --rm -i korap/conllu-spacy" -t zip goe.zip
+```
+
+### Command-line Options
+
+```
+Usage: docker run --rm -i korap/conllu-spacy [OPTIONS]
+
+Options:
+ -h Display help message
+ -m MODEL Specify spaCy model (default: de_core_news_lg)
+ -L List available/installed models
+ -d Disable dependency parsing (faster processing)
+ -g Disable GermaLemma (use spaCy lemmatizer only)
+```
+
+### Environment Variables
+
+You can customize processing behavior with environment variables:
+
+```shell
+docker run --rm -i \
+ -e SPACY_USE_DEPENDENCIES="False" \
+ -e SPACY_USE_GERMALEMMA="True" \
+ -e SPACY_CHUNK_SIZE="10000" \
+ -e SPACY_BATCH_SIZE="1000" \
+ -e SPACY_N_PROCESS="1" \
+ -e SPACY_PARSE_TIMEOUT="30" \
+ -e SPACY_MAX_SENTENCE_LENGTH="500" \
+ korap/conllu-spacy < input.conllu > output.conllu
+```
+
+**Available environment variables:**
+
+- `SPACY_USE_DEPENDENCIES`: Enable/disable dependency parsing (default: "True")
+- `SPACY_USE_GERMALEMMA`: Enable/disable GermaLemma (default: "True")
+- `SPACY_CHUNK_SIZE`: Number of sentences to process per chunk (default: 20000)
+- `SPACY_BATCH_SIZE`: Batch size for spaCy processing (default: 2000)
+- `SPACY_N_PROCESS`: Number of processes (default: 10)
+- `SPACY_PARSE_TIMEOUT`: Timeout for dependency parsing per sentence in seconds (default: 30)
+- `SPACY_MAX_SENTENCE_LENGTH`: Maximum sentence length for dependency parsing in tokens (default: 500)
+
+### Examples
+
+```shell
+# Fast processing: disable dependency parsing
+docker run --rm -i korap/conllu-spacy -d < input.conllu > output.conllu
+
+# Use spaCy lemmatizer only (without GermaLemma)
+docker run --rm -i korap/conllu-spacy -g < input.conllu > output.conllu
+
+# Smaller model for faster download
+docker run --rm -i korap/conllu-spacy -m de_core_news_sm < input.conllu > output.conllu
+
+# Persistent model storage
+docker run --rm -i -v ./models:/local/models korap/conllu-spacy < input.conllu > output.conllu
+```
+
+### Miscellaneous commands
+
+List installed models:
+
+```shell
+docker run --rm -i korap/conllu-spacy -L
+```
+
+Open a shell within the container:
+
+```shell
+docker run --rm -it --entrypoint /bin/bash korap/conllu-spacy
+```
+
+## Supported Models
+
+Any spaCy model can be specified with the `-m` option. Models will be downloaded automatically on first use.
+
+Common German models:
+- `de_core_news_lg` (default, 560MB) - Large German model
+- `de_core_news_md` (100MB) - Medium German model
+- `de_core_news_sm` (15MB) - Small German model
+
+See [spaCy Models](https://spacy.io/models) for a complete list.
+
+## Performance
+
+From the sota-pos-lemmatizers benchmarks on the TIGER corpus (50,472 sentences):
+
+| Configuration | Lemma Acc | POS Acc | POS F1 | sents/sec |
+|--------------------------------|-----------|---------|--------|-----------|
+| spaCy + GermaLemma             | **90.98** | **99.07** | **95.84** | **1,230** |
+| spaCy (without GermaLemma) | 85.33 | 99.07 | 95.84 | 1,577 |
+
+**Note**: Disabling dependency parsing (`-d` flag) significantly improves processing speed while maintaining POS tagging and lemmatization quality.
+
+## Architecture
+
+The project consists of the following components (a usage sketch follows the list):
+
+- **Dockerfile**: Multi-stage build for optimized image size
+- **docker-entrypoint.sh**: Entry point script that handles model fetching and CLI argument parsing
+- **systems/parse_spacy_pipe.py**: Main spaCy processing pipeline
+- **lib/CoNLL_Annotation.py**: CoNLL-U format parsing and token classes
+- **my_utils/file_utils.py**: File handling utilities for chunked processing
+
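+The sketch below shows how these pieces fit together when run directly (outside Docker). It is a minimal, illustrative example rather than the full pipeline: it assumes the repository root is on `PYTHONPATH` and that `de_core_news_lg` is installed, uses an arbitrary chunk size, and omits GermaLemma, dependency output, the whitespace tokenizer and the chunking loop used by `parse_spacy_pipe.py`:
+
+```python
+import sys
+
+import spacy
+
+from lib.CoNLL_Annotation import get_token_type
+import my_utils.file_utils as fu
+
+nlp = spacy.load("de_core_news_lg", disable=["ner", "parser"])  # assumes the model is installed
+
+# Read up to 1000 annotated sentences from stdin ("#" marks comment/metadata lines)
+annos, has_next = fu.get_file_annos_chunk(
+    sys.stdin, chunk_size=1000,
+    token_class=get_token_type("CoNLLUP_Token"), comment_str="#")
+
+for anno in annos:
+    # parse_spacy_pipe.py swaps in a whitespace tokenizer to preserve the input tokenization
+    doc = nlp(anno.get_sentence())
+    if anno.metadata:
+        print("\n".join(anno.metadata))
+    for i, tok in enumerate(doc, start=1):
+        # The real pipeline emits all ten CoNLL-U columns; this prints a reduced set
+        print(f"{i}\t{tok.text}\t{tok.lemma_}\t{tok.pos_}\t{tok.tag_}")
+    print()
+```
+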
+## Credits
+
+Based on the [sota-pos-lemmatizers](https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/sota-pos-lemmatizers) evaluation project, originally by [José Angel Daza](https://github.com/angel-daza) and [Marc Kupietz](https://github.com/kupietz), with contributions by [Rebecca Wilm](https://github.com/rebecca-wilm). It follows the pattern established by [conllu-treetagger-docker](https://github.com/KorAP/conllu-treetagger-docker).
+
+- **spaCy**: [https://spacy.io/](https://spacy.io/)
+- **GermaLemma**: [https://github.com/WZBSocialScienceCenter/germalemma](https://github.com/WZBSocialScienceCenter/germalemma)
+
+## License
+
+This project is licensed under the BSD 2-Clause License (see [LICENSE](LICENSE)). It includes components with their own licenses:
+- spaCy: MIT License
+- GermaLemma: Apache License 2.0
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 0000000..1ada4a7
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+
+set -o pipefail
+
+# Default values
+model="de_core_news_lg"
+use_dependencies="True"
+use_germalemma="True"
+
+usage() {
+ echo "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]"
+ echo " -h Display this help message"
+ echo " -m MODEL Specify spaCy model (default: $model)"
+ echo " -L List available/installed models"
+ echo " -d Disable dependency parsing (faster processing)"
+ echo " -g Disable GermaLemma (use spaCy lemmatizer only)"
+ exit 1
+}
+
+# Parse command line options
+while getopts "hm:Ldg" opt; do
+ case $opt in
+ h)
+ usage
+ ;;
+ m)
+ model="$OPTARG"
+ ;;
+ L)
+ python -m spacy info 2>/dev/null || echo "No models installed"
+ exit 0
+ ;;
+ d)
+ use_dependencies="False"
+ ;;
+ g)
+ use_germalemma="False"
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG" >&2
+ usage
+ ;;
+ :)
+ echo "Option -$OPTARG requires an argument" >&2
+ usage
+ ;;
+ esac
+done
+
+if [ $OPTIND -le $# ]; then
+ usage
+fi
+
+MODEL_DIR="/local/models"
+MODEL_PATH="$MODEL_DIR/$model"
+
+# Ensure MODEL_DIR exists
+mkdir -p "$MODEL_DIR"
+
+# Function to check if model is installed and usable
+is_model_installed() {
+ local model_name="$1"
+ # Check if model is installed in the venv
+ python -c "import spacy; spacy.load('$model_name')" 2>/dev/null
+ return $?
+}
+
+# Function to check if preloaded model exists and is valid
+has_preloaded_model() {
+ local model_path="$1"
+ # Check for config.cfg which indicates a valid spaCy model
+ if [ -f "$model_path/config.cfg" ]; then
+ return 0
+ fi
+ return 1
+}
+
+# Function to install model
+install_model() {
+ local model_name="$1"
+
+ # Check if model exists in /local/models - if so, we'll use absolute path
+ if has_preloaded_model "$MODEL_PATH"; then
+ echo "Found preloaded model in $MODEL_PATH" >&2
+ echo "Will use absolute path to avoid download" >&2
+ return 0
+ fi
+
+ # Check if already installed in venv
+ if is_model_installed "$model_name"; then
+ echo "Model $model_name already installed in venv" >&2
+ return 0
+ fi
+
+ # Try to download model to /local/models if writable
+ if [ -w "$MODEL_DIR" ]; then
+ # Download and install to /local/models with progress
+ if python /app/download_with_progress.py "$model_name" 2>&1 | tee /tmp/spacy_download.log >&2; then
+ # Try to move the installed model to /local/models for persistence
+ SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
+ INSTALLED_MODEL="$SITE_PACKAGES/$model_name"
+
+ if [ -d "$INSTALLED_MODEL" ]; then
+ echo "Moving model to $MODEL_PATH for persistence..." >&2
+ mv "$INSTALLED_MODEL" "$MODEL_PATH" 2>/dev/null || true
+ # Create symlink back
+ ln -sf "$MODEL_PATH" "$INSTALLED_MODEL" 2>/dev/null || true
+ echo "Model saved to $MODEL_PATH" >&2
+ fi
+ return 0
+ else
+ echo "Failed to download model $model_name" >&2
+ return 1
+ fi
+ else
+ # MODEL_DIR not writable, install to venv (ephemeral)
+ echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
+ if python /app/download_with_progress.py "$model_name" 2>&1 | tee /tmp/spacy_download.log >&2; then
+ return 0
+ else
+ echo "Failed to download model $model_name" >&2
+ return 1
+ fi
+ fi
+}
+
+# Install or verify model
+if ! install_model "$model"; then
+ echo "ERROR: Could not install model $model, aborting." >&2
+ exit 1
+fi
+
+# Determine which model path to use
+# If preloaded model exists, use absolute path; otherwise use model name
+if has_preloaded_model "$MODEL_PATH"; then
+ MODEL_TO_USE="$MODEL_PATH"
+ echo "Using preloaded model at: $MODEL_TO_USE" >&2
+else
+ MODEL_TO_USE="$model"
+ echo "Using installed model: $MODEL_TO_USE" >&2
+fi
+
+# Set environment variables for the Python script
+export SPACY_USE_DEPENDENCIES="$use_dependencies"
+export SPACY_USE_GERMALEMMA="$use_germalemma"
+
+# Log configuration
+echo "Configuration:" >&2
+echo " Model: $MODEL_TO_USE" >&2
+echo " Use dependencies: $use_dependencies" >&2
+echo " Use GermaLemma: $use_germalemma" >&2
+
+# Run the spaCy tagging pipeline
+python /app/systems/parse_spacy_pipe.py \
+ --spacy_model "$MODEL_TO_USE" \
+ --corpus_name "stdin" \
+ --gld_token_type "CoNLLUP_Token" \
+ --comment_str "#"
diff --git a/download_with_progress.py b/download_with_progress.py
new file mode 100755
index 0000000..a907fe5
--- /dev/null
+++ b/download_with_progress.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""
+Download spaCy model with progress bar
+"""
+import sys
+import subprocess
+import re
+
+def main():
+ if len(sys.argv) < 2:
+ print("Usage: download_with_progress.py MODEL_NAME")
+ sys.exit(1)
+
+ model_name = sys.argv[1]
+
+ print(f"Downloading {model_name}...", file=sys.stderr)
+ print("This may take several minutes for large models (de_core_news_lg is ~560MB)", file=sys.stderr)
+ print("", file=sys.stderr)
+
+ # Run spacy download with unbuffered output
+ process = subprocess.Popen(
+ [sys.executable, "-u", "-m", "spacy", "download", model_name, "--no-cache-dir"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ universal_newlines=True,
+ bufsize=1
+ )
+
+ download_started = False
+
+ for line in iter(process.stdout.readline, ''):
+ if not line:
+ break
+
+ # Print the line
+ print(line.rstrip(), file=sys.stderr)
+
+ # Detect download progress
+ if 'Downloading' in line and not download_started:
+ download_started = True
+ print("Download in progress...", file=sys.stderr)
+
+ # Look for percentage or size indicators
+ if '%' in line or 'MB' in line or 'KB' in line:
+ # Extract and show progress
+ match = re.search(r'(\d+)%', line)
+ if match:
+ percent = match.group(1)
+ bar_length = 40
+ filled = int(bar_length * int(percent) / 100)
+ bar = '█' * filled + '░' * (bar_length - filled)
+ print(f"\rProgress: [{bar}] {percent}%", end='', file=sys.stderr)
+
+ process.stdout.close()
+ return_code = process.wait()
+
+ if return_code != 0:
+ print(f"\nError: Download failed with code {return_code}", file=sys.stderr)
+ sys.exit(return_code)
+
+ print("\n✓ Download complete!", file=sys.stderr)
+ return 0
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/lib/CoNLL_Annotation.py b/lib/CoNLL_Annotation.py
new file mode 100644
index 0000000..1a8ddf0
--- /dev/null
+++ b/lib/CoNLL_Annotation.py
@@ -0,0 +1,220 @@
+from collections import defaultdict, OrderedDict
+import re
+
+# CoNLL-U Format - https://universaldependencies.org/format.html
+
+
+def get_token_type(type_str):
+ if type_str =="CoNLL09_Token":
+ return CoNLL09_Token
+ elif type_str == "RNNTagger_Token":
+ return RNNTagger_Token
+ elif type_str == "CoNLLUP_Token":
+ return CoNLLUP_Token
+ elif type_str == "TigerNew_Token":
+ return TigerNew_Token
+ else:
+ raise NotImplementedError(f"I don't know what to do with {type_str} token type!")
+
+
+class TigerNew_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split() # [FORM, XPOS]
+ self.info = info
+ self.id = word_ix + 1 # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[0]
+ self.lemma = "_"
+ self.pos_universal = "_"
+ self.pos_tag = info[1]
+ self.detail_tag = "_"
+ self.head = "_"
+ self.dep_tag = "_"
+ self.blank = "_"
+ self.auto_score = "_"
+
+ def get_info(self):
+ return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+ str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+ def get_conllU_line(self, separator="\t"):
+ info = self.get_info()
+ return separator.join(info)
+
+
+class RNNTagger_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split() # [FORM, XPOS.FEATS, LEMMA]
+ self.info = info
+ self.id = word_ix + 1 # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[0]
+ self.lemma = info[2]
+ self.pos_universal = "_"
+ self.pos_tag, self.detail_tag = self._process_tag(info[1]) # 'NN.Gen.Sg.Fem'
+ self.head = "_"
+ self.dep_tag = "_"
+ self.blank = "_"
+ self.auto_score = "_"
+
+ def _process_tag(self, tag):
+ if tag == "_" or "." not in tag: return tag, "_"
+ info = tag.split(".")
+ return info[0], "|".join(info[1:])
+
+ def get_info(self):
+ return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+ str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+ def get_conllU_line(self, separator="\t"):
+ info = self.get_info()
+ return separator.join(info)
+
+
+class CoNLLUP_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split()
+ # print(info)
+ # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+ # [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
+ self.info = info
+ self.id = info[0] # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[1]
+ self.lemma = info[2]
+ self.pos_universal = info[3]
+ self.pos_tag = self._process_tag(info[4]) # 'XPOS=NE|Case=Nom|Gender=Masc|Number=Sing' TODO: Reuse MorphInfo in the self.detail_tag
+ self.detail_tag = info[5]
+ self.head = info[6]
+ self.dep_tag = info[7]
+ self.blank = info[8] # ???
+ self.auto_score = info[9]
+
+ def _process_tag(self, tag):
+ if tag == "_" or "|" not in tag: return tag # The XPOS=NE|Case=Nom... is only for Turku!
+ info = tag.split("|")
+ info = [x.split("=") for x in info]
+ return info[0][1]
+
+ def get_info(self):
+ return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+ str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+ def get_conllU_line(self, separator="\t"):
+ info = self.get_info()
+ return separator.join(info)
+
+
+
+class CoNLL09_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split()
+ # print(info)
+ # # ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5', 'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
+ self.info = info
+ self.id = info[0] # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[1]
+ self.lemma = info[2]
+ self.pos_universal = "_" # _convert_to_universal(self.pos_tag, self.lemma)
+ self.pos_tag = info[4]
+ self.head = info[8]
+ self.dep_tag = info[10]
+ self.detail_tag = "_"
+ self.is_pred = True if info[12] == "Y" else False
+ if self.is_pred:
+ self.pred_sense = info[13].strip("[]")
+ self.pred_sense_id = str(self.position) + "##" + self.pred_sense
+ else:
+ self.pred_sense = None
+ self.pred_sense_id = ""
+ if len(info) > 14:
+ self.labels = info[14:]
+ else:
+ self.labels = []
+
+ def get_conllU_line(self, separator="\t"):
+ # We want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+ tok_id = str(self.id) #.split("_")[0]
+ conllUinfo = [tok_id, self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag, self.head, self.dep_tag, "_", "_"]
+ return separator.join(conllUinfo)
+
+ def get_conll09_line(self, delim="\t"):
+ # We want:
+ # 1 Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
+ # 10 fall fall fall VB VB _ _ 8 8 VC VC Y fall.01 _ _ _ _ _
+ is_pred_str = "Y" if self.is_pred else "_"
+ sense_str = self.pred_sense if self.is_pred else "_"
+ info = [self.id, self.word, self.lemma, self.lemma, self.pos_tag, self.pos_tag, "_", self.detail_tag,
+ self.head, self.head, self.dep_tag, self.dep_tag, is_pred_str, sense_str] + self.labels
+ return delim.join(info)
+
+
+
+################################# GETTING SENTENCE ANNOTATIONS ####################################
+class AnnotatedSentence():
+ def __init__(self):
+ self.metadata = []
+ self.tokens = []
+
+ def get_words(self):
+ return [tok.word for tok in self.tokens]
+
+ def get_sentence(self):
+ return " ".join([tok.word for tok in self.tokens])
+
+ def get_pos_tags(self, universal=False):
+ if universal:
+ return [tok.pos_universal for tok in self.tokens]
+ else:
+ return [tok.pos_tag for tok in self.tokens]
+
+
+def get_annotation(raw_lines, raw_meta, token_class):
+ ann = AnnotatedSentence()
+ ann.metadata = [m.strip("\n") for m in raw_meta]
+ # Annotate the predicates and senses
+ real_index = 0
+ for i, line in enumerate(raw_lines):
+ tok = token_class(line, real_index)
+ ann.tokens.append(tok)
+ real_index += 1
+ return ann
+
+
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:", our_foundry="spacy"):
+ n_sents = 0
+ annotated_sentences, buffer_meta, buffer_lst = [], [], []
+ for i, line in enumerate(line_generator):
+ if line.startswith(comment_str):
+ line = re.sub(r'(foundry\s*=\s*).*', r"\1" + our_foundry, line)
+ line = re.sub(r'(filename\s*=\s* .[^/]*/[^/]+/[^/]+/).*', r"\1" + our_foundry + "/morpho.xml", line)
+ buffer_meta.append(line)
+ continue
+ if len(line.split()) > 0:
+ buffer_lst.append(line)
+ else:
+ ann = get_annotation(buffer_lst, buffer_meta, token_class)
+ n_sents += 1
+ buffer_lst, buffer_meta = [], []
+ annotated_sentences.append(ann)
+ if chunk_size > 0 and n_sents == chunk_size: break
+ # logger.info("Read {} Sentences!".format(n_sents))
+ return annotated_sentences, n_sents
+
+
+def read_conll_generator(filepath, token_class=CoNLLUP_Token, sent_sep=None, comment_str="###C:"):
+ buffer_meta, buffer_lst = [], []
+ sentence_finished = False
+ with open(filepath) as f:
+ for i, line in enumerate(f.readlines()):
+ if sent_sep and sent_sep in line: sentence_finished = True
+ if line.startswith(comment_str):
+ continue
+ if len(line.split()) > 0 and not sentence_finished:
+ buffer_lst.append(line)
+ else:
+ ann = get_annotation(buffer_lst, buffer_meta, token_class)
+ buffer_lst, buffer_meta = [], []
+ sentence_finished = False
+ yield ann
\ No newline at end of file
diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/lib/__init__.py
diff --git a/my_utils/__init__.py b/my_utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/my_utils/__init__.py
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
new file mode 100644
index 0000000..c5eb8ed
--- /dev/null
+++ b/my_utils/file_utils.py
@@ -0,0 +1,124 @@
+import requests, logging, json
+import subprocess, time
+import glob
+import os.path, sys
+from lib.CoNLL_Annotation import read_conll, read_conll_generator
+
+logger = logging.getLogger(__name__)
+
+
+def list_to_file(my_list, out_path):
+ with open(out_path, "w") as out:
+ for item_str in my_list:
+ out.write(f"{item_str}\n")
+
+def counter_to_file(my_counter, out_path):
+ with open(out_path, "w") as out:
+ for item, count in my_counter:
+ item_str = "\t".join(item)
+ out.write(f"{item_str}\t{count}\n")
+
+def dict_to_file(my_dict, out_path):
+ with open(out_path, "w", encoding='utf8') as out:
+ json.dump(my_dict, fp=out, ensure_ascii=False)
+
+
+def file_to_dict(file_path):
+ d = {}
+ with open(file_path) as f:
+ d = json.load(f)
+ return d
+
+
+def write_conll_file(conll_objs, out_path):
+ with open(out_path, "w", encoding='utf8') as out:
+ for obj in conll_objs:
+ for tok in obj.tokens:
+ out.write(tok.get_conllU_line()+"\n")
+ out.write("\n")
+
+def file_generator(file_path):
+ with open(file_path, "r") as data_file:
+ logger.info("Reading instances from lines in file at: %s", file_path)
+ for line in data_file:
+ if not line: continue
+ yield line
+
+
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:", our_foundry="spacy"):
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str, our_foundry=our_foundry)
+ if n_sents == 0: file_has_next = False
+ sents, gld, meta = [], [], []
+ return chunk, file_has_next
+
+
+def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+ """ Same as get_file_annos_chunk but directly get (text, labels) pairs"""
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents == 0: file_has_next = False
+ sents, gld, meta = [], [], []
+ for anno in chunk:
+ if len(anno.metadata) > 0: meta.append("\n".join(anno.metadata))
+ sents.append(anno.get_sentence())
+ gld.append(anno.get_pos_tags())
+ return sents, gld, file_has_next
+
+
+def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents < chunk_size: file_has_next = False
+ raw_text = ""
+ for anno in chunk:
+ if len(anno.metadata) > 0:
+ raw_text += "\n".join(anno.metadata) + "\n"
+ else:
+ raw_text += "\n"
+ for tok in anno.tokens:
+ raw_text += tok.get_conllU_line() + "\n"
+ raw_text += "\n"
+ return raw_text, file_has_next, n_sents
+
+
+def turku_parse_file(raw_text, filename, chunk_ix):
+ out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
+ # For each file make a request to obtain the parse back
+ logger.info(f"Sending Request {chunk_ix} to Parser Server...")
+ response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
+ response_to_file(response.text, out_file_str)
+
+
+
+def response_to_file(response_str, fname):
+ fout = open(fname, "w")
+ fout.write(response_str)
+ fout.close()
+
+
+def expand_file(f, substitute_comment=False):
+ # Expand the .gz file
+ fname = f[:-3]
+ if not os.path.isfile(fname):
+ p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
+ if p == 0:
+ logger.info("Successfully uncompressed file")
+ else:
+ logger.info(f"Couldn't expand file {f}")
+ raise Exception
+ else:
+ logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+ # Substitute the Commentary Lines on the Expanded file
+ if substitute_comment:
+ fixed_filename = f"{fname}.fixed"
+ p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w")) # stdout=subprocess.PIPE
+ if p == 0:
+ logger.info("Successfully fixed comments on file")
+ else:
+ logger.info(f"Something went wrong when substituting commentaries")
+ raise Exception
+ return fixed_filename
+ else:
+ return fname
diff --git a/preload-models.sh b/preload-models.sh
new file mode 100755
index 0000000..ad770f8
--- /dev/null
+++ b/preload-models.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Script to preload spaCy models to a local directory
+# Usage: ./preload-models.sh [MODEL_NAME] [TARGET_DIR]
+
+set -e
+
+MODEL_NAME="${1:-de_core_news_lg}"
+TARGET_DIR="${2:-./models}"
+
+echo "Preloading spaCy model: $MODEL_NAME"
+echo "Target directory: $TARGET_DIR"
+
+# Create target directory if it doesn't exist
+mkdir -p "$TARGET_DIR"
+
+# Check if model already exists
+if [ -d "$TARGET_DIR/$MODEL_NAME" ]; then
+ echo "Model $MODEL_NAME already exists in $TARGET_DIR"
+ echo "Remove it first if you want to re-download: rm -rf $TARGET_DIR/$MODEL_NAME"
+ exit 0
+fi
+
+echo "Downloading model using temporary Docker container..."
+
+# Use a temporary container to download the model
+docker run --rm -v "$(realpath "$TARGET_DIR")":/models python:3.12-slim-bookworm bash -c "
+ set -e
+ echo 'Installing spaCy...'
+ pip install -q spacy
+
+ echo 'Downloading model $MODEL_NAME...'
+ echo 'This may take several minutes depending on your connection speed.'
+ python -m spacy download $MODEL_NAME --no-cache-dir 2>&1 | while IFS= read -r line; do
+ echo \"\$line\"
+ # Show progress dots for download
+ if [[ \"\$line\" == *\"Downloading\"* ]]; then
+ echo -n \"Progress: \"
+ fi
+ done
+
+ echo 'Moving model to /models...'
+ python -c \"
+import spacy
+import shutil
+import site
+import os
+
+# Get the installed model path
+site_packages = site.getsitepackages()[0]
+model_path = site_packages + '/$MODEL_NAME'
+
+# spaCy packages contain a subdirectory with the versioned model
+# Find the actual model directory (e.g., de_core_news_lg-3.8.0)
+items = os.listdir(model_path)
+model_subdir = None
+for item in items:
+ item_path = os.path.join(model_path, item)
+ if os.path.isdir(item_path) and '$MODEL_NAME' in item:
+ model_subdir = item_path
+ break
+
+if model_subdir:
+ # Copy the actual model directory
+ shutil.copytree(model_subdir, '/models/$MODEL_NAME')
+ print(f'Model copied successfully from {model_subdir}!')
+else:
+ # Fallback: copy the whole package
+ shutil.copytree(model_path, '/models/$MODEL_NAME')
+ print('Model copied successfully!')
+\"
+"
+
+if [ -d "$TARGET_DIR/$MODEL_NAME" ]; then
+ echo ""
+ echo "✓ Model $MODEL_NAME successfully preloaded to $TARGET_DIR/$MODEL_NAME"
+ echo ""
+ echo "You can now run the container with:"
+ echo " docker run --rm -i -v $(realpath $TARGET_DIR):/local/models korap/conllu-spacy"
+else
+ echo "✗ Error: Model download failed"
+ exit 1
+fi
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..46c867c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+pip
+wheel
+thinc
+spacy
+germalemma
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
new file mode 100644
index 0000000..0abfb33
--- /dev/null
+++ b/systems/parse_spacy_pipe.py
@@ -0,0 +1,327 @@
+from sys import stdin
+import argparse, os
+import spacy
+from spacy.tokens import Doc
+import logging, sys, time, signal
+from lib.CoNLL_Annotation import get_token_type
+import my_utils.file_utils as fu
+from germalemma import GermaLemma
+
+# Dependency parsing safety limits
+DEFAULT_PARSE_TIMEOUT = 0.5 # seconds per sentence
+DEFAULT_MAX_SENTENCE_LENGTH = 500 # tokens
+
+class TimeoutException(Exception):
+ pass
+
+def timeout_handler(signum, frame):
+ raise TimeoutException("Dependency parsing timeout")
+
+def safe_dependency_parse(spacy_model, text, timeout=DEFAULT_PARSE_TIMEOUT, max_length=DEFAULT_MAX_SENTENCE_LENGTH):
+ """
+ Safely parse a sentence with timeout and length limits.
+
+ Args:
+ spacy_model: Loaded spaCy model
+ text: Text to parse
+ timeout: Maximum seconds to wait for parsing
+ max_length: Maximum sentence length in tokens
+
+ Returns:
+ tuple: (spacy_doc, success, warning_message)
+ """
+ # Check sentence length
+ if len(text.split()) > max_length:
+ # Process without dependency parsing for long sentences
+ disabled_components = ["ner", "parser"]
+ doc = spacy_model(text, disable=disabled_components)
+ return doc, False, f"Sentence too long ({len(text.split())} tokens > {max_length}), dependency parsing skipped"
+
+ # Set up timeout
+ old_handler = signal.signal(signal.SIGALRM, timeout_handler)
+ signal.setitimer(signal.ITIMER_REAL, timeout)
+
+ try:
+ doc = spacy_model(text)
+ signal.setitimer(signal.ITIMER_REAL, 0) # Cancel alarm
+ signal.signal(signal.SIGALRM, old_handler)
+ return doc, True, None
+ except TimeoutException:
+ signal.setitimer(signal.ITIMER_REAL, 0) # Cancel alarm
+ signal.signal(signal.SIGALRM, old_handler)
+ # Retry without dependency parsing
+ disabled_components = ["ner", "parser"]
+ doc = spacy_model(text, disable=disabled_components)
+ return doc, False, f"Dependency parsing timeout after {timeout}s, processed without dependencies"
+ except Exception as e:
+ signal.setitimer(signal.ITIMER_REAL, 0) # Cancel alarm
+ signal.signal(signal.SIGALRM, old_handler)
+ # Retry without dependency parsing
+ disabled_components = ["ner", "parser"]
+ doc = spacy_model(text, disable=disabled_components)
+ return doc, False, f"Dependency parsing error: {str(e)}, processed without dependencies"
+
+def format_morphological_features(token):
+ """
+ Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+ Args:
+ token: spaCy token object
+
+ Returns:
+ str: Formatted morphological features string for CoNLL-U 5th column
+ Returns "_" if no features are available
+ """
+ if not hasattr(token, 'morph') or not token.morph:
+ return "_"
+
+ morph_dict = token.morph.to_dict()
+ if not morph_dict:
+ return "_"
+
+ # Format as CoNLL-U format: Feature=Value|Feature2=Value2
+ features = []
+ for feature, value in sorted(morph_dict.items()):
+ features.append(f"{feature}={value}")
+
+ return "|".join(features)
+
+
+def format_dependency_relations(doc):
+ """
+ Extract and format dependency relations from a spaCy doc for CoNLL-U output.
+
+ Args:
+ doc: spaCy Doc object
+
+ Returns:
+ list: List of tuples (head_id, deprel) for each token
+ """
+ dependencies = []
+ for i, token in enumerate(doc):
+ # HEAD column: 1-based index of the head token (0 for root)
+ if token.dep_ == "ROOT":
+ head_id = 0
+ else:
+ # Find the 1-based index of the head token
+ head_id = None
+ for j, potential_head in enumerate(doc):
+ if potential_head == token.head:
+ head_id = j + 1
+ break
+ if head_id is None:
+ head_id = 0 # Fallback to root if head not found
+
+ # DEPREL column: dependency relation
+ deprel = token.dep_ if token.dep_ else "_"
+
+ dependencies.append((head_id, deprel))
+
+ return dependencies
+
+
+class WhitespaceTokenizer(object):
+ def __init__(self, vocab):
+ self.vocab = vocab
+
+ def __call__(self, text):
+ words = text.split(' ')
+ # Filter out empty strings to avoid spaCy errors
+ words = [w for w in words if w]
+ # Handle edge case of empty input - use a placeholder token
+ if not words:
+ words = ['_EMPTY_']
+ # All tokens 'own' a subsequent space character in this tokenizer
+ spaces = [True] * len(words)
+ return Doc(self.vocab, words=words, spaces=spaces)
+
+
+def get_conll_str(anno_obj, spacy_doc, use_germalemma, use_dependencies):
+ # First lines are comments. (metadata)
+ conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+
+ # Get dependency relations if enabled
+ dependencies = format_dependency_relations(spacy_doc) if use_dependencies == "True" else None
+
+ for ix, token in enumerate(spacy_doc):
+ morph_features = format_morphological_features(token)
+
+ # Get HEAD and DEPREL columns
+ if dependencies:
+ head_id, deprel = dependencies[ix]
+ else:
+ head_id, deprel = "_", "_"
+
+ if use_germalemma == "True":
+ content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")
+ else:
+ content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_") # Pure SpaCy!
+ conll_lines.append("\t".join(content))
+ return "\n".join(conll_lines)
+
+
+def find_germalemma(word, pos, spacy_lemma):
+ simplify_pos = {"ADJA":"ADJ", "ADJD":"ADJ",
+ "NA":"N", "NE":"N", "NN":"N",
+ "ADV":"ADV", "PAV":"ADV", "PROAV":"ADV", "PAVREL":"ADV", "PWAV":"ADV", "PWAVREL":"ADV",
+ "VAFIN":"V", "VAIMP":"V", "VAINF":"V", "VAPP":"V", "VMFIN":"V", "VMINF":"V",
+ "VMPP":"V", "VVFIN":"V", "VVIMP":"V", "VVINF":"V", "VVIZU":"V","VVPP":"V"
+ }
+ # simplify_pos = {"VERB": "V", "ADV": "ADV", "ADJ": "ADJ", "NOUN":"N", "PROPN": "N"}
+ try:
+ return lemmatizer.find_lemma(word, simplify_pos.get(pos, "UNK"))
+ except:
+ return spacy_lemma
+
+
+if __name__ == "__main__":
+ """
+ --- Example Real Data TEST ---
+
+ cat /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/zca18.conllu | python systems/parse_spacy_pipe.py \
+ --corpus_name DeReKo_zca18 --comment_str "#" > output_zca18.conll
+ """
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
+ parser.add_argument("-sm", "--spacy_model", help="Spacy model containing the pipeline to tag", default="de_core_news_lg")
+ parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLLUP_Token")
+ parser.add_argument("-ugl", "--use_germalemma", help="Use Germalemma lemmatizer on top of SpaCy", default="True")
+ parser.add_argument("-udp", "--use_dependencies", help="Include dependency parsing (adds HEAD/DEPREL columns, set to False for faster processing)", default="True")
+ parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
+ args = parser.parse_args()
+
+ file_has_next, chunk_ix = True, 0
+ CHUNK_SIZE = int(os.getenv("SPACY_CHUNK_SIZE", "20000"))
+ SPACY_BATCH = int(os.getenv("SPACY_BATCH_SIZE", "2000"))
+ SPACY_PROC = int(os.getenv("SPACY_N_PROCESS", "1"))
+
+ # =====================================================================================
+ # LOGGING INFO ...
+ # =====================================================================================
+ logger = logging.getLogger(__name__)
+ console_hdlr = logging.StreamHandler(sys.stderr)
+ file_hdlr = logging.FileHandler(filename=f"logs/Parse_{args.corpus_name}.SpaCy.log")
+
+ # Custom format without module name
+ formatter = logging.Formatter('%(levelname)s: %(message)s')
+ console_hdlr.setFormatter(formatter)
+ file_hdlr.setFormatter(formatter)
+
+ logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
+
+ # Override with environment variables if set (useful for Docker)
+ import os
+ if os.getenv("SPACY_USE_DEPENDENCIES") is not None:
+ args.use_dependencies = os.getenv("SPACY_USE_DEPENDENCIES", "True")
+ logger.info(f"Using SPACY_USE_DEPENDENCIES environment variable: {args.use_dependencies}")
+
+ if os.getenv("SPACY_USE_GERMALEMMA") is not None:
+ args.use_germalemma = os.getenv("SPACY_USE_GERMALEMMA", "True")
+ logger.info(f"Using SPACY_USE_GERMALEMMA environment variable: {args.use_germalemma}")
+
+ logger.info(f"Chunking {args.corpus_name} Corpus in chunks of {CHUNK_SIZE} Sentences")
+ logger.info(f"Processing configuration: batch_size={SPACY_BATCH}, n_process={SPACY_PROC}")
+
+ # =====================================================================================
+ # POS TAG DOCUMENTS
+ # =====================================================================================
+ # Configure which components to disable based on dependency parsing option
+ disabled_components = ["ner"]
+ if args.use_dependencies != "True":
+ disabled_components.append("parser")
+ logger.info("Dependency parsing disabled for faster processing")
+ else:
+ logger.info("Dependency parsing enabled (slower but includes HEAD/DEPREL)")
+
+ spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
+ spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
+
+ # Increase max_length to handle very long sentences (especially when parser is disabled)
+ spacy_de.max_length = 10000000 # 10M characters
+
+ lemmatizer = GermaLemma()
+
+ # Log version information
+ logger.info(f"spaCy version: {spacy.__version__}")
+ logger.info(f"spaCy model: {args.spacy_model}")
+ logger.info(f"spaCy model version: {spacy_de.meta.get('version', 'unknown')}")
+ try:
+ import germalemma
+ logger.info(f"GermaLemma version: {germalemma.__version__}")
+ except AttributeError:
+ logger.info("GermaLemma version: unknown (no __version__ attribute)")
+
+ # Parse timeout and sentence length limits from environment variables
+ parse_timeout = float(os.getenv("SPACY_PARSE_TIMEOUT", str(DEFAULT_PARSE_TIMEOUT)))
+ max_sentence_length = int(os.getenv("SPACY_MAX_SENTENCE_LENGTH", str(DEFAULT_MAX_SENTENCE_LENGTH)))
+
+ logger.info(f"Dependency parsing limits: timeout={parse_timeout}s, max_length={max_sentence_length} tokens")
+
+ start = time.time()
+ total_processed_sents = 0
+ dependency_warnings = 0
+
+ while file_has_next:
+ annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
+ if len(annos) == 0: break
+ total_processed_sents += len(annos)
+
+ # Calculate progress statistics
+ elapsed_time = time.time() - start
+ sents_per_sec = total_processed_sents / elapsed_time if elapsed_time > 0 else 0
+ current_time = time.strftime("%Y-%m-%d %H:%M:%S")
+
+ logger.info(f"{current_time} | Processed: {total_processed_sents} sentences | Elapsed: {elapsed_time:.1f}s | Speed: {sents_per_sec:.1f} sents/sec")
+
+ sents = [a.get_sentence() for a in annos]
+
+ # Process sentences individually when dependency parsing is enabled for timeout protection
+ if args.use_dependencies == "True":
+ for ix, sent in enumerate(sents):
+ doc, dependency_success, warning = safe_dependency_parse(
+ spacy_de, sent, timeout=parse_timeout, max_length=max_sentence_length
+ )
+ if warning:
+ dependency_warnings += 1
+ logger.warning(f"Sentence {total_processed_sents - len(sents) + ix + 1}: {warning}")
+
+ # Override use_dependencies based on actual parsing success
+ actual_use_dependencies = "True" if dependency_success else "False"
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=actual_use_dependencies)
+ print(conll_str+ "\n")
+ else:
+ # Use batch processing for faster processing when dependencies are disabled
+ # Use n_process=1 to avoid multiprocessing deadlocks and memory issues with large files
+ try:
+ for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=1)):
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as e:
+ logger.error(f"Batch processing failed: {str(e)}")
+ logger.info("Falling back to individual sentence processing...")
+ # Fallback: process sentences individually
+ for ix, sent in enumerate(sents):
+ try:
+ doc = spacy_de(sent)
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as sent_error:
+ logger.error(f"Failed to process sentence {total_processed_sents - len(sents) + ix + 1}: {str(sent_error)}")
+ logger.error(f"Sentence preview: {sent[:100]}...")
+ # Output a placeholder to maintain alignment
+ conll_str = get_conll_str(annos[ix], spacy_de("ERROR"), use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+
+ end = time.time()
+ total_time = end - start
+ final_sents_per_sec = total_processed_sents / total_time if total_time > 0 else 0
+
+ logger.info(f"=== Processing Complete ===")
+ logger.info(f"Total sentences: {total_processed_sents}")
+ logger.info(f"Total time: {total_time:.2f}s")
+ logger.info(f"Average speed: {final_sents_per_sec:.1f} sents/sec")
+
+ if dependency_warnings > 0:
+ logger.info(f"Dependency parsing warnings: {dependency_warnings} sentences processed without dependencies")
+
\ No newline at end of file