Initial import
Change-Id: I6315233ee1bfbdf7cc985cb336d0df7a10274189
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..3ec93e5
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,24 @@
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+venv/
+ENV/
+*.log
+.pytest_cache/
+.coverage
+htmlcov/
+dist/
+build/
+*.egg-info/
+.DS_Store
+*.conllu
+*.zip
+models/
+logs/
+tmp/
+.git/
+.gitignore
+README.md
+.github/
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..73a96da
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+venv/
+ENV/
+*.log
+.pytest_cache/
+.coverage
+htmlcov/
+dist/
+build/
+*.egg-info/
+.DS_Store
+*.conllu
+*.zip
+models/
+logs/
+tmp/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3f7be4d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,71 @@
+# Multi-stage Docker build for size optimization
+FROM python:3.12-slim-bookworm AS builder
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+ gcc \
+ g++ \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PIP_CACHE_DIR="/tmp/.cache/pip" \
+    PYTHONPATH="${PYTHONPATH}:."
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Set the working directory and copy requirements
+WORKDIR /app
+COPY requirements.txt /app/requirements.txt
+
+# Install Python dependencies in virtual environment
+RUN python -m venv venv
+RUN venv/bin/pip install --upgrade pip
+RUN venv/bin/pip install -r requirements.txt
+
+# Production stage
+FROM python:3.12-slim-bookworm AS production
+
+# Install minimal runtime dependencies
+RUN apt-get update && apt-get install -y \
+ wget \
+ coreutils \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Copy virtual environment from builder
+COPY --from=builder /app/venv /app/venv
+
+# Copy application code
+COPY lib /app/lib
+COPY systems /app/systems
+COPY my_utils /app/my_utils
+COPY docker-entrypoint.sh /docker-entrypoint.sh
+COPY download_with_progress.py /app/download_with_progress.py
+
+# Set environment variables
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHONPATH="${PYTHONPATH}:."
+
+# spaCy processing configuration
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="True"
+ENV SPACY_PARSE_TIMEOUT="30"
+ENV SPACY_MAX_SENTENCE_LENGTH="500"
+ENV SPACY_N_PROCESS="10"
+ENV SPACY_BATCH_SIZE="2000"
+ENV SPACY_CHUNK_SIZE="20000"
+
+WORKDIR /app
+RUN mkdir -p "/app/logs" "/app/tmp" "/local/models"
+
+# Set temp directories to use app directory instead of system /tmp
+ENV TMPDIR="/app/tmp"
+ENV TEMP="/app/tmp"
+ENV TMP="/app/tmp"
+
+# Make entrypoint executable
+RUN chmod +x /docker-entrypoint.sh
+
+# Define the entry point
+ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/Dockerfile.with-models b/Dockerfile.with-models
new file mode 100644
index 0000000..39c7bd4
--- /dev/null
+++ b/Dockerfile.with-models
@@ -0,0 +1,96 @@
+# Dockerfile with pre-installed models
+# Build: docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
+
+# Multi-stage Docker build for size optimization
+FROM python:3.12-slim-bookworm AS builder
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+ gcc \
+ g++ \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PIP_CACHE_DIR="/tmp/.cache/pip" \
+    PYTHONPATH="${PYTHONPATH}:."
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Set the working directory and copy requirements
+WORKDIR /app
+COPY requirements.txt /app/requirements.txt
+
+# Install Python dependencies in virtual environment
+RUN python -m venv venv
+RUN venv/bin/pip install --upgrade pip
+RUN venv/bin/pip install -r requirements.txt
+
+# Download spaCy models to /local/models
+RUN mkdir -p /local/models
+
+# Download the default model (de_core_news_lg)
+RUN venv/bin/python -m spacy download de_core_news_lg --no-cache-dir
+
+# Move model to /local/models for persistence
+RUN MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_lg')") && \
+ mv "$MODEL_PATH" /local/models/de_core_news_lg
+
+# Optionally download additional models
+# Uncomment to include medium model:
+# RUN venv/bin/python -m spacy download de_core_news_md --no-cache-dir && \
+# MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_md')") && \
+# mv "$MODEL_PATH" /local/models/de_core_news_md
+
+# Uncomment to include small model:
+# RUN venv/bin/python -m spacy download de_core_news_sm --no-cache-dir && \
+# MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_sm')") && \
+# mv "$MODEL_PATH" /local/models/de_core_news_sm
+
+# Production stage
+FROM python:3.12-slim-bookworm AS production
+
+# Install minimal runtime dependencies
+RUN apt-get update && apt-get install -y \
+ wget \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Copy virtual environment from builder
+COPY --from=builder /app/venv /app/venv
+
+# Copy pre-downloaded models
+COPY --from=builder /local/models /local/models
+
+# Copy application code
+COPY lib /app/lib
+COPY systems /app/systems
+COPY my_utils /app/my_utils
+COPY docker-entrypoint.sh /docker-entrypoint.sh
+COPY download_with_progress.py /app/download_with_progress.py
+
+# Set environment variables
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHONPATH="${PYTHONPATH}:."
+
+# spaCy processing configuration
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="True"
+ENV SPACY_PARSE_TIMEOUT="30"
+ENV SPACY_MAX_SENTENCE_LENGTH="500"
+ENV SPACY_N_PROCESS="10"
+ENV SPACY_BATCH_SIZE="2000"
+ENV SPACY_CHUNK_SIZE="20000"
+
+WORKDIR /app
+RUN mkdir -p "/app/logs" "/app/tmp"
+
+# Set temp directories to use app directory instead of system /tmp
+ENV TMPDIR="/app/tmp"
+ENV TEMP="/app/tmp"
+ENV TMP="/app/tmp"
+
+# Make entrypoint executable
+RUN chmod +x /docker-entrypoint.sh
+
+# Define the entry point
+ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..4cb360d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,37 @@
+BSD 2-Clause License
+
+Copyright (c) 2025, IDS Mannheim
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+---
+
+This project includes components from:
+
+spaCy (https://spacy.io/)
+ License: MIT License
+ Copyright (c) 2016-2025 ExplosionAI GmbH
+
+GermaLemma (https://github.com/WZBSocialScienceCenter/germalemma)
+ License: Apache License 2.0
+ Copyright (c) 2017 Markus Konrad
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5d1b22d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,27 @@
+.PHONY: build build-with-models preload-models run test clean
+
+build:
+ docker build -t korap/conllu-spacy:latest .
+
+build-with-models:
+ docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
+
+preload-models:
+ @echo "Preloading default model (de_core_news_lg) to ./models..."
+ ./preload-models.sh
+
+preload-models-all:
+ @echo "Preloading all models to ./models..."
+ ./preload-models.sh de_core_news_lg ./models
+ ./preload-models.sh de_core_news_md ./models
+ ./preload-models.sh de_core_news_sm ./models
+
+run:
+ docker run --rm -i korap/conllu-spacy:latest
+
+test:
+ @echo "Testing with sample input..."
+ @echo "Not implemented yet - add test input file"
+
+clean:
+ docker rmi korap/conllu-spacy:latest
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3ab2b6b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,235 @@
+# spaCy Docker Image with CoNLL-U Support
+
+Docker image for **spaCy** POS tagging, lemmatization and dependency parsing with support for input and output in [CoNLL-U format](https://universaldependencies.org/format.html).
+
+This is a slim, focused implementation extracted from [sota-pos-lemmatizers](https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/sota-pos-lemmatizers), originally developed by José Angel Daza (@angel-daza). It follows the same pattern as [conllu-treetagger-docker](https://github.com/KorAP/conllu-treetagger-docker).
+
+## Features
+
+- **CoNLL-U input/output**: Reads and writes CoNLL-U format
+- **On-demand model fetching**: Models are downloaded on first run and cached in `/local/models`
+- **GermaLemma integration**: Enhanced lemmatization for German (optional)
+- **Morphological features**: Extracts and formats morphological features in CoNLL-U format
+- **Dependency parsing**: Optional dependency relations (HEAD/DEPREL columns)
+- **Flexible configuration**: Environment variables for batch size, chunk size, timeouts, etc.
+
+## Installation
+
+### From source
+
+```shell
+git clone https://github.com/KorAP/conllu-spacy-tagger-docker.git
+cd conllu-spacy-tagger-docker
+docker build -t korap/conllu-spacy .
+```
+
+## Usage
+
+### Basic usage
+
+```shell
+# Default: German model with dependency parsing and GermaLemma
+docker run --rm -i korap/conllu-spacy < input.conllu > output.conllu
+```
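+
+The container expects pre-tokenized CoNLL-U on stdin: ten tab-separated columns per token, a blank line after each sentence, and `#` comment lines, which are passed through to the output. In practice only the FORM column needs real values on input; the remaining columns may be left as `_`:
+
+```
+1	Das	_	_	_	_	_	_	_	_
+2	ist	_	_	_	_	_	_	_	_
+3	ein	_	_	_	_	_	_	_	_
+4	Test	_	_	_	_	_	_	_	_
+5	.	_	_	_	_	_	_	_	_
+```
+
+The tagger fills in the LEMMA, UPOS, XPOS and FEATS columns, plus HEAD and DEPREL when dependency parsing is enabled. The output below is purely illustrative; the actual tags, lemmas and dependency labels depend on the model and its version:
+
+```
+1	Das	der	PRON	PDS	Case=Nom|Gender=Neut|Number=Sing	4	sb	_	_
+2	ist	sein	AUX	VAFIN	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	ROOT	_	_
+3	ein	ein	DET	ART	Case=Nom|Gender=Masc|Number=Sing	4	nk	_	_
+4	Test	Test	NOUN	NN	Case=Nom|Gender=Masc|Number=Sing	2	pd	_	_
+5	.	.	PUNCT	$.	_	2	punct	_	_
+```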
+
+### Faster processing without dependency parsing
+
+```shell
+# Disable dependency parsing for faster processing
+docker run --rm -i korap/conllu-spacy -d < input.conllu > output.conllu
+```
+
+### Using a different spaCy model
+
+```shell
+# Use a different model (will be downloaded if not available)
+docker run --rm -i korap/conllu-spacy -m de_core_news_sm < input.conllu > output.conllu
+```
+
+### Persisting Models
+
+To avoid downloading the language model on every run, mount a local directory to `/local/models`:
+
+```shell
+docker run --rm -i -v /path/to/local/models:/local/models korap/conllu-spacy < input.conllu > output.conllu
+```
+
+The first run will download the model to `/path/to/local/models/`, and subsequent runs will reuse it.
+
+### Preloading Models
+
+There are several ways to preload models before running the container:
+
+#### Option 1: Using the preload script (recommended)
+
+```shell
+# Preload the default model (de_core_news_lg)
+./preload-models.sh
+
+# Preload a specific model
+./preload-models.sh de_core_news_sm
+
+# Preload to a custom directory
+./preload-models.sh de_core_news_lg /path/to/models
+
+# Then run with the preloaded models
+docker run --rm -i -v ./models:/local/models korap/conllu-spacy < input.conllu
+```
+
+#### Option 2: Build image with models included
+
+```shell
+# Build an image with models pre-installed
+docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
+
+# Run without needing to mount volumes
+docker run --rm -i korap/conllu-spacy:with-models < input.conllu > output.conllu
+```
+
+Edit `Dockerfile.with-models` to include additional models (sm, md) by uncommenting the relevant lines.
+
+#### Option 3: Manual download
+
+```shell
+# Create models directory
+mkdir -p ./models
+
+# Download using a temporary container
+docker run --rm -v ./models:/models python:3.12-slim bash -c "
+ pip install -q spacy &&
+ python -m spacy download de_core_news_lg &&
+ python -c 'import spacy, shutil, site;
+ shutil.copytree(site.getsitepackages()[0] + \"/de_core_news_lg\", \"/models/de_core_news_lg\")'
+"
+
+# Use the preloaded model
+docker run --rm -i -v ./models:/local/models korap/conllu-spacy < input.conllu
+```
+
+### Running with korapxmltool
+
+`korapxmltool`, which includes `korapxml2conllu` as a shortcut, can be downloaded from [https://github.com/KorAP/korapxmltool](https://github.com/KorAP/korapxmltool).
+
+```shell
+korapxml2conllu goe.zip | docker run --rm -i korap/conllu-spacy
+```
+
+#### Generate a spaCy-tagged KorAP XML zip directly
+
+```shell
+korapxmltool -A "docker run --rm -i korap/conllu-spacy" -t zip goe.zip
+```
+
+### Command-line Options
+
+```
+Usage: docker run --rm -i korap/conllu-spacy [OPTIONS]
+
+Options:
+ -h Display help message
+ -m MODEL Specify spaCy model (default: de_core_news_lg)
+ -L List available/installed models
+ -d Disable dependency parsing (faster processing)
+ -g Disable GermaLemma (use spaCy lemmatizer only)
+```
+
+### Environment Variables
+
+You can customize processing behavior with environment variables:
+
+```shell
+docker run --rm -i \
+ -e SPACY_USE_DEPENDENCIES="False" \
+ -e SPACY_USE_GERMALEMMA="True" \
+ -e SPACY_CHUNK_SIZE="10000" \
+ -e SPACY_BATCH_SIZE="1000" \
+ -e SPACY_N_PROCESS="1" \
+ -e SPACY_PARSE_TIMEOUT="30" \
+ -e SPACY_MAX_SENTENCE_LENGTH="500" \
+ korap/conllu-spacy < input.conllu > output.conllu
+```
+
+**Available environment variables:**
+
+- `SPACY_USE_DEPENDENCIES`: Enable/disable dependency parsing (default: "True")
+- `SPACY_USE_GERMALEMMA`: Enable/disable GermaLemma (default: "True")
+- `SPACY_CHUNK_SIZE`: Number of sentences to process per chunk (default: 20000)
+- `SPACY_BATCH_SIZE`: Batch size for spaCy processing (default: 2000)
+- `SPACY_N_PROCESS`: Number of processes (default: 10)
+- `SPACY_PARSE_TIMEOUT`: Timeout for dependency parsing per sentence in seconds (default: 30)
+- `SPACY_MAX_SENTENCE_LENGTH`: Maximum sentence length for dependency parsing in tokens (default: 500)
+
+### Examples
+
+```shell
+# Fast processing: disable dependency parsing
+docker run --rm -i korap/conllu-spacy -d < input.conllu > output.conllu
+
+# Use spaCy lemmatizer only (without GermaLemma)
+docker run --rm -i korap/conllu-spacy -g < input.conllu > output.conllu
+
+# Smaller model for faster download
+docker run --rm -i korap/conllu-spacy -m de_core_news_sm < input.conllu > output.conllu
+
+# Persistent model storage
+docker run --rm -i -v ./models:/local/models korap/conllu-spacy < input.conllu > output.conllu
+```
+
+### Miscellaneous commands
+
+List installed models:
+
+```shell
+docker run --rm -i korap/conllu-spacy -L
+```
+
+Open a shell within the container:
+
+```shell
+docker run --rm -it --entrypoint /bin/bash korap/conllu-spacy
+```
+
+## Supported Models
+
+Any spaCy model can be specified with the `-m` option. Models will be downloaded automatically on first use.
+
+Common German models:
+- `de_core_news_lg` (default, 560MB) - Large German model
+- `de_core_news_md` (100MB) - Medium German model
+- `de_core_news_sm` (15MB) - Small German model
+
+See [spaCy Models](https://spacy.io/models) for a complete list.
+
+## Performance
+
+From the sota-pos-lemmatizers benchmarks on the TIGER corpus (50,472 sentences):
+
+| Configuration | Lemma Acc | POS Acc | POS F1 | sents/sec |
+|--------------------------------|-----------|---------|--------|-----------|
+| spaCy + GermaLemma             | **90.98** | **99.07** | **95.84** | **1,230** |
+| spaCy (without GermaLemma) | 85.33 | 99.07 | 95.84 | 1,577 |
+
+**Note**: Disabling dependency parsing (`-d` flag) significantly improves processing speed while maintaining POS tagging and lemmatization quality.
+
+## Architecture
+
+The project consists of the following components (a usage sketch follows the list):
+
+- **Dockerfile**: Multi-stage build for optimized image size
+- **docker-entrypoint.sh**: Entry point script that handles model fetching and CLI argument parsing
+- **systems/parse_spacy_pipe.py**: Main spaCy processing pipeline
+- **lib/CoNLL_Annotation.py**: CoNLL-U format parsing and token classes
+- **my_utils/file_utils.py**: File handling utilities for chunked processing
+
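+The sketch below shows how these pieces fit together when run directly (outside Docker). It is a minimal, illustrative example rather than the full pipeline: it assumes the repository root is on `PYTHONPATH` and that `de_core_news_lg` is installed, uses an arbitrary chunk size, and omits GermaLemma, dependency output, the whitespace tokenizer and the chunking loop used by `parse_spacy_pipe.py`:
+
+```python
+import sys
+
+import spacy
+
+from lib.CoNLL_Annotation import get_token_type
+import my_utils.file_utils as fu
+
+nlp = spacy.load("de_core_news_lg", disable=["ner", "parser"])  # assumes the model is installed
+
+# Read up to 1000 annotated sentences from stdin ("#" marks comment/metadata lines)
+annos, has_next = fu.get_file_annos_chunk(
+    sys.stdin, chunk_size=1000,
+    token_class=get_token_type("CoNLLUP_Token"), comment_str="#")
+
+for anno in annos:
+    # parse_spacy_pipe.py swaps in a whitespace tokenizer to preserve the input tokenization
+    doc = nlp(anno.get_sentence())
+    if anno.metadata:
+        print("\n".join(anno.metadata))
+    for i, tok in enumerate(doc, start=1):
+        # The real pipeline emits all ten CoNLL-U columns; this prints a reduced set
+        print(f"{i}\t{tok.text}\t{tok.lemma_}\t{tok.pos_}\t{tok.tag_}")
+    print()
+```
+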
+## Credits
+
+Based on the [sota-pos-lemmatizers](https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/sota-pos-lemmatizers) evaluation project, originally by [José Angel Daza](https://github.com/angel-daza) and [Marc Kupietz](https://github.com/kupietz), with contributions by [Rebecca Wilm](https://github.com/rebecca-wilm). It follows the pattern established by [conllu-treetagger-docker](https://github.com/KorAP/conllu-treetagger-docker).
+
+- **spaCy**: [https://spacy.io/](https://spacy.io/)
+- **GermaLemma**: [https://github.com/WZBSocialScienceCenter/germalemma](https://github.com/WZBSocialScienceCenter/germalemma)
+
+## License
+
+This project is licensed under the BSD 2-Clause License (see [LICENSE](LICENSE)). It includes components with their own licenses:
+- spaCy: MIT License
+- GermaLemma: Apache License 2.0
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 0000000..1ada4a7
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+
+set -o pipefail
+
+# Default values
+model="de_core_news_lg"
+use_dependencies="True"
+use_germalemma="True"
+
+usage() {
+ echo "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]"
+ echo " -h Display this help message"
+ echo " -m MODEL Specify spaCy model (default: $model)"
+ echo " -L List available/installed models"
+ echo " -d Disable dependency parsing (faster processing)"
+ echo " -g Disable GermaLemma (use spaCy lemmatizer only)"
+ exit 1
+}
+
+# Parse command line options
+while getopts "hm:Ldg" opt; do
+ case $opt in
+ h)
+ usage
+ ;;
+ m)
+ model="$OPTARG"
+ ;;
+ L)
+ python -m spacy info 2>/dev/null || echo "No models installed"
+ exit 0
+ ;;
+ d)
+ use_dependencies="False"
+ ;;
+ g)
+ use_germalemma="False"
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG" >&2
+ usage
+ ;;
+ :)
+ echo "Option -$OPTARG requires an argument" >&2
+ usage
+ ;;
+ esac
+done
+
+if [ $OPTIND -le $# ]; then
+ usage
+fi
+
+MODEL_DIR="/local/models"
+MODEL_PATH="$MODEL_DIR/$model"
+
+# Ensure MODEL_DIR exists
+mkdir -p "$MODEL_DIR"
+
+# Function to check if model is installed and usable
+is_model_installed() {
+ local model_name="$1"
+ # Check if model is installed in the venv
+ python -c "import spacy; spacy.load('$model_name')" 2>/dev/null
+ return $?
+}
+
+# Function to check if preloaded model exists and is valid
+has_preloaded_model() {
+ local model_path="$1"
+ # Check for config.cfg which indicates a valid spaCy model
+ if [ -f "$model_path/config.cfg" ]; then
+ return 0
+ fi
+ return 1
+}
+
+# Function to install model
+install_model() {
+ local model_name="$1"
+
+ # Check if model exists in /local/models - if so, we'll use absolute path
+ if has_preloaded_model "$MODEL_PATH"; then
+ echo "Found preloaded model in $MODEL_PATH" >&2
+ echo "Will use absolute path to avoid download" >&2
+ return 0
+ fi
+
+ # Check if already installed in venv
+ if is_model_installed "$model_name"; then
+ echo "Model $model_name already installed in venv" >&2
+ return 0
+ fi
+
+ # Try to download model to /local/models if writable
+ if [ -w "$MODEL_DIR" ]; then
+ # Download and install to /local/models with progress
+ if python /app/download_with_progress.py "$model_name" 2>&1 | tee /tmp/spacy_download.log >&2; then
+ # Try to move the installed model to /local/models for persistence
+ SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
+ INSTALLED_MODEL="$SITE_PACKAGES/$model_name"
+
+ if [ -d "$INSTALLED_MODEL" ]; then
+ echo "Moving model to $MODEL_PATH for persistence..." >&2
+ mv "$INSTALLED_MODEL" "$MODEL_PATH" 2>/dev/null || true
+ # Create symlink back
+ ln -sf "$MODEL_PATH" "$INSTALLED_MODEL" 2>/dev/null || true
+ echo "Model saved to $MODEL_PATH" >&2
+ fi
+ return 0
+ else
+ echo "Failed to download model $model_name" >&2
+ return 1
+ fi
+ else
+ # MODEL_DIR not writable, install to venv (ephemeral)
+ echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
+ if python /app/download_with_progress.py "$model_name" 2>&1 | tee /tmp/spacy_download.log >&2; then
+ return 0
+ else
+ echo "Failed to download model $model_name" >&2
+ return 1
+ fi
+ fi
+}
+
+# Install or verify model
+if ! install_model "$model"; then
+ echo "ERROR: Could not install model $model, aborting." >&2
+ exit 1
+fi
+
+# Determine which model path to use
+# If preloaded model exists, use absolute path; otherwise use model name
+if has_preloaded_model "$MODEL_PATH"; then
+ MODEL_TO_USE="$MODEL_PATH"
+ echo "Using preloaded model at: $MODEL_TO_USE" >&2
+else
+ MODEL_TO_USE="$model"
+ echo "Using installed model: $MODEL_TO_USE" >&2
+fi
+
+# Set environment variables for the Python script
+export SPACY_USE_DEPENDENCIES="$use_dependencies"
+export SPACY_USE_GERMALEMMA="$use_germalemma"
+
+# Log configuration
+echo "Configuration:" >&2
+echo " Model: $MODEL_TO_USE" >&2
+echo " Use dependencies: $use_dependencies" >&2
+echo " Use GermaLemma: $use_germalemma" >&2
+
+# Run the spaCy tagging pipeline
+python /app/systems/parse_spacy_pipe.py \
+ --spacy_model "$MODEL_TO_USE" \
+ --corpus_name "stdin" \
+ --gld_token_type "CoNLLUP_Token" \
+ --comment_str "#"
diff --git a/download_with_progress.py b/download_with_progress.py
new file mode 100755
index 0000000..a907fe5
--- /dev/null
+++ b/download_with_progress.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""
+Download spaCy model with progress bar
+"""
+import sys
+import subprocess
+import re
+
+def main():
+ if len(sys.argv) < 2:
+ print("Usage: download_with_progress.py MODEL_NAME")
+ sys.exit(1)
+
+ model_name = sys.argv[1]
+
+ print(f"Downloading {model_name}...", file=sys.stderr)
+ print("This may take several minutes for large models (de_core_news_lg is ~560MB)", file=sys.stderr)
+ print("", file=sys.stderr)
+
+ # Run spacy download with unbuffered output
+ process = subprocess.Popen(
+ [sys.executable, "-u", "-m", "spacy", "download", model_name, "--no-cache-dir"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ universal_newlines=True,
+ bufsize=1
+ )
+
+ download_started = False
+
+ for line in iter(process.stdout.readline, ''):
+ if not line:
+ break
+
+ # Print the line
+ print(line.rstrip(), file=sys.stderr)
+
+ # Detect download progress
+ if 'Downloading' in line and not download_started:
+ download_started = True
+ print("Download in progress...", file=sys.stderr)
+
+ # Look for percentage or size indicators
+ if '%' in line or 'MB' in line or 'KB' in line:
+ # Extract and show progress
+ match = re.search(r'(\d+)%', line)
+ if match:
+ percent = match.group(1)
+ bar_length = 40
+ filled = int(bar_length * int(percent) / 100)
+ bar = '█' * filled + '░' * (bar_length - filled)
+ print(f"\rProgress: [{bar}] {percent}%", end='', file=sys.stderr)
+
+ process.stdout.close()
+ return_code = process.wait()
+
+ if return_code != 0:
+ print(f"\nError: Download failed with code {return_code}", file=sys.stderr)
+ sys.exit(return_code)
+
+ print("\n✓ Download complete!", file=sys.stderr)
+ return 0
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/lib/CoNLL_Annotation.py b/lib/CoNLL_Annotation.py
new file mode 100644
index 0000000..1a8ddf0
--- /dev/null
+++ b/lib/CoNLL_Annotation.py
@@ -0,0 +1,220 @@
+from collections import defaultdict, OrderedDict
+import re
+
+# CoNLL-U Format - https://universaldependencies.org/format.html
+
+
+def get_token_type(type_str):
+ if type_str =="CoNLL09_Token":
+ return CoNLL09_Token
+ elif type_str == "RNNTagger_Token":
+ return RNNTagger_Token
+ elif type_str == "CoNLLUP_Token":
+ return CoNLLUP_Token
+ elif type_str == "TigerNew_Token":
+ return TigerNew_Token
+ else:
+ raise NotImplementedError(f"I don't know what to do with {type_str} token type!")
+
+
+class TigerNew_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split() # [FORM, XPOS]
+ self.info = info
+ self.id = word_ix + 1 # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[0]
+ self.lemma = "_"
+ self.pos_universal = "_"
+ self.pos_tag = info[1]
+ self.detail_tag = "_"
+ self.head = "_"
+ self.dep_tag = "_"
+ self.blank = "_"
+ self.auto_score = "_"
+
+ def get_info(self):
+ return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+ str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+ def get_conllU_line(self, separator="\t"):
+ info = self.get_info()
+ return separator.join(info)
+
+
+class RNNTagger_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split() # [FORM, XPOS.FEATS, LEMMA]
+ self.info = info
+ self.id = word_ix + 1 # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[0]
+ self.lemma = info[2]
+ self.pos_universal = "_"
+ self.pos_tag, self.detail_tag = self._process_tag(info[1]) # 'NN.Gen.Sg.Fem'
+ self.head = "_"
+ self.dep_tag = "_"
+ self.blank = "_"
+ self.auto_score = "_"
+
+ def _process_tag(self, tag):
+ if tag == "_" or "." not in tag: return tag, "_"
+ info = tag.split(".")
+ return info[0], "|".join(info[1:])
+
+ def get_info(self):
+ return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+ str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+ def get_conllU_line(self, separator="\t"):
+ info = self.get_info()
+ return separator.join(info)
+
+
+class CoNLLUP_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split()
+ # print(info)
+ # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+ # [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
+ self.info = info
+ self.id = info[0] # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[1]
+ self.lemma = info[2]
+ self.pos_universal = info[3]
+ self.pos_tag = self._process_tag(info[4]) # 'XPOS=NE|Case=Nom|Gender=Masc|Number=Sing' TODO: Reuse MorphInfo in the self.detail_tag
+ self.detail_tag = info[5]
+ self.head = info[6]
+ self.dep_tag = info[7]
+ self.blank = info[8] # ???
+ self.auto_score = info[9]
+
+ def _process_tag(self, tag):
+ if tag == "_" or "|" not in tag: return tag # The XPOS=NE|Case=Nom... is only for Turku!
+ info = tag.split("|")
+ info = [x.split("=") for x in info]
+ return info[0][1]
+
+ def get_info(self):
+ return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+ str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+ def get_conllU_line(self, separator="\t"):
+ info = self.get_info()
+ return separator.join(info)
+
+
+
+class CoNLL09_Token():
+ def __init__(self, raw_line, word_ix):
+ info = raw_line.split()
+ # print(info)
+ # # ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5', 'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
+ self.info = info
+ self.id = info[0] # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
+ self.word = info[1]
+ self.lemma = info[2]
+ self.pos_universal = "_" # _convert_to_universal(self.pos_tag, self.lemma)
+ self.pos_tag = info[4]
+ self.head = info[8]
+ self.dep_tag = info[10]
+ self.detail_tag = "_"
+ self.is_pred = True if info[12] == "Y" else False
+ if self.is_pred:
+ self.pred_sense = info[13].strip("[]")
+ self.pred_sense_id = str(self.position) + "##" + self.pred_sense
+ else:
+ self.pred_sense = None
+ self.pred_sense_id = ""
+ if len(info) > 14:
+ self.labels = info[14:]
+ else:
+ self.labels = []
+
+ def get_conllU_line(self, separator="\t"):
+ # We want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+ tok_id = str(self.id) #.split("_")[0]
+ conllUinfo = [tok_id, self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag, self.head, self.dep_tag, "_", "_"]
+ return separator.join(conllUinfo)
+
+ def get_conll09_line(self, delim="\t"):
+ # We want:
+ # 1 Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
+ # 10 fall fall fall VB VB _ _ 8 8 VC VC Y fall.01 _ _ _ _ _
+ is_pred_str = "Y" if self.is_pred else "_"
+ sense_str = self.pred_sense if self.is_pred else "_"
+ info = [self.id, self.word, self.lemma, self.lemma, self.pos_tag, self.pos_tag, "_", self.detail_tag,
+ self.head, self.head, self.dep_tag, self.dep_tag, is_pred_str, sense_str] + self.labels
+ return delim.join(info)
+
+
+
+################################# GETTING SENTENCE ANNOTATIONS ####################################
+class AnnotatedSentence():
+ def __init__(self):
+ self.metadata = []
+ self.tokens = []
+
+ def get_words(self):
+ return [tok.word for tok in self.tokens]
+
+ def get_sentence(self):
+ return " ".join([tok.word for tok in self.tokens])
+
+ def get_pos_tags(self, universal=False):
+ if universal:
+ return [tok.pos_universal for tok in self.tokens]
+ else:
+ return [tok.pos_tag for tok in self.tokens]
+
+
+def get_annotation(raw_lines, raw_meta, token_class):
+ ann = AnnotatedSentence()
+ ann.metadata = [m.strip("\n") for m in raw_meta]
+ # Annotate the predicates and senses
+ real_index = 0
+ for i, line in enumerate(raw_lines):
+ tok = token_class(line, real_index)
+ ann.tokens.append(tok)
+ real_index += 1
+ return ann
+
+
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:", our_foundry="spacy"):
+ n_sents = 0
+ annotated_sentences, buffer_meta, buffer_lst = [], [], []
+ for i, line in enumerate(line_generator):
+ if line.startswith(comment_str):
+ line = re.sub(r'(foundry\s*=\s*).*', r"\1" + our_foundry, line)
+ line = re.sub(r'(filename\s*=\s* .[^/]*/[^/]+/[^/]+/).*', r"\1" + our_foundry + "/morpho.xml", line)
+ buffer_meta.append(line)
+ continue
+ if len(line.split()) > 0:
+ buffer_lst.append(line)
+ else:
+ ann = get_annotation(buffer_lst, buffer_meta, token_class)
+ n_sents += 1
+ buffer_lst, buffer_meta = [], []
+ annotated_sentences.append(ann)
+ if chunk_size > 0 and n_sents == chunk_size: break
+ # logger.info("Read {} Sentences!".format(n_sents))
+ return annotated_sentences, n_sents
+
+
+def read_conll_generator(filepath, token_class=CoNLLUP_Token, sent_sep=None, comment_str="###C:"):
+ buffer_meta, buffer_lst = [], []
+ sentence_finished = False
+ with open(filepath) as f:
+ for i, line in enumerate(f.readlines()):
+ if sent_sep and sent_sep in line: sentence_finished = True
+ if line.startswith(comment_str):
+ continue
+ if len(line.split()) > 0 and not sentence_finished:
+ buffer_lst.append(line)
+ else:
+ ann = get_annotation(buffer_lst, buffer_meta, token_class)
+ buffer_lst, buffer_meta = [], []
+ sentence_finished = False
+ yield ann
\ No newline at end of file
diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/lib/__init__.py
diff --git a/my_utils/__init__.py b/my_utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/my_utils/__init__.py
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
new file mode 100644
index 0000000..c5eb8ed
--- /dev/null
+++ b/my_utils/file_utils.py
@@ -0,0 +1,124 @@
+import requests, logging, json
+import subprocess, time
+import glob
+import os.path, sys
+from lib.CoNLL_Annotation import read_conll, read_conll_generator
+
+logger = logging.getLogger(__name__)
+
+
+def list_to_file(my_list, out_path):
+ with open(out_path, "w") as out:
+ for item_str in my_list:
+ out.write(f"{item_str}\n")
+
+def counter_to_file(my_counter, out_path):
+ with open(out_path, "w") as out:
+ for item, count in my_counter:
+ item_str = "\t".join(item)
+ out.write(f"{item_str}\t{count}\n")
+
+def dict_to_file(my_dict, out_path):
+ with open(out_path, "w", encoding='utf8') as out:
+ json.dump(my_dict, fp=out, ensure_ascii=False)
+
+
+def file_to_dict(file_path):
+ d = {}
+ with open(file_path) as f:
+ d = json.load(f)
+ return d
+
+
+def write_conll_file(conll_objs, out_path):
+ with open(out_path, "w", encoding='utf8') as out:
+ for obj in conll_objs:
+ for tok in obj.tokens:
+ out.write(tok.get_conllU_line()+"\n")
+ out.write("\n")
+
+def file_generator(file_path):
+ with open(file_path, "r") as data_file:
+ logger.info("Reading instances from lines in file at: %s", file_path)
+ for line in data_file:
+ if not line: continue
+ yield line
+
+
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:", our_foundry="spacy"):
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str, our_foundry=our_foundry)
+ if n_sents == 0: file_has_next = False
+ sents, gld, meta = [], [], []
+ return chunk, file_has_next
+
+
+def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+ """ Same as get_file_annos_chunk but directly get (text, labels) pairs"""
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents == 0: file_has_next = False
+ sents, gld, meta = [], [], []
+ for anno in chunk:
+ if len(anno.metadata) > 0: meta.append("\n".join(anno.metadata))
+ sents.append(anno.get_sentence())
+ gld.append(anno.get_pos_tags())
+ return sents, gld, file_has_next
+
+
+def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents < chunk_size: file_has_next = False
+ raw_text = ""
+ for anno in chunk:
+ if len(anno.metadata) > 0:
+ raw_text += "\n".join(anno.metadata) + "\n"
+ else:
+ raw_text += "\n"
+ for tok in anno.tokens:
+ raw_text += tok.get_conllU_line() + "\n"
+ raw_text += "\n"
+ return raw_text, file_has_next, n_sents
+
+
+def turku_parse_file(raw_text, filename, chunk_ix):
+ out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
+ # For each file make a request to obtain the parse back
+ logger.info(f"Sending Request {chunk_ix} to Parser Server...")
+ response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
+ response_to_file(response.text, out_file_str)
+
+
+
+def response_to_file(response_str, fname):
+ fout = open(fname, "w")
+ fout.write(response_str)
+ fout.close()
+
+
+def expand_file(f, substitute_comment=False):
+ # Expand the .gz file
+ fname = f[:-3]
+ if not os.path.isfile(fname):
+ p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
+ if p == 0:
+ logger.info("Successfully uncompressed file")
+ else:
+ logger.info(f"Couldn't expand file {f}")
+ raise Exception
+ else:
+ logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+ # Substitute the Commentary Lines on the Expanded file
+ if substitute_comment:
+ fixed_filename = f"{fname}.fixed"
+ p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w")) # stdout=subprocess.PIPE
+ if p == 0:
+ logger.info("Successfully fixed comments on file")
+ else:
+ logger.info(f"Something went wrong when substituting commentaries")
+ raise Exception
+ return fixed_filename
+ else:
+ return fname
diff --git a/preload-models.sh b/preload-models.sh
new file mode 100755
index 0000000..ad770f8
--- /dev/null
+++ b/preload-models.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Script to preload spaCy models to a local directory
+# Usage: ./preload-models.sh [MODEL_NAME] [TARGET_DIR]
+
+set -e
+
+MODEL_NAME="${1:-de_core_news_lg}"
+TARGET_DIR="${2:-./models}"
+
+echo "Preloading spaCy model: $MODEL_NAME"
+echo "Target directory: $TARGET_DIR"
+
+# Create target directory if it doesn't exist
+mkdir -p "$TARGET_DIR"
+
+# Check if model already exists
+if [ -d "$TARGET_DIR/$MODEL_NAME" ]; then
+ echo "Model $MODEL_NAME already exists in $TARGET_DIR"
+ echo "Remove it first if you want to re-download: rm -rf $TARGET_DIR/$MODEL_NAME"
+ exit 0
+fi
+
+echo "Downloading model using temporary Docker container..."
+
+# Use a temporary container to download the model
+docker run --rm -v "$(realpath "$TARGET_DIR")":/models python:3.12-slim-bookworm bash -c "
+ set -e
+ echo 'Installing spaCy...'
+ pip install -q spacy
+
+ echo 'Downloading model $MODEL_NAME...'
+ echo 'This may take several minutes depending on your connection speed.'
+ python -m spacy download $MODEL_NAME --no-cache-dir 2>&1 | while IFS= read -r line; do
+ echo \"\$line\"
+ # Show progress dots for download
+ if [[ \"\$line\" == *\"Downloading\"* ]]; then
+ echo -n \"Progress: \"
+ fi
+ done
+
+ echo 'Moving model to /models...'
+ python -c \"
+import spacy
+import shutil
+import site
+import os
+
+# Get the installed model path
+site_packages = site.getsitepackages()[0]
+model_path = site_packages + '/$MODEL_NAME'
+
+# spaCy packages contain a subdirectory with the versioned model
+# Find the actual model directory (e.g., de_core_news_lg-3.8.0)
+items = os.listdir(model_path)
+model_subdir = None
+for item in items:
+ item_path = os.path.join(model_path, item)
+ if os.path.isdir(item_path) and '$MODEL_NAME' in item:
+ model_subdir = item_path
+ break
+
+if model_subdir:
+ # Copy the actual model directory
+ shutil.copytree(model_subdir, '/models/$MODEL_NAME')
+ print(f'Model copied successfully from {model_subdir}!')
+else:
+ # Fallback: copy the whole package
+ shutil.copytree(model_path, '/models/$MODEL_NAME')
+ print('Model copied successfully!')
+\"
+"
+
+if [ -d "$TARGET_DIR/$MODEL_NAME" ]; then
+ echo ""
+ echo "✓ Model $MODEL_NAME successfully preloaded to $TARGET_DIR/$MODEL_NAME"
+ echo ""
+ echo "You can now run the container with:"
+ echo " docker run --rm -i -v $(realpath $TARGET_DIR):/local/models korap/conllu-spacy"
+else
+ echo "✗ Error: Model download failed"
+ exit 1
+fi
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..46c867c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+pip
+wheel
+thinc
+spacy
+germalemma
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
new file mode 100644
index 0000000..0abfb33
--- /dev/null
+++ b/systems/parse_spacy_pipe.py
@@ -0,0 +1,327 @@
+from sys import stdin
+import argparse, os
+import spacy
+from spacy.tokens import Doc
+import logging, sys, time, signal
+from lib.CoNLL_Annotation import get_token_type
+import my_utils.file_utils as fu
+from germalemma import GermaLemma
+
+# Dependency parsing safety limits
+DEFAULT_PARSE_TIMEOUT = 0.5 # seconds per sentence
+DEFAULT_MAX_SENTENCE_LENGTH = 500 # tokens
+
+class TimeoutException(Exception):
+ pass
+
+def timeout_handler(signum, frame):
+ raise TimeoutException("Dependency parsing timeout")
+
+def safe_dependency_parse(spacy_model, text, timeout=DEFAULT_PARSE_TIMEOUT, max_length=DEFAULT_MAX_SENTENCE_LENGTH):
+ """
+ Safely parse a sentence with timeout and length limits.
+
+ Args:
+ spacy_model: Loaded spaCy model
+ text: Text to parse
+ timeout: Maximum seconds to wait for parsing
+ max_length: Maximum sentence length in tokens
+
+ Returns:
+ tuple: (spacy_doc, success, warning_message)
+ """
+ # Check sentence length
+ if len(text.split()) > max_length:
+ # Process without dependency parsing for long sentences
+ disabled_components = ["ner", "parser"]
+ doc = spacy_model(text, disable=disabled_components)
+ return doc, False, f"Sentence too long ({len(text.split())} tokens > {max_length}), dependency parsing skipped"
+
+ # Set up timeout
+ old_handler = signal.signal(signal.SIGALRM, timeout_handler)
+ signal.setitimer(signal.ITIMER_REAL, timeout)
+
+ try:
+ doc = spacy_model(text)
+ signal.setitimer(signal.ITIMER_REAL, 0) # Cancel alarm
+ signal.signal(signal.SIGALRM, old_handler)
+ return doc, True, None
+ except TimeoutException:
+ signal.setitimer(signal.ITIMER_REAL, 0) # Cancel alarm
+ signal.signal(signal.SIGALRM, old_handler)
+ # Retry without dependency parsing
+ disabled_components = ["ner", "parser"]
+ doc = spacy_model(text, disable=disabled_components)
+ return doc, False, f"Dependency parsing timeout after {timeout}s, processed without dependencies"
+ except Exception as e:
+ signal.setitimer(signal.ITIMER_REAL, 0) # Cancel alarm
+ signal.signal(signal.SIGALRM, old_handler)
+ # Retry without dependency parsing
+ disabled_components = ["ner", "parser"]
+ doc = spacy_model(text, disable=disabled_components)
+ return doc, False, f"Dependency parsing error: {str(e)}, processed without dependencies"
+
+def format_morphological_features(token):
+ """
+ Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+ Args:
+ token: spaCy token object
+
+ Returns:
+ str: Formatted morphological features string for CoNLL-U 5th column
+ Returns "_" if no features are available
+ """
+ if not hasattr(token, 'morph') or not token.morph:
+ return "_"
+
+ morph_dict = token.morph.to_dict()
+ if not morph_dict:
+ return "_"
+
+ # Format as CoNLL-U format: Feature=Value|Feature2=Value2
+ features = []
+ for feature, value in sorted(morph_dict.items()):
+ features.append(f"{feature}={value}")
+
+ return "|".join(features)
+
+
+def format_dependency_relations(doc):
+ """
+ Extract and format dependency relations from a spaCy doc for CoNLL-U output.
+
+ Args:
+ doc: spaCy Doc object
+
+ Returns:
+ list: List of tuples (head_id, deprel) for each token
+ """
+ dependencies = []
+ for i, token in enumerate(doc):
+ # HEAD column: 1-based index of the head token (0 for root)
+ if token.dep_ == "ROOT":
+ head_id = 0
+ else:
+ # Find the 1-based index of the head token
+ head_id = None
+ for j, potential_head in enumerate(doc):
+ if potential_head == token.head:
+ head_id = j + 1
+ break
+ if head_id is None:
+ head_id = 0 # Fallback to root if head not found
+
+ # DEPREL column: dependency relation
+ deprel = token.dep_ if token.dep_ else "_"
+
+ dependencies.append((head_id, deprel))
+
+ return dependencies
+
+
+class WhitespaceTokenizer(object):
+ def __init__(self, vocab):
+ self.vocab = vocab
+
+ def __call__(self, text):
+ words = text.split(' ')
+ # Filter out empty strings to avoid spaCy errors
+ words = [w for w in words if w]
+ # Handle edge case of empty input - use a placeholder token
+ if not words:
+ words = ['_EMPTY_']
+ # All tokens 'own' a subsequent space character in this tokenizer
+ spaces = [True] * len(words)
+ return Doc(self.vocab, words=words, spaces=spaces)
+
+
+def get_conll_str(anno_obj, spacy_doc, use_germalemma, use_dependencies):
+ # First lines are comments. (metadata)
+ conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+
+ # Get dependency relations if enabled
+ dependencies = format_dependency_relations(spacy_doc) if use_dependencies == "True" else None
+
+ for ix, token in enumerate(spacy_doc):
+ morph_features = format_morphological_features(token)
+
+ # Get HEAD and DEPREL columns
+ if dependencies:
+ head_id, deprel = dependencies[ix]
+ else:
+ head_id, deprel = "_", "_"
+
+ if use_germalemma == "True":
+ content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")
+ else:
+ content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_") # Pure SpaCy!
+ conll_lines.append("\t".join(content))
+ return "\n".join(conll_lines)
+
+
+def find_germalemma(word, pos, spacy_lemma):
+ simplify_pos = {"ADJA":"ADJ", "ADJD":"ADJ",
+ "NA":"N", "NE":"N", "NN":"N",
+ "ADV":"ADV", "PAV":"ADV", "PROAV":"ADV", "PAVREL":"ADV", "PWAV":"ADV", "PWAVREL":"ADV",
+ "VAFIN":"V", "VAIMP":"V", "VAINF":"V", "VAPP":"V", "VMFIN":"V", "VMINF":"V",
+ "VMPP":"V", "VVFIN":"V", "VVIMP":"V", "VVINF":"V", "VVIZU":"V","VVPP":"V"
+ }
+ # simplify_pos = {"VERB": "V", "ADV": "ADV", "ADJ": "ADJ", "NOUN":"N", "PROPN": "N"}
+ try:
+ return lemmatizer.find_lemma(word, simplify_pos.get(pos, "UNK"))
+ except:
+ return spacy_lemma
+
+
+if __name__ == "__main__":
+ """
+ --- Example Real Data TEST ---
+
+ cat /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/zca18.conllu | python systems/parse_spacy_pipe.py \
+ --corpus_name DeReKo_zca18 --comment_str "#" > output_zca18.conll
+ """
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
+ parser.add_argument("-sm", "--spacy_model", help="Spacy model containing the pipeline to tag", default="de_core_news_lg")
+ parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLLUP_Token")
+ parser.add_argument("-ugl", "--use_germalemma", help="Use Germalemma lemmatizer on top of SpaCy", default="True")
+ parser.add_argument("-udp", "--use_dependencies", help="Include dependency parsing (adds HEAD/DEPREL columns, set to False for faster processing)", default="True")
+ parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
+ args = parser.parse_args()
+
+ file_has_next, chunk_ix = True, 0
+ CHUNK_SIZE = int(os.getenv("SPACY_CHUNK_SIZE", "20000"))
+ SPACY_BATCH = int(os.getenv("SPACY_BATCH_SIZE", "2000"))
+ SPACY_PROC = int(os.getenv("SPACY_N_PROCESS", "1"))
+
+ # =====================================================================================
+ # LOGGING INFO ...
+ # =====================================================================================
+ logger = logging.getLogger(__name__)
+ console_hdlr = logging.StreamHandler(sys.stderr)
+ file_hdlr = logging.FileHandler(filename=f"logs/Parse_{args.corpus_name}.SpaCy.log")
+
+ # Custom format without module name
+ formatter = logging.Formatter('%(levelname)s: %(message)s')
+ console_hdlr.setFormatter(formatter)
+ file_hdlr.setFormatter(formatter)
+
+ logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
+
+ # Override with environment variables if set (useful for Docker)
+ import os
+ if os.getenv("SPACY_USE_DEPENDENCIES") is not None:
+ args.use_dependencies = os.getenv("SPACY_USE_DEPENDENCIES", "True")
+ logger.info(f"Using SPACY_USE_DEPENDENCIES environment variable: {args.use_dependencies}")
+
+ if os.getenv("SPACY_USE_GERMALEMMA") is not None:
+ args.use_germalemma = os.getenv("SPACY_USE_GERMALEMMA", "True")
+ logger.info(f"Using SPACY_USE_GERMALEMMA environment variable: {args.use_germalemma}")
+
+ logger.info(f"Chunking {args.corpus_name} Corpus in chunks of {CHUNK_SIZE} Sentences")
+ logger.info(f"Processing configuration: batch_size={SPACY_BATCH}, n_process={SPACY_PROC}")
+
+ # =====================================================================================
+ # POS TAG DOCUMENTS
+ # =====================================================================================
+ # Configure which components to disable based on dependency parsing option
+ disabled_components = ["ner"]
+ if args.use_dependencies != "True":
+ disabled_components.append("parser")
+ logger.info("Dependency parsing disabled for faster processing")
+ else:
+ logger.info("Dependency parsing enabled (slower but includes HEAD/DEPREL)")
+
+ spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
+ spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
+
+ # Increase max_length to handle very long sentences (especially when parser is disabled)
+ spacy_de.max_length = 10000000 # 10M characters
+
+ lemmatizer = GermaLemma()
+
+ # Log version information
+ logger.info(f"spaCy version: {spacy.__version__}")
+ logger.info(f"spaCy model: {args.spacy_model}")
+ logger.info(f"spaCy model version: {spacy_de.meta.get('version', 'unknown')}")
+ try:
+ import germalemma
+ logger.info(f"GermaLemma version: {germalemma.__version__}")
+ except AttributeError:
+ logger.info("GermaLemma version: unknown (no __version__ attribute)")
+
+ # Parse timeout and sentence length limits from environment variables
+ parse_timeout = float(os.getenv("SPACY_PARSE_TIMEOUT", str(DEFAULT_PARSE_TIMEOUT)))
+ max_sentence_length = int(os.getenv("SPACY_MAX_SENTENCE_LENGTH", str(DEFAULT_MAX_SENTENCE_LENGTH)))
+
+ logger.info(f"Dependency parsing limits: timeout={parse_timeout}s, max_length={max_sentence_length} tokens")
+
+ start = time.time()
+ total_processed_sents = 0
+ dependency_warnings = 0
+
+ while file_has_next:
+ annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
+ if len(annos) == 0: break
+ total_processed_sents += len(annos)
+
+ # Calculate progress statistics
+ elapsed_time = time.time() - start
+ sents_per_sec = total_processed_sents / elapsed_time if elapsed_time > 0 else 0
+ current_time = time.strftime("%Y-%m-%d %H:%M:%S")
+
+ logger.info(f"{current_time} | Processed: {total_processed_sents} sentences | Elapsed: {elapsed_time:.1f}s | Speed: {sents_per_sec:.1f} sents/sec")
+
+ sents = [a.get_sentence() for a in annos]
+
+ # Process sentences individually when dependency parsing is enabled for timeout protection
+ if args.use_dependencies == "True":
+ for ix, sent in enumerate(sents):
+ doc, dependency_success, warning = safe_dependency_parse(
+ spacy_de, sent, timeout=parse_timeout, max_length=max_sentence_length
+ )
+ if warning:
+ dependency_warnings += 1
+ logger.warning(f"Sentence {total_processed_sents - len(sents) + ix + 1}: {warning}")
+
+ # Override use_dependencies based on actual parsing success
+ actual_use_dependencies = "True" if dependency_success else "False"
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=actual_use_dependencies)
+ print(conll_str+ "\n")
+ else:
+ # Use batch processing for faster processing when dependencies are disabled
+ # Use n_process=1 to avoid multiprocessing deadlocks and memory issues with large files
+ try:
+ for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=1)):
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as e:
+ logger.error(f"Batch processing failed: {str(e)}")
+ logger.info("Falling back to individual sentence processing...")
+ # Fallback: process sentences individually
+ for ix, sent in enumerate(sents):
+ try:
+ doc = spacy_de(sent)
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as sent_error:
+ logger.error(f"Failed to process sentence {total_processed_sents - len(sents) + ix + 1}: {str(sent_error)}")
+ logger.error(f"Sentence preview: {sent[:100]}...")
+ # Output a placeholder to maintain alignment
+ conll_str = get_conll_str(annos[ix], spacy_de("ERROR"), use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+
+ end = time.time()
+ total_time = end - start
+ final_sents_per_sec = total_processed_sents / total_time if total_time > 0 else 0
+
+ logger.info(f"=== Processing Complete ===")
+ logger.info(f"Total sentences: {total_processed_sents}")
+ logger.info(f"Total time: {total_time:.2f}s")
+ logger.info(f"Average speed: {final_sents_per_sec:.1f} sents/sec")
+
+ if dependency_warnings > 0:
+ logger.info(f"Dependency parsing warnings: {dependency_warnings} sentences processed without dependencies")
+
\ No newline at end of file