Prepare release
Change-Id: Ie41a3ba57103ca99d00a512c1699077ad5633135
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..a7e0b7f
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,31 @@
+name: CI  # Smoke-tests the Docker image; nothing is pushed from this workflow
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test-docker-image:
+    name: Test Docker Image
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build Docker image
+        run: docker build -t korap/conllu-spacy:test .  # local-only tag reused by the steps below
+
+      - name: Test version display
+        run: docker run --rm korap/conllu-spacy:test -V
+
+      - name: Test model listing
+        run: docker run --rm korap/conllu-spacy:test -L
+
+      - name: Test help output
+        run: docker run --rm korap/conllu-spacy:test -h || true  # NOTE(review): "|| true" means this step can never fail - confirm -h exits non-zero by design
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..ddc676e
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,80 @@
+stages:  # Pipeline stages, listed in the order they run
+ - test  # test-docker-image
+ - build  # build-docker-image
+ - deploy  # deploy-to-docker-hub
+
+test-docker-image:  # Builds the image and smoke-tests the -V and -L flags
+  image: docker:latest
+  stage: test
+  variables:
+    FF_NETWORK_PER_BUILD: "true"  # GitLab Runner feature flag; quoted so it stays a string
+  services:
+    - name: docker:dind  # Docker-in-Docker daemon that the docker CLI in this job talks to
+      alias: docker
+      command: ["--dns=127.0.0.11", "--dns=8.8.8.8"]
+  before_script:
+    - apk update
+    - apk add --no-cache bash
+  script:
+    - docker build -t korap/conllu-spacy:test .  # local-only tag, never pushed by this job
+    - echo "Testing version display..."
+    - docker run --rm korap/conllu-spacy:test -V
+    - echo "Testing model listing..."
+    - docker run --rm korap/conllu-spacy:test -L
+
+build-docker-image:  # Builds the image via make, tags it, and exports it as an xz-compressed artifact
+  image: docker:latest
+  stage: build
+  variables:
+    FF_NETWORK_PER_BUILD: "true"  # GitLab Runner feature flag; quoted so it stays a string
+  services:
+    - name: docker:dind
+      alias: docker
+      command: ["--dns=127.0.0.11", "--dns=8.8.8.8"]
+  rules:
+    - if: $CI_COMMIT_TAG =~ /.+/  # tag pipelines: run automatically, image tag = git tag
+      variables:
+        VID: $CI_COMMIT_TAG
+    - when: manual  # everything else: run on demand and tag as a snapshot
+      variables:
+        VID_ALT: $CI_COMMIT_REF_SLUG-$CI_COMMIT_SHORT_SHA  # slug keeps the tag valid for refs containing "/"
+        VID: snapshot
+  cache:
+    key: conllu-spacy
+    paths:
+      - cache/
+  before_script:
+    - mkdir -p cache
+    - apk update
+    - apk add --no-cache bash make xz
+  script:
+    - make build  # expected to produce korap/conllu-spacy:latest, which is re-tagged below
+    - docker tag korap/conllu-spacy:latest korap/conllu-spacy:$VID
+    - |
+      if [ -n "$VID_ALT" ]; then
+        docker tag korap/conllu-spacy:latest korap/conllu-spacy:$VID_ALT
+      fi
+    - docker save korap/conllu-spacy | xz -T0 > conllu-spacy.xz
+  artifacts:
+    paths:
+      - conllu-spacy.xz  # loaded again by deploy-to-docker-hub
+
+deploy-to-docker-hub:  # Loads the image archive from build-docker-image and pushes it to Docker Hub
+  image: docker:latest
+  stage: deploy
+  variables:
+    FF_NETWORK_PER_BUILD: "true"  # GitLab Runner feature flag; quoted so it stays a string
+  services:
+    - name: docker:dind
+      alias: docker
+      command: ["--dns=127.0.0.11", "--dns=8.8.8.8"]
+  rules:
+    - if: $CI_COMMIT_TAG =~ /.+/  # tag pipelines deploy automatically
+    - when: manual  # all other pipelines deploy only on demand
+  dependencies:
+    - build-docker-image  # provides the conllu-spacy.xz artifact
+  before_script:
+    - echo "$DOCKER_HUB_PASSWORD" | docker login -u "$DOCKER_HUB_USER" --password-stdin
+  script:
+    - xz -d -c conllu-spacy.xz | docker load
+    - docker push --all-tags korap/conllu-spacy  # pushes every korap/conllu-spacy tag restored by docker load
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..85a1a6f
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,44 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+Version numbers follow the pattern: `<spaCy-version>-<release-number>`
+
+## [3.8.11-1] - 2025-01-29
+
+### Added
+- Initial release of conllu-spacy-docker
+- Multi-language support for 70+ languages via spaCy models
+- CoNLL-U input/output format support
+- On-demand model fetching with caching in `/local/models`
+- Optional GermaLemma integration for enhanced German lemmatization
+- Morphological features extraction in CoNLL-U format
+- Optional dependency parsing (HEAD/DEPREL columns)
+- Command-line options: `-h`, `-m MODEL`, `-L`, `-V`, `-d`, `-g`
+- Environment variables for configuration (batch size, chunk size, timeouts, etc.)
+- Model preloading via `preload-models.sh` script
+- Three Docker image variants:
+  - Standard (662 MB) - with GermaLemma
+  - Slim (490 MB) - without GermaLemma
+  - With-models (1.22 GB) - includes pre-installed de_core_news_lg model
+- Optimized Docker image using `COPY --chown` to avoid layer duplication
+- CI/CD pipelines for GitLab and GitHub
+- Progress indicators for model downloads
+- Non-root user execution for security
+- List available/installed models with `-L` flag
+- Display version information with `-V` flag
+
+### Features
+- Based on spaCy 3.8.11
+- Python 3.12.1
+- GermaLemma 0.1.3 (optional)
+- Multi-stage Docker build for size optimization
+- Configurable dependency parsing with timeout protection
+- Safe handling of long sentences
+- Batch processing for performance
+- Compatible with korapxmltool
+
+[3.8.11-1]: https://github.com/KorAP/conllu-spacy-docker/releases/tag/3.8.11-1
diff --git a/README.md b/README.md
index ede7a7c..6c55593 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,16 @@
# spaCy Docker Image with CoNLL-U Support
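+[![CI](https://github.com/KorAP/conllu-spacy-docker/actions/workflows/ci.yml/badge.svg)](https://github.com/KorAP/conllu-spacy-docker/actions/workflows/ci.yml)
+[![Docker Pulls](https://img.shields.io/docker/pulls/korap/conllu-spacy)](https://hub.docker.com/r/korap/conllu-spacy)
+[![Docker Image Size](https://img.shields.io/docker/image-size/korap/conllu-spacy/latest)](https://hub.docker.com/r/korap/conllu-spacy)
+[![GitHub issues](https://img.shields.io/github/issues/KorAP/conllu-spacy-docker)](https://github.com/KorAP/conllu-spacy-docker/issues)
+[![GitHub closed issues](https://img.shields.io/github/issues-closed/KorAP/conllu-spacy-docker)](https://github.com/KorAP/conllu-spacy-docker/issues?q=is%3Aissue+is%3Aclosed)
+[![GitHub last commit](https://img.shields.io/github/last-commit/KorAP/conllu-spacy-docker)](https://github.com/KorAP/conllu-spacy-docker/commits/master)
+[![License](https://img.shields.io/github/license/KorAP/conllu-spacy-docker)](LICENSE)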
+
Docker image for **spaCy** POS tagging, lemmatization and dependency parsing with support for input and output in [CoNLL-U format](https://universaldependencies.org/format.html).
-This is a slim, focused implementation extracted from [sota-pos-lemmatizers](https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/sota-pos-lemmatizers), originally developed by José Angel Daza(@angel-daza), following the same pattern as [conllu-treetagger-docker](https://github.com/KorAP/conllu-treetagger-docker).
+This is a slim, focused implementation extracted from [sota-pos-lemmatizers](https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/sota-pos-lemmatizers), originally developed by José Angel Daza (@angel-daza), following the same pattern as [conllu-treetagger-docker](https://github.com/KorAP/conllu-treetagger-docker).
## Features
@@ -16,12 +24,18 @@
## Installation
+### From [Docker Hub](https://hub.docker.com/r/korap/conllu-spacy)
+
+```shell
+docker pull korap/conllu-spacy
+```
+
### From source
```shell
-git clone https://github.com/KorAP/conllu-spacy-tagger-docker.git
-cd conllu-spacy-tagger-docker
-docker build -t korap/conllu-spacy .
+git clone https://github.com/KorAP/conllu-spacy-docker.git
+cd conllu-spacy-docker
+make
```
## Usage
@@ -33,7 +47,7 @@
docker run --rm -i korap/conllu-spacy < input.conllu > output.conllu
```
-### Faster processing without dependency parsing
+### Without dependency parsing
```shell
# Disable dependency parsing for faster processing
@@ -58,6 +72,7 @@
To avoid downloading the language model on every run, mount a local directory to `/local/models`:
```shell
+chmod 777 /path/to/local/models
docker run --rm -i -v /path/to/local/models:/local/models korap/conllu-spacy < input.conllu > output.conllu
```
@@ -143,7 +158,7 @@
### Version Information
-To check which version of spaCy and other components are installed:
+To check which versions of conllu-spacy-docker and its components are installed:
```shell
docker run --rm korap/conllu-spacy -V
@@ -151,7 +166,8 @@
Example output:
```
-=== spaCy Version Information ===
+=== Version Information ===
+conllu-spacy-docker version: 3.8.11-1
spaCy version: 3.8.11
GermaLemma version: 0.1.3
Python version: 3.12.1
@@ -217,7 +233,7 @@
Any spaCy model can be specified with the `-m` option. Models will be downloaded automatically on first use.
-spaCy provides trained models for **70+ languages**. See [spaCy Models](https://spacy.io/models) for the complete list.
+spaCy provides trained models for 70+ languages. See [spaCy Models](https://spacy.io/models) for the complete list.
### Example: German models (default)
- `de_core_news_lg` (default, 560MB) - Large model, best accuracy
@@ -242,10 +258,6 @@
- `en_core_web_md` (100MB) - Medium English model
- `en_core_web_sm` (15MB) - Small English model
-### Other supported languages
-
-Models are available for: Catalan, Chinese, Croatian, Danish, Dutch, Finnish, Greek, Italian, Japanese, Korean, Lithuanian, Macedonian, Norwegian, Polish, Portuguese, Romanian, Russian, Spanish, Swedish, Ukrainian, and many more.
-
**Note**: GermaLemma integration only works with German models. For other languages, the standard spaCy lemmatizer is used (with `-g` flag to disable GermaLemma).
## Performance
@@ -278,6 +290,8 @@
## License
-See the licenses of the individual components:
+This project's source code is licensed under the [BSD 2-Clause License](LICENSE).
+
+The bundled components are distributed under their own licenses:
- spaCy: MIT License
- GermaLemma: Apache 2.0 License
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index 639e371..6e6c989 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -61,7 +61,8 @@
exit 0
;;
V)
- echo "=== spaCy Version Information ===" >&2
+ echo "=== Version Information ===" >&2
+ echo "conllu-spacy-docker version: 3.8.11-1" >&2
python -c "import spacy; print(f'spaCy version: {spacy.__version__}')" >&2
# Check for GermaLemma