Put executable into build/bin and bump CI image to eclipse-temurin:25
Change-Id: I0908d61a50be2faa3367cb3182950855dff9ff16
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b909769..c29556e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: eclipse-temurin:21
+image: eclipse-temurin:25
stages:
- build
@@ -21,10 +21,10 @@
artifacts:
paths:
- app/build/libs/korapxmltool-*.jar
- - app/build/libs/korapxmltool
+ - build/bin/korapxmltool
cache:
key: "$CI_COMMIT_REF_NAME"
policy: push
paths:
- build
- - .gradle
+ - .gradle
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
index 13f87d1..a86dab9 100644
--- a/Readme.md
+++ b/Readme.md
@@ -16,10 +16,12 @@
./gradlew build
```
+After building, the executable will be available at `./build/bin/korapxmltool`.
+
## Conversion to [CoNLL-U format](https://universaldependencies.org/format.html)
```shell script
-$ java -jar ./app/build/libs/korapxmltool.jar app/src/test/resources/wdf19.zip | head -10
+$ ./build/bin/korapxmltool app/src/test/resources/wdf19.zip | head -10
# foundry = base
# filename = WDF19/A0000/13072/base/tokens.xml
@@ -37,7 +39,7 @@
## Conversion to language model training data input format from KorAP-XML
```shell script
-$ java -jar ./app/build/libs/korapxmltool.jar --word2vec t/data/wdf19.zip
+$ ./build/bin/korapxmltool --word2vec t/data/wdf19.zip
Arts visuels Pourquoi toujours vouloir séparer BD et Manga ?
Ffx 18:20 fév 25 , 2003 ( CET ) soit on ne sépara pas , soit alors on distingue aussi , le comics , le manwa , le manga ..
@@ -50,7 +52,7 @@
### Example producing language model training input with preceding metadata columns
```shell script
-java -jar ./app/build/libs/korapxmltool.jar -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)' --word2vec t/data/wdf19.zip
+./build/bin/korapxmltool -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)' --word2vec t/data/wdf19.zip
```
```
WDF19/A0000.10894 2014.08.28 Arts visuels Pourquoi toujours vouloir séparer BD et Manga ?
@@ -65,7 +67,7 @@
One text per line with `<p>` as sentence delimiter.
```shell script
-java -jar korapxmltool.jar -f now /vol/corpora/DeReKo/current/KorAP/zip/*24.zip | pv > dach24.txt
+./build/bin/korapxmltool -f now /vol/corpora/DeReKo/current/KorAP/zip/*24.zip | pv > dach24.txt
```
### Using lemmas instead of surface forms in word2vec / NOW output
@@ -74,10 +76,10 @@
```shell script
# Word2Vec style output with lemmas where available
-java -jar ./app/build/libs/korapxmltool.jar --lemma -f w2v app/src/test/resources/goe.tree_tagger.zip | head -3
+./build/bin/korapxmltool --lemma -f w2v app/src/test/resources/goe.tree_tagger.zip | head -3
# NOW corpus style output with lemmas
-java -jar ./app/build/libs/korapxmltool.jar --lemma -f now app/src/test/resources/goe.tree_tagger.zip | head -1
+./build/bin/korapxmltool --lemma -f now app/src/test/resources/goe.tree_tagger.zip | head -1
```
If a lemma for a token is missing (`_`) the surface form is used as fallback.
@@ -92,8 +94,8 @@
Example for large NOW export with progress and exclusions:
```
-java -Xmx64G -XX:+UseG1GC -Djdk.util.zip.disableMemoryMapping=true -Djdk.util.zip.reuseInflater=true \
- -jar korapxmltool.jar -l info --threads 100 --zip-parallelism 8 \
+KORAPXMLTOOL_XMX_MB=65536 KORAPXMLTOOL_JAVA_OPTS="-XX:+UseG1GC -Djdk.util.zip.disableMemoryMapping=true -Djdk.util.zip.reuseInflater=true" \
+ ./build/bin/korapxmltool -l info --threads 100 --zip-parallelism 8 \
--lemma-only --sequential -f now \
--exclude-zip-glob 'w?d24.tree_tagger.zip' \
/vol/corpora/DeReKo/current/KorAP/zip/*24.tree_tagger.zip | pv > dach2024.lemma.txt
@@ -108,7 +110,7 @@
Generate a tar archive containing gzipped Krill/KoralQuery JSON files across all provided foundries.
```shell script
-java -jar ./app/build/libs/korapxmltool.jar -f krill -D out/krill \
+./build/bin/korapxmltool -f krill -D out/krill \
app/src/test/resources/wud24_sample.zip \
app/src/test/resources/wud24_sample.spacy.zip \
app/src/test/resources/wud24_sample.marmot-malt.zip
@@ -123,14 +125,14 @@
You need to download the pre-trained MarMoT models from the [MarMoT models repository](http://cistern.cis.lmu.de/marmot/models/CURRENT/).
```shell script
-java -jar ./app/build/libs/korapxmltool.jar -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip
+./build/bin/korapxmltool -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip
```
### Tagging with integrated OpenNLP POS tagger directly to a new KorAP-XML ZIP file
You need to download the pre-trained OpenNLP models from the [OpenNLP model download page](https://opennlp.apache.org/models.html#part_of_speech_tagging) or older models from the [legacy OpenNLP models archive](http://opennlp.sourceforge.net/models-1.5/).
```shell script
-java -jar ./app/build/libs/korapxmltool.jar -f zip -t opennlp:/usr/local/kl/korap/Ingestion/lib/models/opennlp/de-pos-maxent.bin /tmp/zca24.zip
+./build/bin/korapxmltool -f zip -t opennlp:/usr/local/kl/korap/Ingestion/lib/models/opennlp/de-pos-maxent.bin /tmp/zca24.zip
```
### Tag and lemmatize with TreeTagger
@@ -139,7 +141,7 @@
Language models are downloaded automatically.
```shell script
-java -jar app/build/libs/korapxmltool.jar app/src/test/resources/wdf19.zip | docker run --rm -i korap/conllu2treetagger -l french | conllu2korapxml
+./build/bin/korapxmltool app/src/test/resources/wdf19.zip | docker run --rm -i korap/conllu2treetagger -l french | conllu2korapxml
```
### Tag and lemmatize with spaCy directly to a new KorAP-XML ZIP file
@@ -147,13 +149,13 @@
This requires the [spaCy Docker Image with CoNLL-U Support](https://gitlab.ids-mannheim.de/KorAP/sota-pos-lemmatizers) and is only available for German.
```shell script
-java -jar app/build/libs/korapxmltool.jar -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest 2> /dev/null" -f zip ./app/src/test/resources/goe.zip
+./build/bin/korapxmltool -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest 2> /dev/null" -f zip ./app/src/test/resources/goe.zip
```
### Tag, lemmatize and dependency parse with spaCy directly to a new KorAP-XML ZIP file
```shell script
-java -jar app/build/libs/korapxmltool.jar -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=True --rm -i korap/conllu2spacy:latest 2> /dev/null" -f zip ./app/src/test/resources/goe.zip
+./build/bin/korapxmltool -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=True --rm -i korap/conllu2spacy:latest 2> /dev/null" -f zip ./app/src/test/resources/goe.zip
```
### Tag, lemmatize and constituency parse with CoreNLP (3.X) directly to a new KorAP-XML ZIP file
@@ -161,7 +163,7 @@
Download the Stanford CoreNLP v3.X POS tagger and constituency parser models (e.g., `german-fast.tagger` and `germanSR.ser.gz`) into `libs/`.
```shell script
-java -jar ./app/build/libs/korapxmltool.jar -f zip -D out \
+./build/bin/korapxmltool -f zip -D out \
-t corenlp:libs/german-fast.tagger \
-P corenlp:libs/germanSR.ser.gz \
app/src/test/resources/wud24_sample.zip
@@ -175,12 +177,12 @@
Note that parsers take POS tagged input.
```shell script
-java -jar ./app/build/libs/korapxmltool.jar -f zip -T2 -P malt:libs/german.mco goe.tree_tagger.zip
+./build/bin/korapxmltool -f zip -T2 -P malt:libs/german.mco goe.tree_tagger.zip
```
### Tag with MarMoT and parse with Maltparser in one run directly to a new KorAP-XML ZIP file
```shell script
-java -jar ./app/build/libs/korapxmltool.jar -f zip -t marmot:models/de.marmot -P malt:libs/german.mco goe.zip
+./build/bin/korapxmltool -f zip -t marmot:models/de.marmot -P malt:libs/german.mco goe.zip
```
## Development and License
diff --git a/app/build.gradle b/app/build.gradle
index 8ae215f..0fad0bc 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -144,7 +144,11 @@
tasks.register('assembleShebangExecutable') {
dependsOn shadowJar
inputs.file(rootProject.file("korapxmltool.shebang"))
- outputs.file(new File(shadowJar.archiveFile.get().asFile.parent, "korapxmltool"))
+ inputs.file(shadowJar.archiveFile)
+
+ def binDir = rootProject.file("build/bin")
+ def targetExec = new File(binDir, "korapxmltool")
+ outputs.file(targetExec)
doLast {
def shebang = rootProject.file("korapxmltool.shebang")
@@ -152,8 +156,10 @@
throw new GradleException("Missing shebang stub: ${shebang}")
}
+ // Ensure bin directory exists
+ binDir.mkdirs()
+
def targetJar = shadowJar.archiveFile.get().asFile
- def targetExec = new File(targetJar.parent, "korapxmltool")
targetExec.withOutputStream { os ->
os << shebang.bytes
@@ -164,6 +170,11 @@
}
}
+tasks.named('assemble') {
+ dependsOn createJarSymlink
+ dependsOn assembleShebangExecutable
+}
+
tasks.named('build') {
dependsOn createJarSymlink
dependsOn assembleShebangExecutable