Update Readme.md and help examples
Change-Id: I53886de5bdf84f0289e29a309dcb1d8e65abfd2f
diff --git a/Readme.md b/Readme.md
index a86dab9..2d9060a 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,16 +1,12 @@
# korapxmltool
-Tool package to convert and annotate KorAP-XML ZIP files.
+Converts between the KorAP-XML ZIP format and formats like CoNLL-U, Krill, word2vec, and NOW, and annotates KorAP-XML ZIPs with various taggers and parsers.
-Up to 200 times faster and more accurate drop-in replacement for the korapxml2conllu part of [KorAP-XML-CoNLL-U](https://github.com/KorAP/KorAP-XML-CoNLL-U).
+Drop-in replacement for korapxml2conllu ([KorAP-XML-CoNLL-U](https://github.com/KorAP/KorAP-XML-CoNLL-U)) and korapxml2krill ([KorAP-XML-Krill](https://github.com/KorAP/KorAP-XML-Krill)).
-For some conversion tasks, however, you currently need the conllu2korapxml part of [KorAP-XML-CoNLL-U](https://github.com/KorAP/KorAP-XML-CoNLL-U).
-## Download
-You can download the latest jar build from the build artifacts [here](https://gitlab.ids-mannheim.de/KorAP/korapxml2conllu/-/jobs/artifacts/master/raw/app/build/libs/korapxmltool.jar?job=build).
-
-## Build it yourself
+## Build
```shell script
./gradlew build
@@ -149,13 +145,13 @@
This requires the [spaCy Docker Image with CoNLL-U Support](https://gitlab.ids-mannheim.de/KorAP/sota-pos-lemmatizers) and is only available for German.
```shell script
-./build/bin/korapxmltool -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest 2> /dev/null" -f zip ./app/src/test/resources/goe.zip
+./build/bin/korapxmltool -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest" -f zip ./app/src/test/resources/goe.zip
```
### Tag, lemmatize and dependency parse with spaCy directly to a new KorAP-XML ZIP file
```shell script
-./build/bin/korapxmltool -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=True --rm -i korap/conllu2spacy:latest 2> /dev/null" -f zip ./app/src/test/resources/goe.zip
+./build/bin/korapxmltool -T4 -A "docker run -e SPACY_USE_DEPENDENCIES=True --rm -i korap/conllu2spacy:latest" -f zip ./app/src/test/resources/goe.zip
```
### Tag, lemmatize and constituency parse with CoreNLP (3.X) directly to a new KorAP-XML ZIP file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index dc82a41..8b74787 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -51,12 +51,41 @@
val ZIP_ENTRY_UNIX_MODE = parseInt("644", 8)
@Command(
- name = "KorapXmlTool",
+ name = "korapxmltool",
mixinStandardHelpOptions = true,
- version = ["KorapXmlTool 2.99"],
- description = ["Converts KorAP-XML <https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or " +
- "morpho zips to (annotated) CoNLL(-U) format with all information necessary for " +
- "reconstruction in comment lines."]
+ version = ["KorAPXmlTool 2.99"],
+ usageHelpAutoWidth = false,
+ usageHelpWidth = 200,
+ description = ["Converts between KorAP-XML ZIP format and formats like CoNLL-U, Krill, word2vec, NOW\n"+
+ "and annotates KorAP XML ZIPs with various taggers and parsers.\n" +
+ "Drop-in replacement for korapxml2conllu (https://github.com/KorAP/KorAP-XML-CoNLL-U) and\n" +
+ "korapxml2krill (https://github.com/KorAP/KorAP-XML-Krill)\n"],
+ footer = ["%nExamples:",
+ " Basic conversion to CoNLL-U format:",
+ " ./build/bin/korapxmltool app/src/test/resources/wdf19.zip | head -10",
+ "",
+ " Word2Vec style output:",
+ " ./build/bin/korapxmltool --word2vec t/data/wdf19.zip",
+ "",
+ " Extract metadata and convert:",
+ " ./build/bin/korapxmltool -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)' --word2vec t/data/wdf19.zip",
+ "",
+ " NOW corpus export:",
+ " ./build/bin/korapxmltool -f now /vol/corpora/DeReKo/current/KorAP/zip/*24.zip | pv > dach24.txt",
+ "",
+ " Tag with external POS tagger:",
+ " ./build/bin/korapxmltool -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip",
+ "",
+ " Use external spaCy annotation (without dependencies):",
+ " ./build/bin/korapxmltool -T4 -A \"docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest\" -f zip ./app/src/test/resources/goe.zip",
+ "",
+ " Generate Krill format with multiple foundries:",
+ " ./build/bin/korapxmltool -f krill -D out/krill app/src/test/resources/wud24_sample.zip app/src/test/resources/wud24_sample.spacy.zip app/src/test/resources/wud24_sample.marmot-malt.zip",
+ "",
+ " Large corpus processing with custom memory and performance settings:",
+ " KORAPXMLTOOL_XMX_MB=512000 KORAPXMLTOOL_JAVA_OPTS=\"-XX:+UseG1GC\" \\",
+ " ./build/bin/korapxmltool --threads 100 -f zip -t marmot:models/de.marmot -P maltparser:models/de.malt wpd25*.zip"
+ ]
)
class KorapXmlTool : Callable<Int> {
@@ -80,7 +109,7 @@
"korapxml, xml, zip: KorAP-XML format zip",
"word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines",
"now, NOW: NOW corpus export format: w2v-like format with <p> tags for sentence ends and @@<text-sigle> prefix",
- "krill: Krill JSON format (tar file with gzipped JSON files, one per text)",
+ "krill: Krill JSON format (tar file with gzipped JSON files, one per text)"
],
converter = [OutputFormatConverter::class]
)