Overhaul and standardize command line options
Change-Id: I976da707c29b0bc9aac241398f834ef7198d0482
diff --git a/Readme.md b/Readme.md
index 3fce1e8..cc7ae27 100644
--- a/Readme.md
+++ b/Readme.md
@@ -84,15 +84,14 @@
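-- `--lemma-only`: For `-f w2v` and `-f now`, skip loading `data.xml` and output only lemmas from `morpho.xml`. This reduces memory and speeds up throughput.
+- `--lemma-only`: For `-t w2v` and `-t now`, skip loading `data.xml` and output only lemmas from `morpho.xml`. This reduces memory and speeds up throughput.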
- `--sequential`: Process entries inside each zip sequentially (zips can still run in parallel). Recommended for `w2v`/`now` to keep locality and lower memory.
-- `--zip-parallelism N`: Limit how many zips are processed concurrently (defaults to `--threads`). Helps avoid disk thrash and native inflater pressure.
- `--exclude-zip-glob GLOB` (repeatable): Skip zip basenames that match the glob (e.g., `--exclude-zip-glob 'w?d24.tree_tagger.zip'`).
Example for large NOW export with progress and exclusions:
```
KORAPXMLTOOL_XMX=64g KORAPXMLTOOL_MODELS_PATH=/data/models KORAPXMLTOOL_JAVA_OPTS="-XX:+UseG1GC -Djdk.util.zip.disableMemoryMapping=true -Djdk.util.zip.reuseInflater=true" \
- ./build/bin/korapxmltool -l info --threads 100 --zip-parallelism 8 \
- --lemma-only --sequential -f now \
+ ./build/bin/korapxmltool -l info -j 100 \
+ --lemma-only --sequential -t now \
--exclude-zip-glob 'w?d24.tree_tagger.zip' \
/vol/corpora/DeReKo/current/KorAP/zip/*24.tree_tagger.zip | pv > dach2024.lemma.txt
```
@@ -124,14 +123,14 @@
```shell script
# With full path
-./build/bin/korapxmltool -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip
+./build/bin/korapxmltool -t zip -T marmot:models/de.marmot app/src/test/resources/goe.zip
# With KORAPXMLTOOL_MODELS_PATH (searches in /data/models/ if model not found locally)
export KORAPXMLTOOL_MODELS_PATH=/data/models
-./build/bin/korapxmltool -f zip -t marmot:de.marmot app/src/test/resources/goe.zip
+./build/bin/korapxmltool -t zip -T marmot:de.marmot app/src/test/resources/goe.zip
-# Without setting KORAPXMLTOOL_MODELS_PATH (uses default ../lib/models from executable)
-./build/bin/korapxmltool -f zip -t marmot:de.marmot app/src/test/resources/goe.zip
+# Without setting KORAPXMLTOOL_MODELS_PATH (searches current directory only)
+./build/bin/korapxmltool -t zip -T marmot:models/de.marmot app/src/test/resources/goe.zip
```
### Tagging with integrated OpenNLP POS tagger directly to a new KorAP-XML ZIP file
@@ -183,12 +182,12 @@
Note that parsers take POS tagged input.
```shell script
-./build/bin/korapxmltool -f zip -T2 -P malt:german.mco goe.tree_tagger.zip
+./build/bin/korapxmltool -t zip -j2 -P malt:german.mco goe.tree_tagger.zip
```
### Tag with MarMoT and parse with Maltparser in one run directly to a new KorAP-XML ZIP file
```shell script
-./build/bin/korapxmltool -f zip -t marmot:models/de.marmot -P malt:german.mco goe.zip
+./build/bin/korapxmltool -t zip -T marmot:models/de.marmot -P malt:german.mco goe.zip
```
## Development and License
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 6c06c7f..6e325c7 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -72,27 +72,27 @@
" ./build/bin/korapxmltool app/src/test/resources/wdf19.tree_tagger.zip | head -10",
"",
" Word2Vec style output:",
- " ./build/bin/korapxmltool -f w2v app/src/test/resources/wud24_sample.zip",
+ " ./build/bin/korapxmltool -t w2v app/src/test/resources/wud24_sample.zip",
"",
" Extract metadata and convert:",
- " ./build/bin/korapxmltool -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)' --word2vec t/data/wdf19.zip",
+ " ./build/bin/korapxmltool -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)' -t w2v t/data/wdf19.zip",
"",
" NOW corpus export:",
- " ./build/bin/korapxmltool -f now /vol/corpora/DeReKo/current/KorAP/zip/*24.zip | pv > dach24.txt",
+ " ./build/bin/korapxmltool -t now /vol/corpora/DeReKo/current/KorAP/zip/*24.zip | pv > dach24.txt",
"",
" Tag with integrated MarMot POS tagger, and parse with internal Malt parser:",
- " ./build/bin/korapxmltool -f zip -t marmot:de.marmot -P malt:german.mco app/src/test/resources/goe.zip",
- " # (uses KORAPXMLTOOL_MODELS_PATH if model not found in current directory; defaults to ../lib/models)",
+ " ./build/bin/korapxmltool -t zip -T marmot:de.marmot -P malt:german.mco app/src/test/resources/goe.zip",
+ " # (uses KORAPXMLTOOL_MODELS_PATH if model not found in current directory)",
"",
" Use external spaCy annotation (without dependencies):",
- " ./build/bin/korapxmltool -T4 -A \"docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest\" -f zip ./app/src/test/resources/goe.zip",
+ " ./build/bin/korapxmltool -j4 -A \"docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest\" -t zip ./app/src/test/resources/goe.zip",
"",
" Generate Krill tar from wud24_sample with multiple annotation foundries:",
- " ./build/bin/korapxmltool -f krill -D . app/src/test/resources/wud24_sample*.zip",
+ " ./build/bin/korapxmltool -t krill -D . app/src/test/resources/wud24_sample*.zip",
"",
" Large corpus annotation with custom memory and performance and default model settings:",
" KORAPXMLTOOL_XMX=500g KORAPXMLTOOL_MODELS_PATH=/data/models KORAPXMLTOOL_JAVA_OPTS=\"-XX:+UseG1GC\" \\",
- " ./build/bin/korapxmltool --threads 100 -f zip -t marmot -P malt wpd25*.zip"
+ " ./build/bin/korapxmltool -j 100 -t zip -T marmot -P malt wpd25*.zip"
]
)
@@ -111,7 +111,7 @@
var zipFileNames: Array<String>? = null
@Option(
- names = ["-f", "--output-format"],
+ names = ["-t", "--to"],
description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}, ${KrillOutputFormat.NAME}",
"conllu: CoNLL-U format",
"korapxml, xml, zip: KorAP-XML format zip",
@@ -169,16 +169,6 @@
)
var columns: Int = 10
- @Option(
- names = ["--word2vec", "-w"],
- description = ["Print text in LM training format: tokens separated by space, sentences separated by newline",
- "Deprecated: use -f word2vec"]
- )
- fun setWord2Vec(word2vec: Boolean) {
- if (word2vec) {
- outputFormat = OutputFormat.WORD2VEC
- }
- }
@Option(
names = ["--exclude-zip-glob"],
@@ -237,7 +227,7 @@
var quiet: Boolean = false
@Option(
- names = ["--threads", "-T"],
+ names = ["-j", "--jobs", "--threads"],
paramLabel = "THREADS",
description = ["Maximum number of threads to use. Default: ${"$"}{DEFAULT-VALUE}"]
)
@@ -250,12 +240,6 @@
System.setProperty("java.util.concurrent.ForkJoinPool.common.parallelism", threads.toString())
}
- @Option(
- names = ["--zip-parallelism"],
- paramLabel = "N",
- description = ["Maximum number of zip files to process concurrently. Defaults to --threads."]
- )
- var zipParallelism: Int? = null
@Option(
names = ["--sequential"],
@@ -266,7 +250,7 @@
var sequentialInZip: Boolean = false
@Option(
- names = ["--overwrite", "-o"],
+ names = ["-f", "--force"],
description = ["Overwrite existing files"]
)
var overwrite: Boolean = false
@@ -295,7 +279,7 @@
names = ["--lemma-only"],
description = [
"Do not load texts from data.xml and output only lemmas (requires morpho.xml).",
- "Only valid with -f word2vec or -f now; implies --lemma."
+ "Only valid with -t word2vec or -t now; implies --lemma."
]
)
var lemmaOnly: Boolean = false
@@ -347,7 +331,7 @@
return null
}
@Option(
- names = ["--tag-with", "-t"],
+ names = ["-T", "--tag-with"],
paramLabel = "TAGGER[:MODEL]",
description = ["Specify a tagger and optionally a model: ${taggerFoundries}[:<path/to/model>].",
"If model is omitted, defaults are: marmot→de.marmot, opennlp→de-pos-maxent.bin, corenlp→german-fast.tagger"]
@@ -394,7 +378,7 @@
private var parserName: String? = null
private var parserModel: String? = null
@Option(
- names = ["--parse-with", "-P"],
+ names = ["-P", "--parse-with"],
paramLabel = "PARSER[:MODEL]",
description = ["Specify a parser and optionally a model: ${parserFoundries}[:<path/to/model>].",
"If model is omitted, defaults are: malt→german.mco, corenlp→germanSR.ser.gz"]
@@ -465,7 +449,7 @@
if (lemmaOnly) {
useLemma = true
if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) {
- throw ParameterException(spec.commandLine(), "--lemma-only is supported only with -f word2vec or -f now")
+ throw ParameterException(spec.commandLine(), "--lemma-only is supported only with -t word2vec or -t now")
}
}
@@ -811,7 +795,7 @@
LOGGER.info("Initializing krill TAR output: $krillOutputFileName")
if (File(krillOutputFileName!!).exists() && !overwrite) {
- LOGGER.severe("Output file $krillOutputFileName already exists. Use --overwrite to overwrite.")
+ LOGGER.severe("Output file $krillOutputFileName already exists. Use --force to overwrite.")
exitProcess(1)
}
@@ -892,7 +876,7 @@
// Check for existing output file BEFORE redirecting logging, so user sees the message
if (File(outputMorphoZipFileName).exists() && !overwrite) {
- val errorMsg = "Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite."
+ val errorMsg = "Output file $outputMorphoZipFileName already exists. Use --force to overwrite."
System.err.println("ERROR: $errorMsg")
LOGGER.severe(errorMsg)
exitProcess(1)
@@ -987,13 +971,13 @@
if (sequentialInZip) {
if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) {
- throw ParameterException(spec.commandLine(), "--sequential is supported only with -f word2vec or -f now")
+ throw ParameterException(spec.commandLine(), "--sequential is supported only with -t word2vec or -t now")
}
}
if (maxThreads > 1) {
val foundry = getFoundryFromZipFileNames(zips)
- val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)
+ val parallelism = maxThreads.coerceAtLeast(1)
LOGGER.info("Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) "sequential" else "parallel"}")
processZipsWithQueue(zips, foundry, parallelism)
} else {
@@ -1350,7 +1334,7 @@
// Check for existing output file BEFORE redirecting logging, so user sees the message
if (File(outputMorphoZipFileName).exists() && !overwrite) {
- val errorMsg = "Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite."
+ val errorMsg = "Output file $outputMorphoZipFileName already exists. Use --force to overwrite."
System.err.println("ERROR: $errorMsg")
LOGGER.severe(errorMsg)
exitProcess(1)
@@ -4206,7 +4190,7 @@
zipInventory.clear()
// Scan ZIPs in parallel for faster startup
- val scanParallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)
+ val scanParallelism = maxThreads.coerceAtLeast(1)
val executor = java.util.concurrent.Executors.newFixedThreadPool(scanParallelism)
try {
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index bfbc622..ea1860f 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -178,18 +178,16 @@
@Test
- fun deprecatedW2vOptionWorks() {
+ fun removedW2vOptionFails() {
+ // Test that the old -w option no longer works (should fail for v3.0)
val args = arrayOf("-w", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "\nje ne suis pas du tout d'accord !\n"
- )
- assertFalse { outContent.toString().contains("WDF19_A0000.13865") }
+ val exitCode = debug(args)
+ // Should fail since -w was removed
+ assertTrue(exitCode != 0, "Old -w option should no longer work in v3.0")
}
@Test
fun w2vOptionWorks() {
- val args = arrayOf("-f", "w2v", loadResource("wdf19.zip").path)
+ val args = arrayOf("-t", "w2v", loadResource("wdf19.zip").path)
debug(args)
assertContains(
outContent.toString(),
@@ -200,7 +198,7 @@
@Test
fun nowOptionWorks() {
- val args = arrayOf("-f", "now", loadResource("wdf19.zip").path)
+ val args = arrayOf("-t", "now", loadResource("wdf19.zip").path)
debug(args)
val output = outContent.toString()
// Check that output starts with @@<text-sigle>
@@ -218,7 +216,7 @@
@Test
fun canConvertXMLwithInvalidComments() {
- val args = arrayOf("-w", zca20scrambled)
+ val args = arrayOf("-t", "w2v", zca20scrambled)
debug(args)
assertContains(
outContent.toString(),
@@ -250,7 +248,7 @@
@Test
fun canExtractMetadata() {
- val args = arrayOf("--word2vec", "-m" ,"<textSigle>([^<]+)", "-m", "<creatDate>([^<]+)", loadResource("wdf19.zip").path)
+ val args = arrayOf("-t", "w2v", "-m" ,"<textSigle>([^<]+)", "-m", "<creatDate>([^<]+)", loadResource("wdf19.zip").path)
debug(args)
assertContains(
outContent.toString(),
@@ -260,7 +258,7 @@
@Test
fun canHandleNonBmpText() {
- val args = arrayOf("--word2vec", wdd17)
+ val args = arrayOf("-t", "w2v", wdd17)
debug(args)
assertContains(
outContent.toString(),
@@ -317,7 +315,7 @@
val tmpSourceFileName = tmpSourceFile.absolutePath
File(sourceFile).copyTo(File(tmpSourceFileName), true)
val outputDir = File(tmpSourceFileName).parentFile.absolutePath
- val args = arrayOf("-D", outputDir, "-o", "-f", "zip", tmpSourceFileName)
+ val args = arrayOf("-D", outputDir, "-f", "-t", "zip", tmpSourceFileName)
debug(args)
val resultFile = tmpSourceFileName.toString().replace(".zip", ".base.zip")
@@ -333,7 +331,7 @@
val resultFile = tmpSourceFileName.toString().replace(".zip", ".base.zip")
File(resultFile).createNewFile()
val outputDir = File(tmpSourceFileName).parentFile.absolutePath
- val args = arrayOf("-D", outputDir, "-o", "-f", "zip", tmpSourceFileName)
+ val args = arrayOf("-D", outputDir, "-f", "-t", "zip", tmpSourceFileName)
debug(args)
assert(File(resultFile).exists())
assert(File(resultFile).length() > 0)
@@ -341,7 +339,7 @@
@Test
fun canWord2VecLemma() {
- val args = arrayOf("--lemma", "-f", "w2v", loadResource("goe.tree_tagger.zip").path)
+ val args = arrayOf("--lemma", "-t", "w2v", loadResource("goe.tree_tagger.zip").path)
debug(args)
val out = outContent.toString()
// Expect lemma sequence containing "mein Ankunft" (surface would include inflected form elsewhere)
@@ -350,7 +348,7 @@
@Test
fun canNowLemma() {
- val args = arrayOf("--lemma", "-f", "now", loadResource("goe.tree_tagger.zip").path)
+ val args = arrayOf("--lemma", "-t", "now", loadResource("goe.tree_tagger.zip").path)
debug(args)
val out = outContent.toString()
assertContains(out, "@@")
@@ -360,7 +358,7 @@
@Test
fun lemmaOnlyWord2VecWorks() {
- val args = arrayOf("--lemma-only", "-f", "w2v", loadResource("goe.tree_tagger.zip").path)
+ val args = arrayOf("--lemma-only", "-t", "w2v", loadResource("goe.tree_tagger.zip").path)
debug(args)
val out = outContent.toString()
// Should produce some lemma tokens without requiring data.xml
@@ -369,7 +367,7 @@
@Test
fun lemmaOnlyNowWorks() {
- val args = arrayOf("--lemma-only", "-f", "now", loadResource("goe.tree_tagger.zip").path)
+ val args = arrayOf("--lemma-only", "-t", "now", loadResource("goe.tree_tagger.zip").path)
debug(args)
val out = outContent.toString()
assertContains(out, "@@")
@@ -393,7 +391,7 @@
val rc = debug(args)
// Non-zero is expected; and error message should be present
assertTrue(rc != 0)
- assertContains(errContent.toString(), "--sequential is supported only with -f word2vec or -f now")
+ assertContains(errContent.toString(), "--sequential is supported only with -t word2vec or -t now")
}
@Test
@@ -475,7 +473,7 @@
val generatedTar = ensureKrillTar("wud24_full_foundries") { outputDir ->
arrayOf(
- "-f", "krill",
+ "-t", "krill",
"-l", "info",
"-D", outputDir.path,
baseZip,
@@ -617,7 +615,7 @@
val spacyZip = loadResource("wud24_sample.spacy.zip").path
val generatedTar = ensureKrillTar("wud24_base_spacy") { outputDir ->
- arrayOf("-f", "krill", "-D", outputDir.path, baseZip, spacyZip)
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip)
}
assertTrue(generatedTar.exists())
@@ -659,7 +657,7 @@
val spacyZip = loadResource("wud24_sample.spacy.zip").path
val generatedTar = ensureKrillTar("wud24_base_spacy") { outputDir ->
- arrayOf("-f", "krill", "-D", outputDir.path, baseZip, spacyZip)
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip)
}
assertTrue(generatedTar.exists())
@@ -707,7 +705,7 @@
val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
val generatedTar = ensureKrillTar("wud24_full_foundries") { outputDir ->
- arrayOf("-f", "krill", "-D", outputDir.path, baseZip, spacyZip, marmotZip, opennlpZip, treeTaggerZip)
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip, marmotZip, opennlpZip, treeTaggerZip)
}
assertTrue(generatedTar.exists())
@@ -749,7 +747,7 @@
val spacyZip = loadResource("wud24_sample.spacy.zip").path
val defaultTar = ensureKrillTar("wud24_default_corenlp") { outputDir ->
- arrayOf("-f", "krill", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
}
assertTrue(defaultTar.exists(), "Default krill tar should exist")
@@ -765,7 +763,7 @@
)
val flagTar = ensureKrillTar("wud24_default_corenlp_nwt") { outputDir ->
- arrayOf("-f", "krill", "--non-word-tokens", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
+ arrayOf("-t", "krill", "--non-word-tokens", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
}
assertTrue(flagTar.exists(), "Krill tar should exist when --non-word-tokens is set")
@@ -794,7 +792,7 @@
val kotlinTar = ensureKrillTar("wud24_reference_default") { outputDir ->
arrayOf(
- "-f", "krill",
+ "-t", "krill",
"-D", outputDir.path,
baseZip,
spacyZip,
@@ -837,7 +835,7 @@
val kotlinTar = ensureKrillTar("wud24_reference_nwt") { outputDir ->
arrayOf(
- "-f", "krill",
+ "-t", "krill",
"--non-word-tokens",
"-D", outputDir.path,
baseZip,
@@ -898,7 +896,7 @@
try {
// Run CoreNLP with both tagger and parser
val args = arrayOf(
- "-f", "zip",
+ "-t", "zip",
- "-o",
+ "-f",
"-D", outputDir.path,
- "-t", "corenlp:${taggerModel.path}",
+ "-T", "corenlp:${taggerModel.path}",