Also use archive-order plain streaming mode for single base ZIP CoNLL-U output
Change-Id: Ibbde7155a06e5467af8fd0c058e839ac256fe5c4
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b973f8..f8d6aeb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
- Plain NOW export now opens ZIP input with `java.util.zip.ZipFile` in streaming mode instead of Apache Commons `ZipFile`, removing the multi-minute startup delay on very large archives with huge entry counts and allowing extraction to begin almost immediately
- Plain NOW export startup and progress diagnostics now log ZIP open time and first-output timing more explicitly, making it easier to distinguish ZIP indexing overhead from actual extraction work
- Plain Word2Vec export now uses the same archive-order streaming ZIP path as plain NOW output, including the faster `java.util.zip.ZipFile` opener for large archives with many entries
+- Plain CoNLL-U export now also uses the archive-order streaming ZIP path when exactly one base ZIP is given, while multi-ZIP and foundry-paired CoNLL-U input stays on the ordered pipeline
## [v3.3.0] - 2026-03-26
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index d929e85..b039795 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -972,16 +972,28 @@
outputFormat == OutputFormat.WORD2VEC ||
outputFormat == OutputFormat.NOW)
+ private fun hasSingleBaseZipInput(): Boolean { // true only for exactly one input whose file name looks like a base ZIP
+ val inputs = zipFileNames ?: return false // no inputs configured -> cannot be a single base ZIP
+ if (inputs.size != 1) return false // multi-ZIP input is excluded
+ val name = File(inputs[0]).name // judge by the file name only, ignoring the directory part
+ return name.matches(Regex(".*\\.zip$")) && !name.matches(Regex(".*\\.[^/.]+\\.zip$")) // must end in .zip without an extra dot-separated segment before it (e.g. sample.spacy.zip is rejected)
+ }
+
internal fun canUseArchiveOrderTextStreaming(): Boolean =
- (outputFormat == OutputFormat.NOW || outputFormat == OutputFormat.WORD2VEC) &&
- annotationWorkerPool == null &&
+ annotationWorkerPool == null && // streaming requires that no annotation worker pool is set
taggerName == null &&
- parserName == null
+ parserName == null && // ...and that neither a tagger nor a parser is requested
+ when (outputFormat) { // the remaining condition depends on the output format
+ OutputFormat.NOW, OutputFormat.WORD2VEC -> true
+ OutputFormat.CONLLU -> hasSingleBaseZipInput() // CoNLL-U qualifies only with exactly one base ZIP
+ else -> false // every other output format is excluded
+ }
private fun textStreamingModeLabel(): String =
when (outputFormat) {
OutputFormat.NOW -> "NOW"
OutputFormat.WORD2VEC -> "Word2Vec"
+ OutputFormat.CONLLU -> "CoNLL-U" // explicit spelling instead of the enum name produced by the else branch
else -> outputFormat.name
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
index 61c4ebe..5a7bf73 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
@@ -158,9 +158,30 @@
}
@Test
- fun conlluOutputKeepsOrderedPipeline() {
+ fun singleBaseConlluOutputCanUseArchiveOrderStreaming() { // CoNLL-U with exactly one base ZIP is expected to allow archive-order streaming
val tool = KorapXmlTool()
tool.outputFormat = OutputFormat.CONLLU
+ tool.zipFileNames = arrayOf("/tmp/sample.zip") // one input, no extra dot-separated segment before .zip
+
+ assertTrue(tool.canUseArchiveOrderTextStreaming())
+ assertTrue(tool.canUseStaxTextParsing())
+ }
+
+ @Test
+ fun conlluOutputWithFoundryZipKeepsOrderedPipeline() { // a lone foundry ZIP must not enable archive-order streaming
+ val tool = KorapXmlTool()
+ tool.outputFormat = OutputFormat.CONLLU
+ tool.zipFileNames = arrayOf("/tmp/sample.spacy.zip") // the "spacy" segment before .zip makes this a foundry ZIP, not a base ZIP
+
+ assertTrue(!tool.canUseArchiveOrderTextStreaming()) // streaming is refused for this input
+ assertTrue(tool.canUseStaxTextParsing())
+ }
+
+ @Test
+ fun conlluOutputWithMultipleZipInputsKeepsOrderedPipeline() {
+ val tool = KorapXmlTool()
+ tool.outputFormat = OutputFormat.CONLLU
+ tool.zipFileNames = arrayOf("/tmp/sample.zip", "/tmp/sample.spacy.zip")
assertTrue(!tool.canUseArchiveOrderTextStreaming())
assertTrue(tool.canUseStaxTextParsing())