Use archive-order plain streaming mode also for w2v output
Change-Id: I42a03c401c16b17922277024cb1663f655a692f6
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 498a763..1b973f8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
- Plain NOW export now opens ZIP input with `java.util.zip.ZipFile` in streaming mode instead of Apache Commons `ZipFile`, removing the multi-minute startup delay on very large archives with huge entry counts and allowing extraction to begin almost immediately
- Plain NOW export startup and progress diagnostics now log ZIP open time and first-output timing more explicitly, making it easier to distinguish ZIP indexing overhead from actual extraction work
+- Plain Word2Vec export now uses the same archive-order streaming ZIP path as plain NOW output, including the faster `java.util.zip.ZipFile` opener for large archives with many entries
## [v3.3.0] - 2026-03-26
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 7051227..d929e85 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -972,12 +972,19 @@
outputFormat == OutputFormat.WORD2VEC ||
outputFormat == OutputFormat.NOW)
- internal fun canStreamNowEntriesImmediately(): Boolean =
- outputFormat == OutputFormat.NOW &&
+ internal fun canUseArchiveOrderTextStreaming(): Boolean =
+ (outputFormat == OutputFormat.NOW || outputFormat == OutputFormat.WORD2VEC) &&
annotationWorkerPool == null &&
taggerName == null &&
parserName == null
+ private fun textStreamingModeLabel(): String =
+ when (outputFormat) {
+ OutputFormat.NOW -> "NOW"
+ OutputFormat.WORD2VEC -> "Word2Vec"
+ else -> outputFormat.name
+ }
+
internal fun canUseStaxTextParsing(): Boolean =
outputFormat == OutputFormat.CONLLU ||
outputFormat == OutputFormat.WORD2VEC ||
@@ -1353,13 +1360,13 @@
Thread(r, "KrillWorker-${Thread.currentThread().threadId()}")
}
LOGGER.info("Initialized work-stealing scheduler with $maxThreads worker threads for Krill output")
- } else if (canStreamNowEntriesImmediately()) {
+ } else if (canUseArchiveOrderTextStreaming()) {
entryExecutor = java.util.concurrent.Executors.newFixedThreadPool(maxThreads) { r ->
- Thread(r, "NowEntryWorker-${Thread.currentThread().threadId()}")
+ Thread(r, "TextStreamWorker-${Thread.currentThread().threadId()}")
}
val textParserMode = if (shouldParseDataXmlWithStax()) "StAX" else "DOM"
LOGGER.info(
- "Initialized NOW streaming mode: archive-order entries, parallel entry processing, " +
+ "Initialized ${textStreamingModeLabel()} streaming mode: archive-order entries, parallel entry processing, " +
"no text-ID scheduling, data.xml via $textParserMode"
)
} else {
@@ -1620,8 +1627,8 @@
if (maxThreads > 1) {
val foundry = getFoundryFromZipFileNames(zips)
val parallelism = maxThreads.coerceAtLeast(1)
- if (canStreamNowEntriesImmediately()) {
- LOGGER.info("Processing zips in NOW streaming mode; zip parallelism=$parallelism; entry order=archive")
+ if (canUseArchiveOrderTextStreaming()) {
+ LOGGER.info("Processing zips in ${textStreamingModeLabel()} streaming mode; zip parallelism=$parallelism; entry order=archive")
} else {
LOGGER.info("Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) "sequential" else "parallel"}")
}
@@ -2222,7 +2229,7 @@
private fun compareTextIds(a: String, b: String): Int =
monthAwareSortKey(a).compareTo(monthAwareSortKey(b))
- private fun shouldUseUnicodeExtraFieldsOnOpen(): Boolean = !canStreamNowEntriesImmediately()
+ private fun shouldUseUnicodeExtraFieldsOnOpen(): Boolean = !canUseArchiveOrderTextStreaming()
private interface ReadableZipEntry {
val name: String
@@ -2291,7 +2298,7 @@
return "base"
}
- private fun useJavaZipForNow(): Boolean = canStreamNowEntriesImmediately()
+ private fun useJavaZipForTextStreaming(): Boolean = canUseArchiveOrderTextStreaming()
private fun processZipFile(zipFilePath: String, foundry: String = "base") {
val ord = zipOrdinals[zipFilePath] ?: 0
@@ -2415,7 +2422,7 @@
LOGGER.fine("About to process ZIP entries: hasCorrespondingBaseZip=${zipFilePath.hasCorrespondingBaseZip()}")
if (zipFilePath.hasCorrespondingBaseZip()) {
val baseZip = zipFilePath.correspondingBaseZip()!!
- val relatedZips = if (canStreamNowEntriesImmediately() && !lemmaOnly) {
+ val relatedZips = if (canUseArchiveOrderTextStreaming() && !lemmaOnly) {
arrayOf(baseZip, zipFilePath)
} else {
arrayOf(zipFilePath, baseZip)
@@ -2428,9 +2435,9 @@
} else {
foundry // Keep original foundry for non-krill formats
}
- if (useJavaZipForNow()) {
+ if (useJavaZipForTextStreaming()) {
openJavaZipFile(zip).use { zipFile ->
- LOGGER.info("Using NOW streaming mode for $zip: archive-order entries, no text-ID sorting")
+ LOGGER.info("Using ${textStreamingModeLabel()} streaming mode for $zip: archive-order entries, no text-ID sorting")
processZipEntriesStreaming(zipFile, zip, zipFoundry, true)
}
} else {
@@ -2443,9 +2450,9 @@
LOGGER.fine("Opening ZipFile for processing: $zipFilePath")
try {
// If no corresponding base ZIP exists, this IS the base ZIP
- if (useJavaZipForNow()) {
+ if (useJavaZipForTextStreaming()) {
openJavaZipFile(zipFilePath).use { zipFile ->
- LOGGER.info("Using NOW streaming mode for $zipFilePath: archive-order entries, no text-ID sorting")
+ LOGGER.info("Using ${textStreamingModeLabel()} streaming mode for $zipFilePath: archive-order entries, no text-ID sorting")
processZipEntriesStreaming(zipFile, zipFilePath, foundry, false)
}
} else {
@@ -2482,7 +2489,7 @@
if (zipFilePath.hasCorrespondingBaseZip()) {
// Process the two related zips strictly sequentially to limit memory growth
val baseZip = zipFilePath.correspondingBaseZip()!!
- val zips = if (canStreamNowEntriesImmediately() && !lemmaOnly) {
+ val zips = if (canUseArchiveOrderTextStreaming() && !lemmaOnly) {
arrayOf(baseZip, zipFilePath)
} else {
arrayOf(zipFilePath, baseZip)
@@ -2494,9 +2501,9 @@
} else {
foundry // Keep original foundry for non-krill formats
}
- if (useJavaZipForNow()) {
+ if (useJavaZipForTextStreaming()) {
openJavaZipFile(zip).use { zipFile ->
- LOGGER.info("Using NOW streaming mode for $zip: archive-order entries, no text-ID sorting")
+ LOGGER.info("Using ${textStreamingModeLabel()} streaming mode for $zip: archive-order entries, no text-ID sorting")
processZipEntriesStreaming(zipFile, zip, zipFoundry, true)
}
} else {
@@ -2512,9 +2519,9 @@
}
}
} else {
- if (useJavaZipForNow()) {
+ if (useJavaZipForTextStreaming()) {
openJavaZipFile(zipFilePath).use { zipFile ->
- LOGGER.info("Using NOW streaming mode for $zipFilePath: archive-order entries, no text-ID sorting")
+ LOGGER.info("Using ${textStreamingModeLabel()} streaming mode for $zipFilePath: archive-order entries, no text-ID sorting")
processZipEntriesStreaming(zipFile, zipFilePath, foundry, false)
}
} else {
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
index 6668220..61c4ebe 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
@@ -137,11 +137,11 @@
}
@Test
- fun plainNowOutputCanStreamWithoutSorting() {
+ fun plainNowOutputCanUseArchiveOrderStreaming() {
val tool = KorapXmlTool()
tool.outputFormat = OutputFormat.NOW
- assertTrue(tool.canStreamNowEntriesImmediately())
+ assertTrue(tool.canUseArchiveOrderTextStreaming())
assertTrue(tool.canUseStaxTextParsing())
assertTrue(!tool.shouldParseDataXmlWithStax())
tool.useStaxTextParser = true
@@ -149,11 +149,20 @@
}
@Test
- fun nonNowOutputKeepsOrderedPipeline() {
+ fun plainWord2VecOutputCanUseArchiveOrderStreaming() {
+ val tool = KorapXmlTool()
+ tool.outputFormat = OutputFormat.WORD2VEC
+
+ assertTrue(tool.canUseArchiveOrderTextStreaming())
+ assertTrue(tool.canUseStaxTextParsing())
+ }
+
+ @Test
+ fun conlluOutputKeepsOrderedPipeline() {
val tool = KorapXmlTool()
tool.outputFormat = OutputFormat.CONLLU
- assertTrue(!tool.canStreamNowEntriesImmediately())
+ assertTrue(!tool.canUseArchiveOrderTextStreaming())
assertTrue(tool.canUseStaxTextParsing())
}