Scan zips only once
Change-Id: I707728a17d81d974c9022e000d7331b1b96c866e
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 4335966..5d6292f 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1066,32 +1066,8 @@
}
LOGGER.info("Expected foundries for Krill output: ${expectedFoundries.sorted()}")
- // Build inventory of which texts exist in which ZIPs for incremental output
- buildZipInventory(args)
-
- // Initialize progress bar for incremental output
- if (!quiet) {
- val totalTexts = zipInventory.values.flatten().toSet().size
- if (totalTexts > 0) {
- incrementalProgressBar = ProgressBarBuilder()
- .setTaskName("$baseZipName.krill.tar")
- .setInitialMax(totalTexts.toLong())
- .setStyle(ProgressBarStyle.COLORFUL_UNICODE_BAR)
- .setUpdateIntervalMillis(500)
- .showSpeed()
- .build()
- }
- }
-
- // Start dedicated writer thread for incremental output
- // Only enable if we have multiple texts to benefit from incremental processing
- val totalTexts = zipInventory.values.flatten().toSet().size
- if (totalTexts > 1) {
- startIncrementalWriterThread()
- LOGGER.info("Enabled incremental output for $totalTexts texts")
- } else {
- LOGGER.info("Disabled incremental output (only $totalTexts text)")
- }
+ // Note: Progress bar and incremental writer will be initialized in processZipsInterleavedForKrill
+ // after scanning ZIPs (to avoid double scanning)
}
if (annotateWith.isNotEmpty()) {
@@ -1458,31 +1434,79 @@
val foundryDataList = mutableListOf<FoundryData>()
try {
- // Open all ZIPs and keep them open
- zips.forEach { zipPath ->
- val zipFoundry = getFoundryFromZipFileName(zipPath)
- LOGGER.info("Opening ZIP: $zipPath for foundry=$zipFoundry")
-
- try {
- val zipFile = ApacheZipFile(File(zipPath))
- val entries = zipFile.entries.toList()
- .filter { !it.isDirectory && it.name.matches(Regex(".*(data|tokens|structure|morpho|dependency|sentences|constituency)\\.xml$")) }
+ // Open all ZIPs in parallel for faster startup
+ val scanParallelism = maxThreads.coerceAtLeast(1)
+ val executor = java.util.concurrent.Executors.newFixedThreadPool(scanParallelism)
+
+ val futures = zips.map { zipPath ->
+ executor.submit<FoundryData?> {
+ val zipFoundry = getFoundryFromZipFileName(zipPath)
+ LOGGER.info("Opening ZIP: $zipPath for foundry=$zipFoundry")
- val entriesByTextId = entries.groupBy { getTextIdFromPath(it.name) }
- foundryDataList.add(FoundryData(zipFile, zipPath, zipFoundry, entriesByTextId))
- LOGGER.info("Found ${entriesByTextId.size} texts in $zipFoundry")
- } catch (e: Exception) {
- LOGGER.severe("Failed to open ZIP $zipPath: ${e.message}")
+ try {
+ val zipFile = ApacheZipFile(File(zipPath))
+ // Compile the entry-name filter once per ZIP instead of once per entry
+ val wantedXml = Regex(".*(data|tokens|structure|morpho|dependency|sentences|constituency)\\.xml$")
+ val entries = zipFile.entries.toList().filter { !it.isDirectory && it.name.matches(wantedXml) }
+ val entriesByTextId = entries.groupBy { getTextIdFromPath(it.name) }
+ // Build inventory for this ZIP (used for old flow fallback and logging).
+ // NOTE(review): this now runs on executor worker threads; confirm zipInventory is safe for concurrent writes
+ zipInventory[zipPath] = entriesByTextId.keys.toMutableSet()
+
+ // Use appropriate wording: base ZIP contains texts, annotation foundries have annotations on texts
+ if (zipFoundry == "base") {
+ LOGGER.info(" $zipPath contains ${entriesByTextId.size} texts")
+ } else {
+ LOGGER.info(" $zipPath has annotations on ${entriesByTextId.size} texts")
+ }
+ FoundryData(zipFile, zipPath, zipFoundry, entriesByTextId)
+ } catch (e: Exception) {
+ LOGGER.severe("Failed to open ZIP $zipPath: ${e.message}")
+ null
+ }
}
}
+ // Collect results in submission order. get() rethrows anything a task did not catch
+ // itself, so shut the pool down in finally; otherwise its worker threads stay alive.
+ try {
+ futures.forEach { future ->
+ future.get()?.let { foundryDataList.add(it) }
+ }
+ } finally {
+ executor.shutdown()
+ }
+
// Get all unique text IDs across all foundries, sorted
val allTextIds = foundryDataList
.flatMap { it.entriesByTextId.keys }
.toSet()
.sortedWith(this::compareTextIds)
+ // Set expected text order for the scanner
+ expectedTextOrder = allTextIds
+ nextTextOrderIndex = 0
+ scanOrderLogged = false
+
LOGGER.info("Processing ${allTextIds.size} texts across ${foundryDataList.size} foundries in interleaved order")
+ LOGGER.info(" Text processing order (first 20): ${expectedTextOrder.take(20)}")
+
+ // Initialize progress bar now that we know the text count
+ if (!quiet && allTextIds.size > 0) {
+ incrementalProgressBar = ProgressBarBuilder()
+ .setTaskName("${File(zips[0]).nameWithoutExtension.substringBefore('.')}.krill.tar")
+ .setInitialMax(allTextIds.size.toLong())
+ .setStyle(ProgressBarStyle.COLORFUL_UNICODE_BAR)
+ .setUpdateIntervalMillis(500)
+ .showSpeed()
+ .build()
+ }
+
+ // Start incremental writer thread if we have multiple texts
+ if (allTextIds.size > 1) {
+ startIncrementalWriterThread()
+ LOGGER.info("Enabled incremental output for ${allTextIds.size} texts")
+ }
// Start workers
repeat(maxThreads) {