Read ZIP contents in parallel

Change-Id: Iae250e6c200b7229779542229382f6fc07ee12a3

commit: b447a8b1d24bd1c639733a6dbd3196db9a8a325c [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Nov 10 18:33:07 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Mon Nov 10 18:33:07 2025 +0100
tree: 9fd9ca6b303006bb9d5db4d8c8318c7185b5aec0
parent: 7397838726cb5855998200a8b172a48fc8261b77 [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 959fda6..9f7723d 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -3039,46 +3039,63 @@
                 .build()
         } else null
 
-        val dbFactory = DocumentBuilderFactory.newInstance()
-        val dBuilder = dbFactory.newDocumentBuilder()
+        // Scan ZIPs in parallel for faster startup
+        val scanParallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)
+        val executor = java.util.concurrent.Executors.newFixedThreadPool(scanParallelism)
 
-        zipPaths.forEach { zipPath ->
-            val textsInThisZip = mutableSetOf<String>()
-            LOGGER.info("Scanning $zipPath...")
+        try {
+            val futures = zipPaths.map { zipPath ->
+                executor.submit<Pair<String, MutableSet<String>>> {
+                    val textsInThisZip = mutableSetOf<String>()
+                    LOGGER.info("Scanning $zipPath...")
 
-            try {
-                ApacheZipFile(File(zipPath)).use { zipFile ->
-                    val entries = zipFile.entries
-                    while (entries.hasMoreElements()) {
-                        val entry = entries.nextElement()
-                        // Look for data.xml or tokens.xml to identify texts
-                        if (entry.name.matches(Regex(".*(data|tokens)\\.xml$"))) {
-                            try {
-                                // Parse XML to extract docId attribute
-                                val doc = zipFile.getInputStream(entry).use { inputStream ->
-                                    XMLCommentFilterReader(inputStream, "UTF-8").use { reader ->
-                                        dBuilder.parse(InputSource(reader))
+                    try {
+                        val dbFactory = DocumentBuilderFactory.newInstance()
+                        val dBuilder = dbFactory.newDocumentBuilder()
+
+                        ApacheZipFile(File(zipPath)).use { zipFile ->
+                            val entries = zipFile.entries
+                            while (entries.hasMoreElements()) {
+                                val entry = entries.nextElement()
+                                // Look for data.xml or tokens.xml to identify texts
+                                if (entry.name.matches(Regex(".*(data|tokens)\\.xml$"))) {
+                                    try {
+                                        // Parse XML to extract docId attribute
+                                        val doc = zipFile.getInputStream(entry).use { inputStream ->
+                                            XMLCommentFilterReader(inputStream, "UTF-8").use { reader ->
+                                                dBuilder.parse(InputSource(reader))
+                                            }
+                                        }
+                                        doc.documentElement.normalize()
+                                        val docId = doc.documentElement.getAttribute("docid")
+                                        if (docId.isNotEmpty()) {
+                                            textsInThisZip.add(docId)
+                                        }
+                                    } catch (e: Exception) {
+                                        // Skip entries that can't be parsed
+                                        LOGGER.fine("Skipped entry ${entry.name}: ${e.message}")
                                     }
                                 }
-                                doc.documentElement.normalize()
-                                val docId = doc.documentElement.getAttribute("docid")
-                                if (docId.isNotEmpty()) {
-                                    textsInThisZip.add(docId)
-                                }
-                            } catch (e: Exception) {
-                                // Skip entries that can't be parsed
-                                LOGGER.fine("Skipped entry ${entry.name}: ${e.message}")
                             }
                         }
+                        LOGGER.info("  $zipPath contains ${textsInThisZip.size} texts")
+                    } catch (e: Exception) {
+                        LOGGER.warning("Failed to scan $zipPath: ${e.message}")
                     }
+
+                    scanProgressBar?.step()
+                    Pair(zipPath, textsInThisZip)
                 }
-                zipInventory[zipPath] = textsInThisZip
-                LOGGER.info("  $zipPath contains ${textsInThisZip.size} texts")
-            } catch (e: Exception) {
-                LOGGER.warning("Failed to scan $zipPath: ${e.message}")
             }
 
-            scanProgressBar?.step()
+            // Collect results
+            futures.forEach { future ->
+                val (zipPath, texts) = future.get()
+                zipInventory[zipPath] = texts
+            }
+        } finally {
+            executor.shutdown()
+            executor.awaitTermination(1, java.util.concurrent.TimeUnit.HOURS)
         }
 
         scanProgressBar?.close()
commit	b447a8b1d24bd1c639733a6dbd3196db9a8a325c	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Nov 10 18:33:07 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Nov 10 18:33:07 2025 +0100
tree	9fd9ca6b303006bb9d5db4d8c8318c7185b5aec0
parent	7397838726cb5855998200a8b172a48fc8261b77 [diff]