Fix ZIP scanning

Change-Id: Ic0eab7017fab5199023f4d3dc1ef2f1a5e0809ed

commit: 8749d1af4c8b7ae5d9619ccade59dd9469a8ecdb [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Tue Nov 18 13:13:04 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Tue Nov 18 13:49:15 2025 +0100
tree: bd4f30d51da1264f5dad44e687c3af9e8da0f2b5
parent: 9708a405612f5f5ae049b49e06aae857bb3cccfd [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 67e90d3..01cdff5 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -3823,16 +3823,6 @@
         LOGGER.info("Building per-ZIP inventory to track text completeness...")
         zipInventory.clear()
 
-        // Show progress bar for ZIP scanning phase
-        val scanProgressBar = if (!quiet && zipPaths.size > 1) {
-            ProgressBarBuilder()
-                .setTaskName("Scanning ZIPs")
-                .setInitialMax(zipPaths.size.toLong())
-                .setStyle(ProgressBarStyle.COLORFUL_UNICODE_BAR)
-                .setUpdateIntervalMillis(500)
-                .build()
-        } else null
-
         // Scan ZIPs in parallel for faster startup
         val scanParallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)
         val executor = java.util.concurrent.Executors.newFixedThreadPool(scanParallelism)
@@ -3848,38 +3838,35 @@
                     val isAnnotationFoundry = zipName.matches(Regex(".*\\.[^/.]+\\.zip$"))
 
                     try {
-                        // Use thread-local DocumentBuilder
-                        val dBuilder = threadLocalBuilder.get()
-
                         openZipFile(zipPath).use { zipFile ->
                             val entries = zipFile.entries
+                            
+                            // For base ZIPs: only count data.xml (the actual text content)
+                            // For annotation foundries: count their annotation files
+                            val pattern = if (isAnnotationFoundry) {
+                                Regex(".*/(?:morpho|dependency|constituency)\\.xml$")
+                            } else {
+                                Regex(".*/data\\.xml$")
+                            }
+                            
+                            // Extract docId from path instead of parsing XML
+                            // Base ZIP path: CORPUS/DOC/TEXT/data.xml
+                            // Annotation ZIP path: CORPUS/DOC/TEXT/foundry/dependency.xml
+                            val pathPattern = if (isAnnotationFoundry) {
+                                Regex("([^/]+)/([^/]+)/([^/]+)/[^/]+/(?:morpho|dependency|constituency)\\.xml$")
+                            } else {
+                                Regex("([^/]+)/([^/]+)/([^/]+)/data\\.xml$")
+                            }
+                            
                             while (entries.hasMoreElements()) {
                                 val entry = entries.nextElement()
-                                // For base ZIPs: look for data.xml or tokens.xml
-                                // For annotation foundries: also look for morpho.xml or dependency.xml
-                                val pattern = if (isAnnotationFoundry) {
-                                    Regex(".*(data|tokens|morpho|dependency)\\.xml$")
-                                } else {
-                                    Regex(".*(data|tokens)\\.xml$")
-                                }
-
+                                
                                 if (entry.name.matches(pattern)) {
-                                    try {
-                                        dBuilder.reset()
-                                        // Parse XML to extract docId attribute
-                                        val doc = zipFile.getInputStream(entry).use { inputStream ->
-                                            XMLCommentFilterReader(inputStream, "UTF-8").use { reader ->
-                                                dBuilder.parse(InputSource(reader))
-                                            }
-                                        }
-                                        doc.documentElement.normalize()
-                                        val docId = doc.documentElement.getAttribute("docid")
-                                        if (docId.isNotEmpty()) {
-                                            textsInThisZip.add(docId)
-                                        }
-                                    } catch (e: Exception) {
-                                        // Skip entries that can't be parsed
-                                        LOGGER.fine("Skipped entry ${entry.name}: ${e.message}")
+                                    val matchResult = pathPattern.find(entry.name)
+                                    if (matchResult != null) {
+                                        val (corpus, doc, text) = matchResult.destructured
+                                        val docId = "${corpus}_${doc}.${text}"
+                                        textsInThisZip.add(docId)
                                     }
                                 }
                             }
@@ -3895,7 +3882,6 @@
                         LOGGER.warning("Failed to scan $zipPath: ${e.message}")
                     }
 
-                    scanProgressBar?.step()
                     Pair(zipPath, textsInThisZip)
                 }
             }
@@ -3910,8 +3896,6 @@
             executor.awaitTermination(1, java.util.concurrent.TimeUnit.HOURS)
         }
 
-        scanProgressBar?.close()
-
         LOGGER.info("ZIP inventory built: ${zipPaths.size} ZIPs scanned")
         // Calculate total unique texts
         val allTexts = zipInventory.values.flatten().toSet()
commit	8749d1af4c8b7ae5d9619ccade59dd9469a8ecdb	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Nov 18 13:13:04 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Nov 18 13:49:15 2025 +0100
tree	bd4f30d51da1264f5dad44e687c3af9e8da0f2b5
parent	9708a405612f5f5ae049b49e06aae857bb3cccfd [diff]