Fix wrong text count

Change-Id: I49ad3fddfdfc574f3aca71a5c6dda154a4334bcc
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 671e70f..c32fee3 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1450,14 +1450,21 @@
                         
                         val entriesByTextId = entries.groupBy { getTextIdFromPath(it.name) }
                         
+                        // Count data.xml/morpho.xml entries as texts (header.xml is metadata, not a text)
+                        val textCount = entries.count { !it.name.contains("header.xml") && (it.name.endsWith("data.xml") || it.name.endsWith("morpho.xml")) }
+                        
                         // Build inventory for this ZIP (used for old flow fallback and logging)
-                        zipInventory[zipPath] = entriesByTextId.keys.toMutableSet()
+                        // Only include actual text IDs (not corpus/doc level header IDs)
+                        zipInventory[zipPath] = entries
+                            .filter { !it.name.contains("header.xml") }
+                            .map { getTextIdFromPath(it.name) }
+                            .toMutableSet()
                         
                         // Use appropriate wording: base ZIP contains texts, annotation foundries have annotations on texts
                         if (zipFoundry == "base") {
-                            LOGGER.info("  $zipPath contains ${entriesByTextId.size} texts")
+                            LOGGER.info("  $zipPath contains $textCount texts")
                         } else {
-                            LOGGER.info("  $zipPath has annotations on ${entriesByTextId.size} texts")
+                            LOGGER.info("  $zipPath has annotations on $textCount texts")
                         }
                         FoundryData(zipFile, zipPath, zipFoundry, entriesByTextId)
                     } catch (e: Exception) {
@@ -1478,8 +1485,13 @@
             executor.shutdown()
             
             // Get all unique text IDs across all foundries, sorted
+            // Keep only text IDs that have a data.xml entry (the actual text content)
             val allTextIds = foundryDataList
-                .flatMap { it.entriesByTextId.keys }
+                .flatMap { foundryData ->
+                    foundryData.entriesByTextId
+                        .filterValues { entries -> entries.any { it.name.endsWith("data.xml") } }
+                        .keys
+                }
                 .toSet()
                 .sortedWith(this::compareTextIds)
             
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index 28f1db5..76b9b92 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -433,4 +433,41 @@
         }
     }
 
+    /** Regression test: Krill output for wud24_sample.zip must hold exactly one JSON document per text. */
+    @Test
+    fun testCorrectTextCount() {
+        val baseZip = loadResource("wud24_sample.zip").path
+
+        val generatedTar = ensureKrillTar("text_count_test") { outputDir ->
+            arrayOf(
+                "-t", "krill",
+                "-D", outputDir.path,
+                baseZip
+            )
+        }
+
+        // createTempDirectory creates the directory atomically (no delete()/mkdirs() race on a temp file name)
+        val extractDir = java.nio.file.Files.createTempDirectory("extract").toFile()
+        try {
+            val tarProcess = ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path)
+                .redirectErrorStream(true).start()
+            assertTrue(tarProcess.waitFor() == 0, "TAR extraction should succeed")
+
+            val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
+
+            // wud24_sample.zip contains exactly 3 texts (based on data.xml files)
+            assertEquals(3, jsonFiles.size, "Should have exactly 3 JSON files for wud24_sample")
+
+            // Decompress in-process: no external gunzip binary needed, and the stream is always closed
+            jsonFiles.forEach { jsonFile ->
+                val jsonContent = java.util.zip.GZIPInputStream(jsonFile.inputStream())
+                    .bufferedReader().use { it.readText() }
+
+                assertTrue(jsonContent.contains("\"@type\":\"koral:corpus\""),
+                    "Each JSON should be a valid Krill corpus document")
+            }
+        } finally {
+            extractDir.deleteRecursively()
+        }
+    }
 }