Fix wrong text count
Change-Id: I49ad3fddfdfc574f3aca71a5c6dda154a4334bcc
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 671e70f..c32fee3 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1450,14 +1450,21 @@
val entriesByTextId = entries.groupBy { getTextIdFromPath(it.name) }
+ // Count only non-header entries for text count (header.xml is metadata, not a text)
+ val textCount = entries.count { !it.name.contains("header.xml") && (it.name.endsWith("data.xml") || it.name.endsWith("morpho.xml")) }
+
// Build inventory for this ZIP (used for old flow fallback and logging)
- zipInventory[zipPath] = entriesByTextId.keys.toMutableSet()
+ // Only include actual text IDs (not corpus/doc level header IDs)
+ zipInventory[zipPath] = entries
+ .filter { !it.name.contains("header.xml") }
+ .map { getTextIdFromPath(it.name) }
+ .toMutableSet()
// Use appropriate wording: base ZIP contains texts, annotation foundries have annotations on texts
if (zipFoundry == "base") {
- LOGGER.info(" $zipPath contains ${entriesByTextId.size} texts")
+ LOGGER.info(" $zipPath contains $textCount texts")
} else {
- LOGGER.info(" $zipPath has annotations on ${entriesByTextId.size} texts")
+ LOGGER.info(" $zipPath has annotations on $textCount texts")
}
FoundryData(zipFile, zipPath, zipFoundry, entriesByTextId)
} catch (e: Exception) {
@@ -1478,8 +1485,13 @@
executor.shutdown()
// Get all unique text IDs across all foundries, sorted
+ // Count only texts with data.xml (the actual text content)
val allTextIds = foundryDataList
- .flatMap { it.entriesByTextId.keys }
+ .flatMap { foundryData ->
+ foundryData.entriesByTextId
+ .filterValues { entries -> entries.any { it.name.endsWith("data.xml") } }
+ .keys
+ }
.toSet()
.sortedWith(this::compareTextIds)
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index 28f1db5..76b9b92 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -433,4 +433,41 @@
}
}
+    /**
+     * Regression test for the text count: converting `wud24_sample.zip` to Krill output
+     * must produce exactly three JSON documents, one per text in the sample.
+     *
+     * Each extracted document is additionally decompressed and checked for the
+     * `koral:corpus` type marker.
+     */
+    @Test
+    fun testCorrectTextCount() {
+        val baseZip = loadResource("wud24_sample.zip").path
+
+        val generatedTar = ensureKrillTar("text_count_test") { outputDir ->
+            arrayOf(
+                "-t", "krill",
+                "-D", outputDir.path,
+                baseZip
+            )
+        }
+
+        // Create the scratch directory atomically; the createTempFile/delete/mkdirs
+        // sequence is racy and silently ignores a failed mkdirs().
+        val extractDir = java.nio.file.Files.createTempDirectory("extract").toFile()
+        try {
+            val tarProcess = ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path)
+                .redirectErrorStream(true).start()
+            // Drain the merged stdout/stderr before waiting so tar cannot block on a full
+            // pipe, and so its diagnostics are available if extraction fails.
+            val tarOutput = tarProcess.inputStream.bufferedReader().use { it.readText() }
+            assertTrue(tarProcess.waitFor() == 0, "TAR extraction should succeed, tar output: $tarOutput")
+
+            val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
+
+            // wud24_sample.zip contains exactly 3 texts (based on data.xml files)
+            assertEquals(3, jsonFiles.size, "Should have exactly 3 JSON files for wud24_sample")
+
+            // Verify we can read each JSON. Decompress in-process rather than spawning
+            // gunzip: no dependency on an external binary, no unchecked exit status,
+            // and the stream is always closed.
+            jsonFiles.forEach { jsonFile ->
+                val jsonContent = jsonFile.inputStream().use { raw ->
+                    java.util.zip.GZIPInputStream(raw).bufferedReader().readText()
+                }
+
+                assertTrue(jsonContent.contains("\"@type\":\"koral:corpus\""),
+                    "Each JSON should be a valid Krill corpus document")
+            }
+        } finally {
+            extractDir.deleteRecursively()
+        }
+    }
}