Fix missing paragraph spans
Resolves #21
Change-Id: If60c4ac4132f5a43a3aec5af3390761af8f51922
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 47d22e4..7c70feb 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -4433,11 +4433,15 @@
}
synchronized(textData) {
- // Only collect if not already collected (avoid duplicates from multiple ZIP processing)
- if (textData.structureSpans.isNotEmpty()) {
- LOGGER.fine("Structure spans already collected for $docId, skipping")
- return
+ // Only clear dereko structure spans when re-collecting (preserve external foundry spans)
+ // This allows multiple ZIPs to be processed without losing annotation layer structure
+ val nonDerekoSpans = textData.structureSpans.filter { !it.layer.startsWith("dereko/") }
+ if (textData.structureSpans.size > nonDerekoSpans.size) {
+ LOGGER.fine("Clearing ${textData.structureSpans.size - nonDerekoSpans.size} dereko structure spans for $docId before re-collecting")
+ textData.structureSpans.clear()
+ textData.structureSpans.addAll(nonDerekoSpans)
}
+
for (i in 0 until spans.length) {
val span = spans.item(i) as? Element ?: continue
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index 11f26e8..20767ef 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -415,12 +415,16 @@
// Add base structure spans (sentences, paragraphs, text)
val baseStructureSpans = mutableListOf<StructureSpan>()
+ // Determine the actual end of text from raw tokens (before filtering)
+ // This ensures base spans cover the full text including punctuation
+ val textEnd = if (rawTokens.isNotEmpty()) rawTokens.last().to else 0
+
// Add text span covering entire document (from start of text to end, tokenTo is exclusive)
if (tokens.isNotEmpty()) {
baseStructureSpans.add(StructureSpan(
layer = "base/s:t",
from = 0, // Start at beginning of text
- to = tokens.last().to,
+ to = textEnd, // Use raw tokens to include all punctuation
tokenFrom = 0,
tokenTo = tokens.size, // Exclusive end: one past last token index
depth = 0,
@@ -428,6 +432,21 @@
))
}
+ // Create a base/s:p span (depth 1) for each paragraph span found in textData.structureSpans.
+ // NOTE(review): the filter below matches any layer ending in ":p", not only dereko/s:p - confirm this is intended.
+ textData.structureSpans.filter { it.layer.endsWith(":p") }.forEach { derekoP ->
+ baseStructureSpans.add(StructureSpan(
+ layer = "base/s:p",
+ from = derekoP.from,
+ to = derekoP.to,
+ tokenFrom = -1, // Will be resolved later
+ tokenTo = -1,
+ depth = 1,
+ attributes = emptyMap()
+ ))
+ }
+
+
// Build token-to-sentence map for ROOT edge generation
data class SentenceInfo(val from: Int, val to: Int, val tokenFrom: Int, val tokenTo: Int)
val tokenToSentence = mutableMapOf<Int, SentenceInfo>()
@@ -498,8 +517,9 @@
spansByToken.getOrPut(span.tokenFrom) { mutableListOf() }.add(span)
}
- // Count paragraph spans (name="p")
- val paragraphCount = allStructureSpans.count { it.layer.endsWith(":p") }
+ // Count paragraph spans (name="p") from the original document structure only.
+ // Don't count the base/s:p spans we added programmatically.
+ val paragraphCount = textData.structureSpans.count { it.layer.endsWith(":p") }
val sentenceCountsByFoundry = resolvedStructureSpans.sentenceCountsByFoundry()
val externalSentenceCounts = sentenceCountsByFoundry.entries
.filter { (foundry, _) -> foundry !in BASE_STRUCTURE_FOUNDRIES }
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index d61187c..900708c 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -698,4 +698,71 @@
extractDir.deleteRecursively()
}
}
+ /**
+  * Regression test for GitHub issue #21: missing base/s:p paragraph spans.
+  *
+  * Builds Krill output from ndy_sample.zip and checks that base/s:p spans are present and that,
+  * per document, their number equals the -:base/paragraphs metadata value.
+  */
+ @Test
+ fun testBaseParagraphSpansPresent() {
+     val ndySample = loadResource("ndy_sample.zip").path
+
+     val generatedTar = ensureKrillTar("ndy_base_paragraph_test", "ndy_sample.krill.tar") { outputDir ->
+         arrayOf(
+             "-t", "krill",
+             "-q",
+             "-D", outputDir.path,
+             ndySample
+         )
+     }
+
+     val kotlinJsons = readKrillJson(generatedTar)
+     assertTrue(kotlinJsons.isNotEmpty(), "Should have generated Krill JSON files from NDY sample")
+
+     // Test NDY/266/006701 - a document that should have base/s:p spans
+     val testDoc266 = "NDY-266-006701.json"
+     assertTrue(kotlinJsons.containsKey(testDoc266), "Should have JSON for test document $testDoc266")
+
+     val testJson266 = kotlinJsons.getValue(testDoc266)
+
+     // Verify the specific base/s:p span from issue #21 is present
+     // "<>:base/s:p$<b>64<i>0<i>1<i>1<b>1"
+     assertTrue(
+         testJson266.contains("<>:base/s:p\$"),
+         "JSON should contain base/s:p span marker"
+     )
+     assertTrue(
+         testJson266.contains("<>:base/s:p\$<b>64<i>0<i>1<i>1<b>1"),
+         "NDY-266-006701 should contain the specific base/s:p span from issue #21: '<>:base/s:p\$<b>64<i>0<i>1<i>1<b>1'"
+     )
+
+     // Test NDY/115/005255 - another document with paragraphs
+     val testDoc115 = "NDY-115-005255.json"
+     assertTrue(kotlinJsons.containsKey(testDoc115), "Should have JSON for test document $testDoc115")
+
+     val testJson115 = kotlinJsons.getValue(testDoc115)
+     assertTrue(
+         testJson115.contains("<>:base/s:p\$"),
+         "NDY-115-005255 should also contain base/s:p spans"
+     )
+
+     // Verify paragraph count metadata matches the number of base/s:p spans.
+     // Both patterns are compiled once here rather than once per document.
+     val paragraphMetaRegex = Regex("""-:base/paragraphs\$<i>(\d+)""")
+     val baseParagraphRegex = Regex("""<>:base/s:p\$""")
+     var checkedDocs = 0
+     kotlinJsons.forEach { (docId, json) ->
+         // Documents without the metadata entry are skipped
+         val paragraphCount = paragraphMetaRegex.find(json)?.groupValues?.get(1)?.toInt() ?: return@forEach
+         val basePCount = baseParagraphRegex.findAll(json).count()
+         checkedDocs++
+         assertEquals(
+             paragraphCount,
+             basePCount,
+             "Document $docId: Number of base/s:p spans ($basePCount) should match paragraph count metadata ($paragraphCount)"
+         )
+     }
+     assertTrue(checkedDocs > 0, "At least one document should carry -:base/paragraphs metadata")
+ }
}