Fix missing metadata inherited from corpus and document

Resolves #22

Change-Id: I4cca99133caaba95daf60a73183697f97d3444ed

commit: e260f9140866f5f7cef64e42eea6d546b49c36fb [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Wed Dec 17 18:40:01 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Dec 17 20:06:15 2025 +0100
tree: 57f25691a2df0a9d8c7e5aee8b2255e673e7a53d
parent: 06511f111e36d3ec4b69ad6f4881033a7ad5611b [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 7c70feb..10e71cf 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -1778,6 +1778,42 @@
                 expectedFoundriesPerText[textId] = foundriesForThisText
             }
             
+            // CRITICAL: Process all headers FIRST to populate corpus/doc/text metadata
+            // Headers must be processed before text entries so metadata is available for inheritance
+            LOGGER.info("Processing headers for metadata collection...")
+            foundryDataList.forEach { foundryData ->
+                val allEntries = foundryData.entriesByTextId.values.flatten()
+                val headerEntries = allEntries.filter { it.name.contains("header.xml") }
+                
+                headerEntries.forEach { headerEntry ->
+                    try {
+                        val headerBytes = foundryData.zipFile.getInputStream(headerEntry).readBytes()
+                        val headerDoc = safeDomFactory.newDocumentBuilder().parse(ByteArrayInputStream(headerBytes))
+                        val headerRoot = headerDoc.documentElement
+                        headerRoot.normalize()
+
+                        val textSigle = headerRoot.firstText("textSigle")
+                        val docSigle = headerRoot.firstText("dokumentSigle")
+                        val corpusSigle = headerRoot.firstText("korpusSigle")
+                        val docId = textSigle?.replace('/', '_')
+
+                        // Call appropriate metadata collection function based on what the header contains
+                        if (corpusSigle != null) {
+                            collectCorpusMetadata(corpusSigle, headerRoot)
+                        }
+                        if (docSigle != null) {
+                            collectDocMetadata(docSigle, headerRoot)
+                        }
+                        if (docId != null) {
+                            collectKrillMetadata(docId, headerRoot)
+                        }
+                    } catch (e: Exception) {
+                        LOGGER.warning("Error processing header ${headerEntry.name}: ${e.message}")
+                    }
+                }
+            }
+            LOGGER.info("Completed header processing for metadata")
+            
             // Submit tasks in text-ID order, cycling through all foundries for each text
             allTextIds.forEach { textId ->
                 foundryDataList.forEach { foundryData ->
@@ -2559,6 +2595,16 @@
                     }
 
                     "tokens.xml" -> {
+                        // For krill format, set tokenSource based on the foundry containing the tokens
+                        if (outputFormat == OutputFormat.KRILL && foundry == "base") {
+                            val textData = krillData.getOrPut(docId) { 
+                                KrillJsonGenerator.KrillTextData(textId = docId) 
+                            }
+                            // Extract foundry from path like "CORPUS/DOC/TEXT/base/tokens.xml" -> "base"
+                            val pathParts = zipEntry.name.split('/')
+                            val tokensFoundry = pathParts.getOrNull(pathParts.size - 2) ?: "base"
+                            textData.headerMetadata["tokenSource"] = "${tokensFoundry}#tokens"
+                        }           
                         if (!fnames.contains(docId)) {
                             fnames[docId] = zipEntry.name
                         }
@@ -2723,12 +2769,17 @@
 
                 val docId = textSigle?.replace('/', '_')
                 LOGGER.fine("Processing header file: " + zipEntry.name + " docId: " + docId + " corpusSigle: " + corpusSigle + " docSigle: " + docSigle)
-
                 if (outputFormat == OutputFormat.KRILL) {
-                    when {
-                        corpusSigle != null -> collectCorpusMetadata(corpusSigle, headerRoot)
-                        docSigle != null -> collectDocMetadata(docSigle, headerRoot)
-                        docId != null -> collectKrillMetadata(docId, headerRoot)
+                    // Collect metadata at the appropriate level(s)
+                    // Note: corpus, doc, and text headers each have their own sigle fields
+                    if (corpusSigle != null) {
+                        collectCorpusMetadata(corpusSigle, headerRoot)
+                    }
+                    if (docSigle != null) {
+                        collectDocMetadata(docSigle, headerRoot)
+                    }
+                    if (docId != null) {
+                        collectKrillMetadata(docId, headerRoot)
                     }
                 }
 
@@ -2807,6 +2858,15 @@
                     if (!fnames.contains(docId)) fnames[docId] = zipEntry.name
                     tokens[docId] = extractSpansStax(reader, docId)
                     if (outputFormat == OutputFormat.KRILL && foundry == "base") {
+                        // Set tokenSource based on the foundry containing the tokens
+                        // Extract foundry from path like "CORPUS/DOC/TEXT/base/tokens.xml" -> "base"
+                        val textData = krillData.getOrPut(docId) { 
+                            KrillJsonGenerator.KrillTextData(textId = docId) 
+                        }
+                        val pathParts = zipEntry.name.split('/')
+                        val tokensFoundry = pathParts.getOrNull(pathParts.size - 2) ?: "base"
+                        textData.headerMetadata["tokenSource"] = "${tokensFoundry}#tokens"
+                        
                         collectKrillBaseData(docId)
                     }
                 }
@@ -4805,6 +4865,13 @@
             headerRoot.firstElement("ref") { it.getAttribute("type") == "page_url" }
                 ?.getAttribute("target")?.takeIf { it.isNotBlank() }?.let { metadata["externalLink"] = it }
 
+            // Extract textExternalLinks from biblNote[@n='url']
+            val biblNoteUrl = analytic.firstElement("biblNote") { it.getAttribute("n") == "url" }
+                ?.textContent?.trim()?.takeIf { it.isNotEmpty() }
+                ?: monogr.firstElement("biblNote") { it.getAttribute("n") == "url" }
+                    ?.textContent?.trim()?.takeIf { it.isNotEmpty() }
+            metadata.putIfNotBlank("textExternalLinks", biblNoteUrl)
+
             if (!metadata.containsKey("language")) {
                 metadata["language"] = "de"
             }
@@ -4834,7 +4901,9 @@
             metadata.putIfNotBlank("corpusEditor", headerRoot.firstElement("monogr").firstText("editor"))
             metadata.putIfNotBlank("publisher", headerRoot.firstText("publisher"))
             metadata.putIfNotBlank("distributor", headerRoot.firstText("distributor"))
+            metadata.putIfNotBlank("pubPlace", headerRoot.firstText("pubPlace"))
             metadata.putIfNotBlank("textType", headerRoot.firstElement("textDesc").firstText("textType"))
+            
             LOGGER.fine("Collected ${metadata.size} corpus-level metadata fields for $corpusSigle")
         }
     }
@@ -5161,17 +5230,43 @@
                 val corpusSigle = textId.substringBefore('_')
                 val docSigle = textId.substringBeforeLast('.')
 
-                // Apply corpus-level metadata
+                // Apply corpus-level metadata (only if not already set with a non-empty value)
                 corpusMetadata[corpusSigle]?.forEach { (key, value) ->
-                    if (!textData.headerMetadata.containsKey(key)) {
-                        textData.headerMetadata[key] = value
+                    val currentValue = textData.headerMetadata[key]
+                    // Inherit if: key doesn't exist, OR current value is empty/blank
+                    val shouldInherit = when (currentValue) {
+                        null -> true
+                        is String -> currentValue.isBlank()
+                        else -> false
+                    }
+                    if (shouldInherit && value != null) {
+                        // Only set non-empty values
+                        when (value) {
+                            is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+                            is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                            is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                            else -> textData.headerMetadata[key] = value
+                        }
                     }
                 }
 
-                // Apply doc-level metadata
+                // Apply doc-level metadata (only if not already set with a non-empty value)
                 docMetadata[docSigle]?.forEach { (key, value) ->
-                    if (!textData.headerMetadata.containsKey(key)) {
-                        textData.headerMetadata[key] = value
+                    val currentValue = textData.headerMetadata[key]
+                    // Inherit if: key doesn't exist, OR current value is empty/blank
+                    val shouldInherit = when (currentValue) {
+                        null -> true
+                        is String -> currentValue.isBlank()
+                        else -> false
+                    }
+                    if (shouldInherit && value != null) {
+                        // Only set non-empty values
+                        when (value) {
+                            is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+                            is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                            is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                            else -> textData.headerMetadata[key] = value
+                        }
                     }
                 }
 

diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index 20767ef..d5a7ca4 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt

@@ -100,17 +100,39 @@
         val corpusSigle = textIdWithSlashes.split("/")[0]
         val docSigle = textIdWithSlashes.split("/").take(2).joinToString("/")
 
-        // First apply corpus-level metadata (lowest priority)
+        // First apply corpus-level metadata (lowest priority) - only if not already set with non-empty value
         corpusMetadata[corpusSigle]?.forEach { (key, value) ->
-            if (!textData.headerMetadata.containsKey(key)) {
-                textData.headerMetadata[key] = value
+            val currentValue = textData.headerMetadata[key]
+            val shouldInherit = when (currentValue) {
+                null -> true
+                is String -> currentValue.isBlank()
+                else -> false
+            }
+            if (shouldInherit && value != null) {
+                when (value) {
+                    is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+                    is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                    is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                    else -> textData.headerMetadata[key] = value
+                }
             }
         }
 
-        // Then apply doc-level metadata (medium priority)
+        // Then apply doc-level metadata (medium priority) - only if not already set with non-empty value
         docMetadata[docSigle]?.forEach { (key, value) ->
-            if (!textData.headerMetadata.containsKey(key)) {
-                textData.headerMetadata[key] = value
+            val currentValue = textData.headerMetadata[key]
+            val shouldInherit = when (currentValue) {
+                null -> true
+                is String -> currentValue.isBlank()
+                else -> false
+            }
+            if (shouldInherit && value != null) {
+                when (value) {
+                    is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+                    is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                    is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+                    else -> textData.headerMetadata[key] = value
+                }
             }
         }
 
@@ -122,7 +144,8 @@
             "creationDate", "pubDate", "textClass", "award", "availability", "language",
             "ISBN", "URN", "pubPlace", "pubPlaceKey",
             "textType", "textTypeArt", "textTypeRef", "textDomain", "textColumn",
-            "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle"
+            "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor",
+            "textExternalLinks", "tokenSource"
         )
 
         fieldOrder.forEach { key ->
@@ -179,7 +202,7 @@
                         "type:attachement" to jsonString("data:,${value.toString()}")
                     }
                 }
-                "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle" -> {
+                "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor" -> {
                     "type:text" to jsonString(value.toString())
                 }
                 "externalLink" -> {
@@ -189,6 +212,16 @@
                     val encodedUrl = url.replace(":", "%3A").replace("/", "%2F")
                     "type:attachement" to jsonString("data:application/x.korap-link;title=$title,$encodedUrl")
                 }
+                "textExternalLinks" -> {
+                    val url = value.toString()
+                    val title = textData.headerMetadata["publisher"]?.toString() ?: "Link"
+                    val encodedUrl = url.replace(":", "%3A").replace("/", "%2F")
+                    "type:attachement" to jsonString("data:application/x.korap-link;title=$title,$encodedUrl")
+                }
+                "tokenSource" -> {
+                    // tokenSource is a string foundry reference like "base#tokens"
+                    "type:string" to jsonString(value.toString())
+                }
                 else -> {
                     // corpusEditor, distributor, editor, reference - all ATTACHMENT
                     "type:attachement" to jsonString("data:,${value.toString()}")

diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index 900708c..53048b0 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt

@@ -12,6 +12,7 @@
 import kotlin.test.assertEquals
 import kotlin.test.assertTrue
 import kotlin.test.assertFalse
+import kotlin.test.assertNotNull
 
 /**
  * Tests for Krill JSON format output (-t krill)
@@ -698,6 +699,72 @@
             extractDir.deleteRecursively()
         }
     }
+
+    @Test
+    fun testKrillMetadataInheritance() {
+        // Test for GitHub issue #22: Ensure all metadata fields are correctly extracted and inherited
+        val ndySample = loadResource("ndy_sample.zip").path
+        
+        val generatedTar = ensureKrillTar("ndy_metadata_test", "ndy_sample.krill.tar") { outputDir ->
+            arrayOf(
+                "-t", "krill",
+                "-q",
+                "-D", outputDir.path,
+                ndySample
+            )
+        }
+
+        val kotlinJsons = readKrillJson(generatedTar)
+        assertTrue(kotlinJsons.isNotEmpty(), "Should have generated Krill JSON files from NDY sample")
+
+        // Test specific metadata fields that were previously missing (GitHub issue #22)
+        val requiredFields = mapOf(
+            "corpusTitle" to "Nottinghamer Korpus Deutscher YouTube-Sprache",
+            "docTitle" to "Info Video",
+            "docAuthor" to "User_A",  // Anonymized
+            "distributor" to "Institut für Deutsche Sprache",
+            "pubPlace" to "San Bruno, California",
+            "textExternalLinks" to "youtube.googleapis.com",  // Partial match for URL
+            "tokenSource" to "base#tokens"
+        )
+
+        // Test on one of the documents (NDY/115/005255)
+        val testDocId = "NDY-115-005255.json"
+        assertTrue(kotlinJsons.containsKey(testDocId), "Should have JSON for test document $testDocId")
+        
+        val testJson = kotlinJsons.getValue(testDocId)
+        
+        requiredFields.forEach { (fieldName, expectedValue) ->
+            assertTrue(
+                testJson.contains("\"$fieldName\""),
+                "JSON should contain field: $fieldName"
+            )
+            assertTrue(
+                testJson.contains(expectedValue),
+                "Field $fieldName should contain value: $expectedValue"
+            )
+        }
+
+        // Verify corpus-level metadata inheritance works
+        // pubPlace should be inherited from corpus level (not empty from text level)
+        val pubPlaceMatch = Regex(""""key"\s*:\s*"pubPlace".*?"value"\s*:\s*"([^"]+)"""").find(testJson)
+        assertNotNull(pubPlaceMatch, "Should find pubPlace field")
+        assertEquals(
+            "San Bruno, California",
+            pubPlaceMatch.groupValues[1],
+            "pubPlace should be inherited from corpus level"
+        )
+
+        // Verify tokenSource is dynamically extracted from tokens.xml path
+        val tokenSourceMatch = Regex(""""key"\s*:\s*"tokenSource".*?"value"\s*:\s*"([^"]+)"""").find(testJson)
+        assertNotNull(tokenSourceMatch, "Should find tokenSource field")
+        assertEquals(
+            "base#tokens",
+            tokenSourceMatch.groupValues[1],
+            "tokenSource should be extracted from foundry path"
+        )
+    }
+
     /**
      * Regression test for GitHub issue #21: Missing base/s:p paragraph spans
      * 

diff --git a/app/src/test/resources/ndy_sample.zip b/app/src/test/resources/ndy_sample.zip
new file mode 100644
index 0000000..5167d62
--- /dev/null
+++ b/app/src/test/resources/ndy_sample.zip
Binary files differ
commit	e260f9140866f5f7cef64e42eea6d546b49c36fb	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Dec 17 18:40:01 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Dec 17 20:06:15 2025 +0100
tree	57f25691a2df0a9d8c7e5aee8b2255e673e7a53d
parent	06511f111e36d3ec4b69ad6f4881033a7ad5611b [diff]