Fix missing metadata inherited from corpus and document
Resolves #22
Change-Id: I4cca99133caaba95daf60a73183697f97d3444ed
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 7c70feb..10e71cf 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1778,6 +1778,42 @@
expectedFoundriesPerText[textId] = foundriesForThisText
}
+ // CRITICAL: parse every header.xml BEFORE any text entry is submitted, so that the
+ // corpus/doc/text metadata maps are already filled when texts inherit from them.
+ LOGGER.info("Processing headers for metadata collection...")
+ foundryDataList.forEach { foundryData ->
+ val allEntries = foundryData.entriesByTextId.values.flatten()
+ val headerEntries = allEntries.filter { it.name.contains("header.xml") }
+
+ headerEntries.forEach { headerEntry ->
+ try {
+ val headerBytes = foundryData.zipFile.getInputStream(headerEntry).use { stream -> stream.readBytes() } // readBytes() alone leaves the entry stream open
+ val headerDoc = safeDomFactory.newDocumentBuilder().parse(ByteArrayInputStream(headerBytes))
+ val headerRoot = headerDoc.documentElement
+ headerRoot.normalize()
+
+ val textSigle = headerRoot.firstText("textSigle")
+ val docSigle = headerRoot.firstText("dokumentSigle")
+ val corpusSigle = headerRoot.firstText("korpusSigle")
+ val docId = textSigle?.replace('/', '_')
+
+ // Independent checks (not a when): every sigle present in this header triggers its own collector
+ if (corpusSigle != null) {
+ collectCorpusMetadata(corpusSigle, headerRoot)
+ }
+ if (docSigle != null) {
+ collectDocMetadata(docSigle, headerRoot)
+ }
+ if (docId != null) {
+ collectKrillMetadata(docId, headerRoot)
+ }
+ } catch (e: Exception) {
+ LOGGER.warning("Error processing header ${headerEntry.name}: ${e.message}")
+ }
+ }
+ }
+ LOGGER.info("Completed header processing for metadata")
+
// Submit tasks in text-ID order, cycling through all foundries for each text
allTextIds.forEach { textId ->
foundryDataList.forEach { foundryData ->
@@ -2559,6 +2595,16 @@
}
"tokens.xml" -> {
+ // For krill format, set tokenSource based on the foundry containing the tokens
+ if (outputFormat == OutputFormat.KRILL && foundry == "base") {
+ val textData = krillData.getOrPut(docId) {
+ KrillJsonGenerator.KrillTextData(textId = docId)
+ }
+ // Extract foundry from path like "CORPUS/DOC/TEXT/base/tokens.xml" -> "base"
+ val pathParts = zipEntry.name.split('/')
+ val tokensFoundry = pathParts.getOrNull(pathParts.size - 2) ?: "base"
+ textData.headerMetadata["tokenSource"] = "${tokensFoundry}#tokens"
+ }
if (!fnames.contains(docId)) {
fnames[docId] = zipEntry.name
}
@@ -2723,12 +2769,17 @@
val docId = textSigle?.replace('/', '_')
LOGGER.fine("Processing header file: " + zipEntry.name + " docId: " + docId + " corpusSigle: " + corpusSigle + " docSigle: " + docSigle)
-
if (outputFormat == OutputFormat.KRILL) {
- when {
- corpusSigle != null -> collectCorpusMetadata(corpusSigle, headerRoot)
- docSigle != null -> collectDocMetadata(docSigle, headerRoot)
- docId != null -> collectKrillMetadata(docId, headerRoot)
+ // Collect metadata at the appropriate level(s)
+ // Note: corpus, doc, and text headers each have their own sigle fields
+ if (corpusSigle != null) {
+ collectCorpusMetadata(corpusSigle, headerRoot)
+ }
+ if (docSigle != null) {
+ collectDocMetadata(docSigle, headerRoot)
+ }
+ if (docId != null) {
+ collectKrillMetadata(docId, headerRoot)
}
}
@@ -2807,6 +2858,15 @@
if (!fnames.contains(docId)) fnames[docId] = zipEntry.name
tokens[docId] = extractSpansStax(reader, docId)
if (outputFormat == OutputFormat.KRILL && foundry == "base") {
+ // Set tokenSource based on the foundry containing the tokens
+ // Extract foundry from path like "CORPUS/DOC/TEXT/base/tokens.xml" -> "base"
+ val textData = krillData.getOrPut(docId) {
+ KrillJsonGenerator.KrillTextData(textId = docId)
+ }
+ val pathParts = zipEntry.name.split('/')
+ val tokensFoundry = pathParts.getOrNull(pathParts.size - 2) ?: "base"
+ textData.headerMetadata["tokenSource"] = "${tokensFoundry}#tokens"
+
collectKrillBaseData(docId)
}
}
@@ -4805,6 +4865,13 @@
headerRoot.firstElement("ref") { it.getAttribute("type") == "page_url" }
?.getAttribute("target")?.takeIf { it.isNotBlank() }?.let { metadata["externalLink"] = it }
+ // Extract textExternalLinks from biblNote[@n='url']
+ val biblNoteUrl = analytic.firstElement("biblNote") { it.getAttribute("n") == "url" }
+ ?.textContent?.trim()?.takeIf { it.isNotEmpty() }
+ ?: monogr.firstElement("biblNote") { it.getAttribute("n") == "url" }
+ ?.textContent?.trim()?.takeIf { it.isNotEmpty() }
+ metadata.putIfNotBlank("textExternalLinks", biblNoteUrl)
+
if (!metadata.containsKey("language")) {
metadata["language"] = "de"
}
@@ -4834,7 +4901,9 @@
metadata.putIfNotBlank("corpusEditor", headerRoot.firstElement("monogr").firstText("editor"))
metadata.putIfNotBlank("publisher", headerRoot.firstText("publisher"))
metadata.putIfNotBlank("distributor", headerRoot.firstText("distributor"))
+ metadata.putIfNotBlank("pubPlace", headerRoot.firstText("pubPlace"))
metadata.putIfNotBlank("textType", headerRoot.firstElement("textDesc").firstText("textType"))
+
LOGGER.fine("Collected ${metadata.size} corpus-level metadata fields for $corpusSigle")
}
}
@@ -5161,17 +5230,43 @@
val corpusSigle = textId.substringBefore('_')
val docSigle = textId.substringBeforeLast('.')
- // Apply corpus-level metadata
+ // Apply corpus-level metadata (only if not already set with a non-empty value)
corpusMetadata[corpusSigle]?.forEach { (key, value) ->
- if (!textData.headerMetadata.containsKey(key)) {
- textData.headerMetadata[key] = value
+ val currentValue = textData.headerMetadata[key]
+ // Inherit if: key doesn't exist, OR current value is empty/blank
+ val shouldInherit = when (currentValue) {
+ null -> true
+ is String -> currentValue.isBlank()
+ else -> false
+ }
+ if (shouldInherit && value != null) {
+ // Only set non-empty values
+ when (value) {
+ is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+ is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ else -> textData.headerMetadata[key] = value
+ }
}
}
- // Apply doc-level metadata
+ // Apply doc-level metadata (only if not already set with a non-empty value)
docMetadata[docSigle]?.forEach { (key, value) ->
- if (!textData.headerMetadata.containsKey(key)) {
- textData.headerMetadata[key] = value
+ val currentValue = textData.headerMetadata[key]
+ // Inherit if: key doesn't exist, OR current value is empty/blank
+ val shouldInherit = when (currentValue) {
+ null -> true
+ is String -> currentValue.isBlank()
+ else -> false
+ }
+ if (shouldInherit && value != null) {
+ // Only set non-empty values
+ when (value) {
+ is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+ is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ else -> textData.headerMetadata[key] = value
+ }
}
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index 20767ef..d5a7ca4 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -100,17 +100,39 @@
val corpusSigle = textIdWithSlashes.split("/")[0]
val docSigle = textIdWithSlashes.split("/").take(2).joinToString("/")
- // First apply corpus-level metadata (lowest priority)
+ // First apply corpus-level metadata (lowest priority) - only if not already set with non-empty value
corpusMetadata[corpusSigle]?.forEach { (key, value) ->
- if (!textData.headerMetadata.containsKey(key)) {
- textData.headerMetadata[key] = value
+ val currentValue = textData.headerMetadata[key]
+ val shouldInherit = when (currentValue) {
+ null -> true
+ is String -> currentValue.isBlank()
+ else -> false
+ }
+ if (shouldInherit && value != null) {
+ when (value) {
+ is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+ is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ else -> textData.headerMetadata[key] = value
+ }
}
}
- // Then apply doc-level metadata (medium priority)
+ // Then apply doc-level metadata (medium priority) - only if not already set with non-empty value
docMetadata[docSigle]?.forEach { (key, value) ->
- if (!textData.headerMetadata.containsKey(key)) {
- textData.headerMetadata[key] = value
+ val currentValue = textData.headerMetadata[key]
+ val shouldInherit = when (currentValue) {
+ null -> true
+ is String -> currentValue.isBlank()
+ else -> false
+ }
+ if (shouldInherit && value != null) {
+ when (value) {
+ is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
+ is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
+ else -> textData.headerMetadata[key] = value
+ }
}
}
@@ -122,7 +144,8 @@
"creationDate", "pubDate", "textClass", "award", "availability", "language",
"ISBN", "URN", "pubPlace", "pubPlaceKey",
"textType", "textTypeArt", "textTypeRef", "textDomain", "textColumn",
- "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle"
+ "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor",
+ "textExternalLinks", "tokenSource"
)
fieldOrder.forEach { key ->
@@ -179,7 +202,7 @@
"type:attachement" to jsonString("data:,${value.toString()}")
}
}
- "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle" -> {
+ "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor" -> {
"type:text" to jsonString(value.toString())
}
"externalLink" -> {
@@ -189,6 +212,16 @@
val encodedUrl = url.replace(":", "%3A").replace("/", "%2F")
"type:attachement" to jsonString("data:application/x.korap-link;title=$title,$encodedUrl")
}
+ "textExternalLinks" -> {
+ val url = value.toString()
+ val title = textData.headerMetadata["publisher"]?.toString() ?: "Link"
+ val encodedUrl = url.replace(":", "%3A").replace("/", "%2F")
+ "type:attachement" to jsonString("data:application/x.korap-link;title=$title,$encodedUrl")
+ }
+ "tokenSource" -> {
+ // tokenSource is a string foundry reference like "base#tokens"
+ "type:string" to jsonString(value.toString())
+ }
else -> {
// corpusEditor, distributor, editor, reference - all ATTACHMENT
"type:attachement" to jsonString("data:,${value.toString()}")
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index 900708c..53048b0 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -12,6 +12,7 @@
import kotlin.test.assertEquals
import kotlin.test.assertTrue
import kotlin.test.assertFalse
+import kotlin.test.assertNotNull
/**
* Tests for Krill JSON format output (-t krill)
@@ -698,6 +699,72 @@
extractDir.deleteRecursively()
}
}
+
+ @Test
+ fun testKrillMetadataInheritance() {
+ // Regression test for GitHub issue #22: convert ndy_sample.zip to Krill output and check metadata fields of one document
+ val ndySample = loadResource("ndy_sample.zip").path
+
+ val generatedTar = ensureKrillTar("ndy_metadata_test", "ndy_sample.krill.tar") { outputDir ->
+ arrayOf(
+ "-t", "krill",
+ "-q",
+ "-D", outputDir.path,
+ ndySample
+ )
+ }
+
+ val kotlinJsons = readKrillJson(generatedTar)
+ assertTrue(kotlinJsons.isNotEmpty(), "Should have generated Krill JSON files from NDY sample")
+
+ // Fields that were previously missing (GitHub issue #22), each mapped to a substring expected somewhere in the JSON
+ val requiredFields = mapOf(
+ "corpusTitle" to "Nottinghamer Korpus Deutscher YouTube-Sprache",
+ "docTitle" to "Info Video",
+ "docAuthor" to "User_A", // Anonymized
+ "distributor" to "Institut für Deutsche Sprache",
+ "pubPlace" to "San Bruno, California",
+ "textExternalLinks" to "youtube.googleapis.com", // Partial match for URL
+ "tokenSource" to "base#tokens"
+ )
+
+ // Only one document is inspected (NDY/115/005255); testDocId is the key looked up in kotlinJsons
+ val testDocId = "NDY-115-005255.json"
+ assertTrue(kotlinJsons.containsKey(testDocId), "Should have JSON for test document $testDocId")
+
+ val testJson = kotlinJsons.getValue(testDocId)
+ // NOTE(review): both checks below search the whole JSON string, so a value is not tied to its own field
+ requiredFields.forEach { (fieldName, expectedValue) ->
+ assertTrue(
+ testJson.contains("\"$fieldName\""),
+ "JSON should contain field: $fieldName"
+ )
+ assertTrue(
+ testJson.contains(expectedValue),
+ "Field $fieldName should contain value: $expectedValue"
+ )
+ }
+
+ // Stricter key/value check: pubPlace must carry the corpus-level value (not an empty text-level one)
+ // NOTE(review): `.` does not match line breaks by default; assumes key and value share one line - confirm
+ val pubPlaceMatch = Regex(""""key"\s*:\s*"pubPlace".*?"value"\s*:\s*"([^"]+)"""").find(testJson)
+ assertNotNull(pubPlaceMatch, "Should find pubPlace field")
+ assertEquals(
+ "San Bruno, California",
+ pubPlaceMatch.groupValues[1],
+ "pubPlace should be inherited from corpus level"
+ )
+
+ // Same key/value check for tokenSource, expected to name the foundry that holds tokens.xml
+ val tokenSourceMatch = Regex(""""key"\s*:\s*"tokenSource".*?"value"\s*:\s*"([^"]+)"""").find(testJson)
+ assertNotNull(tokenSourceMatch, "Should find tokenSource field")
+ assertEquals(
+ "base#tokens",
+ tokenSourceMatch.groupValues[1],
+ "tokenSource should be extracted from foundry path"
+ )
+ }
+
/**
* Regression test for GitHub issue #21: Missing base/s:p paragraph spans
*
diff --git a/app/src/test/resources/ndy_sample.zip b/app/src/test/resources/ndy_sample.zip
new file mode 100644
index 0000000..5167d62
--- /dev/null
+++ b/app/src/test/resources/ndy_sample.zip
Binary files differ