Make sure that pubDate and creationDate are inherited if necessary
Change-Id: I8a2cdeee31b115532b6a82c960dd9284f2c1393e
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6cd32d..a821718 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
# Changelog
+## [Unreleased]
+
+### Fixed
+
+- Krill metadata inheritance now ignores empty text-level `creatDate`/`pubDate` elements and inherits metadata consistently from corpus and document headers. In addition, `creationDate` and `pubDate` are backfilled from each other, so both dates are always present once either one is available
+- Corpus and document headers now expose the same common Krill metadata fields as text headers, so that texts can inherit them; this includes title, subtitle, author, and editor fields as well as publication metadata (publisher, distributor, publication place, and dates)
+
## [v3.3.2] - 2026-04-06
### Fixed
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 471535d..fd05dbb 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -5366,125 +5366,151 @@
val textData = krillData.getOrPut(docId) { KrillJsonGenerator.KrillTextData(textId = docId) }
synchronized(textData) {
- val metadata = textData.headerMetadata
- val analytic = headerRoot.firstElement("analytic")
- val monogr = headerRoot.firstElement("monogr")
- val textDesc = headerRoot.firstElement("textDesc")
- val textClassElement = headerRoot.firstElement("textClass")
+ mergeExtractedKrillMetadata(textData.headerMetadata, extractKrillHeaderMetadata(headerRoot))
+ LOGGER.fine("Collected ${textData.headerMetadata.size} metadata fields for $docId")
+ }
+ }
- metadata.putIfNotBlank("author", analytic.firstText("h.author") ?: monogr.firstText("h.author"))
+ private fun extractKrillHeaderMetadata(headerRoot: Element): MutableMap<String, Any> {
+ val metadata = mutableMapOf<String, Any>()
+ val analytic = headerRoot.firstElement("analytic")
+ val monogr = headerRoot.firstElement("monogr")
+ val textDesc = headerRoot.firstElement("textDesc")
+ val textClassElement = headerRoot.firstElement("textClass")
- val mainTitle = analytic.firstText("h.title") { it.getAttribute("type") == "main" }
- ?: analytic.firstText("h.title")
- ?: monogr.firstText("h.title") { it.getAttribute("type") == "main" }
- ?: monogr.firstText("h.title")
- metadata.putIfNotBlank("title", mainTitle)
+ metadata.putIfNotBlank("author", analytic.firstText("h.author") ?: monogr.firstText("h.author") ?: headerRoot.firstText("h.author"))
- metadata.putIfNotBlank(
- "subTitle",
- analytic.firstText("h.title") { it.getAttribute("type") == "sub" }
- ?: monogr.firstText("h.title") { it.getAttribute("type") == "sub" }
- )
+ val mainTitle = analytic.firstText("h.title") { it.getAttribute("type") == "main" }
+ ?: analytic.firstText("h.title")
+ ?: monogr.firstText("h.title") { it.getAttribute("type") == "main" }
+ ?: monogr.firstText("h.title")
+ ?: headerRoot.firstText("d.title")
+ ?: headerRoot.firstText("c.title") { it.getAttribute("type") == "main" }
+ ?: headerRoot.firstText("c.title")
+ metadata.putIfNotBlank("title", mainTitle)
- val translator = headerRoot.firstText("editor") { it.getAttribute("role") == "translator" }
- if (translator != null) {
- metadata["translator"] = translator
- } else {
- metadata.putIfNotBlank("editor", analytic.firstText("editor") ?: monogr.firstText("editor"))
+ metadata.putIfNotBlank(
+ "subTitle",
+ analytic.firstText("h.title") { it.getAttribute("type") == "sub" }
+ ?: monogr.firstText("h.title") { it.getAttribute("type") == "sub" }
+ ?: headerRoot.firstText("c.title") { it.getAttribute("type") == "sub" }
+ )
+
+ val translator = headerRoot.firstText("editor") { it.getAttribute("role") == "translator" }
+ if (translator != null) {
+ metadata["translator"] = translator
+ } else {
+ metadata.putIfNotBlank("editor", analytic.firstText("editor") ?: monogr.firstText("editor") ?: headerRoot.firstText("editor"))
+ }
+
+ metadata.putIfNotBlank("publisher", headerRoot.firstText("publisher"))
+ metadata.putIfNotBlank("distributor", headerRoot.firstText("distributor"))
+ metadata.putIfNotBlank("availability", headerRoot.firstText("availability"))
+ metadata.putIfNotBlank("ISBN", headerRoot.firstText("idno") { it.getAttribute("type") == "ISBN" })
+
+ headerRoot.firstElement("reference") { it.getAttribute("type") == "complete" }
+ ?.textContent?.trim()?.takeIf { it.isNotEmpty() }?.let { metadata["reference"] = it }
+
+ headerRoot.firstElement("idno") { it.getAttribute("type") == "URN" }
+ ?.textContent?.trim()?.takeIf { it.isNotEmpty() }?.let { urn ->
+ metadata["URN"] = mapOf("urn" to urn, "url" to "http://nbn-resolving.de/$urn")
}
- metadata.putIfNotBlank("publisher", headerRoot.firstText("publisher"))
- metadata.putIfNotBlank("distributor", headerRoot.firstText("distributor"))
- metadata.putIfNotBlank("availability", headerRoot.firstText("availability"))
- metadata.putIfNotBlank("ISBN", headerRoot.firstText("idno") { it.getAttribute("type") == "ISBN" })
+ metadata.putIfNotBlank("textType", textDesc.firstText("textType"))
+ metadata.putIfNotBlank("textDomain", textDesc.firstText("textDomain"))
+ metadata.putIfNotBlank("textTypeArt", textDesc.firstText("textTypeArt"))
+ metadata.putIfNotBlank("textTypeRef", textDesc.firstText("textTypeRef"))
+ metadata.putIfNotBlank("textColumn", textDesc.firstText("column"))
- headerRoot.firstElement("reference") { it.getAttribute("type") == "complete" }
- ?.textContent?.trim()?.takeIf { it.isNotEmpty() }?.let { metadata["reference"] = it }
+ headerRoot.firstElement("pubPlace")?.let { placeElement ->
+ placeElement.textContent?.trim()?.takeIf { it.isNotEmpty() }?.let { metadata["pubPlace"] = it }
+ placeElement.getAttribute("key").takeIf { it.isNotBlank() }?.let { metadata["pubPlaceKey"] = it }
+ }
- headerRoot.firstElement("idno") { it.getAttribute("type") == "URN" }
- ?.textContent?.trim()?.takeIf { it.isNotEmpty() }?.let { urn ->
- metadata["URN"] = mapOf("urn" to urn, "url" to "http://nbn-resolving.de/$urn")
+ val awards = headerRoot.childElements("note")
+ .mapNotNull { note ->
+ val subtype = note.getAttribute("subtype")
+ if (note.getAttribute("type") == "award" && subtype.isNotBlank()) subtype.trim() else null
+ }.toList()
+ if (awards.isNotEmpty()) {
+ metadata["award"] = awards
+ }
+
+ val textClassTopics = textClassElement.collectCatRefTopics()
+ val fallbackTopics = if (textClassTopics.isEmpty()) headerRoot.collectCatRefTopics() else emptyList()
+ val finalTopics = if (textClassTopics.isNotEmpty()) textClassTopics else fallbackTopics
+ if (finalTopics.isNotEmpty()) {
+ metadata["textClass"] = finalTopics
+ }
+
+ headerRoot.firstText("creatDate")?.replace(".", "-")?.let {
+ metadata["creationDate"] = it
+ }
+
+ var year: String? = null
+ var month: String? = null
+ var day: String? = null
+ var plainPubDate: String? = null
+ headerRoot.childElements("pubDate").forEach { element ->
+ val value = element.textContent?.trim()?.takeIf { it.isNotEmpty() }
+ val type = element.getAttribute("type")
+ if (type.isBlank()) {
+ if (plainPubDate == null && value != null) {
+ plainPubDate = value
}
-
- metadata.putIfNotBlank("textType", textDesc.firstText("textType"))
- metadata.putIfNotBlank("textDomain", textDesc.firstText("textDomain"))
- metadata.putIfNotBlank("textTypeArt", textDesc.firstText("textTypeArt"))
- metadata.putIfNotBlank("textTypeRef", textDesc.firstText("textTypeRef"))
- metadata.putIfNotBlank("textColumn", textDesc.firstText("column"))
-
- headerRoot.firstElement("pubPlace")?.let { placeElement ->
- placeElement.textContent?.trim()?.takeIf { it.isNotEmpty() }?.let { metadata["pubPlace"] = it }
- placeElement.getAttribute("key").takeIf { it.isNotBlank() }?.let { metadata["pubPlaceKey"] = it }
+ return@forEach
}
-
- val awards = headerRoot.childElements("note")
- .mapNotNull { note ->
- val subtype = note.getAttribute("subtype")
- if (note.getAttribute("type") == "award" && subtype.isNotBlank()) subtype.trim() else null
- }.toList()
- if (awards.isNotEmpty()) {
- metadata["award"] = awards
+ if (value == null) return@forEach
+ when (type) {
+ "year" -> year = value
+ "month" -> month = value
+ "day" -> day = value
}
+ }
+ composeKrillPubDate(year, month, day, plainPubDate)?.let { metadata["pubDate"] = it }
- val textClassTopics = textClassElement.collectCatRefTopics()
- val fallbackTopics = if (textClassTopics.isEmpty()) headerRoot.collectCatRefTopics() else emptyList()
- val finalTopics = if (textClassTopics.isNotEmpty()) textClassTopics else fallbackTopics
- if (finalTopics.isNotEmpty()) {
- metadata["textClass"] = finalTopics
+ headerRoot.firstElement("ref") { it.getAttribute("type") == "page_url" }
+ ?.getAttribute("target")?.takeIf { it.isNotBlank() }?.let { metadata["externalLink"] = it }
+
+ val biblNoteElement = analytic.firstElement("biblNote") { it.getAttribute("n") == "url" }
+ ?: monogr.firstElement("biblNote") { it.getAttribute("n") == "url" }
+ biblNoteElement?.let {
+ val url = it.textContent?.trim()?.takeIf { it.isNotEmpty() }
+ metadata.putIfNotBlank("textExternalLink", url)
+ val rendAttr = it.getAttribute("rend")?.trim()?.takeIf { it.isNotBlank() }
+ metadata.putIfNotBlank("textExternalLinkTitle", rendAttr)
+ }
+
+ if (!metadata.containsKey("language")) {
+ metadata["language"] = "de"
+ }
+
+ if (!metadata.containsKey("textType")) {
+ val textTypeArt = metadata["textTypeArt"] as? String
+ if (textTypeArt != null) {
+ metadata["textType"] = textTypeArt + "en"
}
+ }
- headerRoot.firstText("creatDate")?.replace(".", "-")?.let {
- metadata["creationDate"] = it
- }
+ normalizeKrillDateMetadata(metadata)
+ return metadata
+ }
- var year: String? = null
- var month: String? = null
- var day: String? = null
- var plainPubDate: String? = null
- headerRoot.childElements("pubDate").forEach { element ->
- val value = element.textContent?.trim()?.takeIf { it.isNotEmpty() }
- val type = element.getAttribute("type")
- if (type.isBlank()) {
- if (plainPubDate == null && value != null) {
- plainPubDate = value
- }
- return@forEach
- }
- if (value == null) return@forEach
- when (type) {
- "year" -> year = value
- "month" -> month = value
- "day" -> day = value
- }
- }
- composeKrillPubDate(year, month, day, plainPubDate)?.let { metadata["pubDate"] = it }
+ private fun mergeExtractedKrillMetadata(target: MutableMap<String, Any>, source: Map<String, Any>) {
+ source.forEach { (key, value) ->
+ target[key] = value
+ }
+ }
- headerRoot.firstElement("ref") { it.getAttribute("type") == "page_url" }
- ?.getAttribute("target")?.takeIf { it.isNotBlank() }?.let { metadata["externalLink"] = it }
+ private fun normalizeKrillDateMetadata(metadata: MutableMap<String, Any>) {
+ val creationDate = (metadata["creationDate"] as? String)?.trim()?.takeIf { it.isNotEmpty() }
+ val pubDate = (metadata["pubDate"] as? String)?.trim()?.takeIf { it.isNotEmpty() }
- // Extract textExternalLink from biblNote[@n='url']
- val biblNoteElement = analytic.firstElement("biblNote") { it.getAttribute("n") == "url" }
- ?: monogr.firstElement("biblNote") { it.getAttribute("n") == "url" }
- biblNoteElement?.let {
- val url = it.textContent?.trim()?.takeIf { it.isNotEmpty() }
- metadata.putIfNotBlank("textExternalLink", url)
- // Extract rend attribute as title
- val rendAttr = it.getAttribute("rend")?.trim()?.takeIf { it.isNotBlank() }
- metadata.putIfNotBlank("textExternalLinkTitle", rendAttr)
- }
-
- if (!metadata.containsKey("language")) {
- metadata["language"] = "de"
- }
-
- if (!metadata.containsKey("textType")) {
- val textTypeArt = metadata["textTypeArt"] as? String
- if (textTypeArt != null) {
- metadata["textType"] = textTypeArt + "en"
- }
- }
-
- LOGGER.fine("Collected ${metadata.size} metadata fields for $docId")
+ if (creationDate == null && pubDate != null) {
+ metadata["creationDate"] = pubDate
+ }
+ if (pubDate == null && creationDate != null) {
+ metadata["pubDate"] = creationDate
}
}
@@ -5517,18 +5543,20 @@
val metadata = corpusMetadata.getOrPut(corpusSigle) { mutableMapOf() }
synchronized(metadata) {
- metadata.putIfNotBlank("corpusTitle", headerRoot.firstText("c.title"))
+ mergeExtractedKrillMetadata(metadata, extractKrillHeaderMetadata(headerRoot))
+ metadata.putIfNotBlank(
+ "corpusTitle",
+ headerRoot.firstText("c.title") { it.getAttribute("type") == "main" } ?: headerRoot.firstText("c.title")
+ )
metadata.putIfNotBlank(
"corpusSubTitle",
headerRoot.firstText("c.title") { it.getAttribute("type") == "sub" }
)
metadata.putIfNotBlank("corpusAuthor", headerRoot.firstText("h.author"))
- metadata.putIfNotBlank("corpusEditor", headerRoot.firstElement("monogr").firstText("editor"))
- metadata.putIfNotBlank("publisher", headerRoot.firstText("publisher"))
- metadata.putIfNotBlank("distributor", headerRoot.firstText("distributor"))
- metadata.putIfNotBlank("pubPlace", headerRoot.firstText("pubPlace"))
- metadata.putIfNotBlank("textType", headerRoot.firstElement("textDesc").firstText("textType"))
-
+ metadata.putIfNotBlank(
+ "corpusEditor",
+ headerRoot.firstElement("monogr").firstText("editor") ?: headerRoot.firstText("editor")
+ )
LOGGER.fine("Collected ${metadata.size} corpus-level metadata fields for $corpusSigle")
}
}
@@ -5538,8 +5566,9 @@
val metadata = docMetadata.getOrPut(docSigle) { mutableMapOf() }
synchronized(metadata) {
- metadata.putIfNotBlank("docTitle", headerRoot.firstText("d.title"))
- metadata.putIfNotBlank("docAuthor", headerRoot.firstText("h.author"))
+ mergeExtractedKrillMetadata(metadata, extractKrillHeaderMetadata(headerRoot))
+ metadata.putIfNotBlank("docTitle", headerRoot.firstText("d.title") ?: metadata["title"] as? String)
+ metadata.putIfNotBlank("docAuthor", headerRoot.firstText("h.author") ?: metadata["author"] as? String)
LOGGER.fine("Collected ${metadata.size} doc-level metadata fields for $docSigle")
}
}
@@ -5888,42 +5917,14 @@
}
private fun applyInheritedKrillMetadata(textId: String, textData: KrillJsonGenerator.KrillTextData) {
- val corpusSigle = textId.substringBefore('_')
- val docSigle = textId.substringBeforeLast('.')
-
- corpusMetadata[corpusSigle]?.forEach { (key, value) ->
- val currentValue = textData.headerMetadata[key]
- val shouldInherit = when (currentValue) {
- null -> true
- is String -> currentValue.isBlank()
- else -> false
- }
- if (shouldInherit) {
- when (value) {
- is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
- is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- else -> textData.headerMetadata[key] = value
- }
- }
- }
-
- docMetadata[docSigle]?.forEach { (key, value) ->
- val currentValue = textData.headerMetadata[key]
- val shouldInherit = when (currentValue) {
- null -> true
- is String -> currentValue.isBlank()
- else -> false
- }
- if (shouldInherit) {
- when (value) {
- is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
- is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- else -> textData.headerMetadata[key] = value
- }
- }
- }
+ val resolvedMetadata = KrillJsonGenerator.resolveHeaderMetadata(
+ textId = textId,
+ textHeaderMetadata = textData.headerMetadata,
+ corpusMetadata = corpusMetadata,
+ docMetadata = docMetadata
+ )
+ textData.headerMetadata.clear()
+ textData.headerMetadata.putAll(resolvedMetadata)
}
private fun enqueueKrillCompression(textId: String, textData: KrillJsonGenerator.KrillTextData) {
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index cf4a7d8..b57af66 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -14,6 +14,25 @@
private val LOGGER = Logger.getLogger(KrillJsonGenerator::class.java.name)
private val BASE_STRUCTURE_FOUNDRIES = setOf("base", "dereko")
+ fun resolveHeaderMetadata(
+ textId: String,
+ textHeaderMetadata: Map<String, Any>,
+ corpusMetadata: Map<String, out Map<String, Any>>,
+ docMetadata: Map<String, out Map<String, Any>>
+ ): MutableMap<String, Any> {
+ val resolved = mutableMapOf<String, Any>()
+ val textIdWithSlashes = textId.replace("_", "/").replace(".", "/")
+ val sigleParts = textIdWithSlashes.split("/")
+ val corpusSigle = sigleParts.firstOrNull().orEmpty()
+ val docSigle = sigleParts.take(2).joinToString("/")
+
+ mergeMeaningfulMetadata(resolved, corpusMetadata[corpusSigle])
+ mergeMeaningfulMetadata(resolved, docMetadata[docSigle])
+ mergeMeaningfulMetadata(resolved, textHeaderMetadata)
+ normalizeRequiredDates(resolved)
+ return resolved
+ }
+
/**
* Data class representing a complete Krill text with all annotations.
*/
@@ -108,47 +127,12 @@
)))
}
- // Merge corpus and doc metadata into text metadata (corpus < doc < text precedence)
- val corpusSigle = textIdWithSlashes.split("/")[0]
- val docSigle = textIdWithSlashes.split("/").take(2).joinToString("/")
-
- // First apply corpus-level metadata (lowest priority) - only if not already set with non-empty value
- corpusMetadata[corpusSigle]?.forEach { (key, value) ->
- val currentValue = textData.headerMetadata[key]
- val shouldInherit = when (currentValue) {
- null -> true
- is String -> currentValue.isBlank()
- else -> false
- }
- if (shouldInherit) {
- when (value) {
- is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
- is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- else -> textData.headerMetadata[key] = value
- }
- }
- }
-
- // Then apply doc-level metadata (medium priority) - only if not already set with non-empty value
- docMetadata[docSigle]?.forEach { (key, value) ->
- val currentValue = textData.headerMetadata[key]
- val shouldInherit = when (currentValue) {
- null -> true
- is String -> currentValue.isBlank()
- else -> false
- }
- if (shouldInherit) {
- when (value) {
- is String -> if (value.isNotBlank()) textData.headerMetadata[key] = value
- is List<*> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- is Map<*, *> -> if (value.isNotEmpty()) textData.headerMetadata[key] = value
- else -> textData.headerMetadata[key] = value
- }
- }
- }
-
- // Text-level metadata is already in textData.headerMetadata (highest priority)
+ val resolvedHeaderMetadata = resolveHeaderMetadata(
+ textId = textData.textId,
+ textHeaderMetadata = textData.headerMetadata,
+ corpusMetadata = corpusMetadata,
+ docMetadata = docMetadata
+ )
// Add additional metadata fields from header with correct types
val fieldOrder = listOf(
@@ -156,12 +140,12 @@
"creationDate", "pubDate", "textClass", "award", "availability", "language",
"ISBN", "URN", "pubPlace", "pubPlaceKey",
"textType", "textTypeArt", "textTypeRef", "textDomain", "textColumn",
- "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor",
+ "author", "title", "subTitle", "corpusAuthor", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor",
"textExternalLink", "tokenSource"
)
fieldOrder.forEach { key ->
- val value = textData.headerMetadata[key] ?: return@forEach
+ val value = resolvedHeaderMetadata[key] ?: return@forEach
// Determine field type and value format
val (fieldType, fieldValue) = when (key) {
@@ -214,20 +198,20 @@
"type:attachement" to jsonString("data:,${value.toString()}")
}
}
- "author", "title", "subTitle", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor" -> {
+ "author", "title", "subTitle", "corpusAuthor", "corpusTitle", "corpusSubTitle", "docTitle", "docAuthor" -> {
"type:text" to jsonString(value.toString())
}
"externalLink" -> {
val url = value.toString()
// Extract title from corpus/publisher metadata if available
- val title = textData.headerMetadata["publisher"]?.toString() ?: "Link"
+ val title = resolvedHeaderMetadata["publisher"]?.toString() ?: "Link"
val encodedUrl = url.replace(":", "%3A").replace("/", "%2F")
"type:attachement" to jsonString("data:application/x.korap-link;title=$title,$encodedUrl")
}
"textExternalLink" -> {
val url = value.toString()
- val title = textData.headerMetadata["textExternalLinkTitle"]?.toString()
- ?: textData.headerMetadata["publisher"]?.toString() ?: "Link"
+ val title = resolvedHeaderMetadata["textExternalLinkTitle"]?.toString()
+ ?: resolvedHeaderMetadata["publisher"]?.toString() ?: "Link"
val encodedUrl = url.replace(":", "%3A").replace("/", "%2F")
"type:attachement" to jsonString("data:application/x.korap-link;title=$title,$encodedUrl")
}
@@ -398,6 +382,35 @@
out.append("}") // close root
}
+ private fun mergeMeaningfulMetadata(target: MutableMap<String, Any>, source: Map<String, Any>?) {
+ source?.forEach { (key, value) ->
+ if (isMeaningfulMetadataValue(value)) {
+ target[key] = value
+ }
+ }
+ }
+
+ private fun normalizeRequiredDates(metadata: MutableMap<String, Any>) {
+ val creationDate = (metadata["creationDate"] as? String)?.trim()?.takeIf { it.isNotEmpty() }
+ val pubDate = (metadata["pubDate"] as? String)?.trim()?.takeIf { it.isNotEmpty() }
+
+ if (creationDate == null && pubDate != null) {
+ metadata["creationDate"] = pubDate
+ }
+ if (pubDate == null && creationDate != null) {
+ metadata["pubDate"] = creationDate
+ }
+ }
+
+ private fun isMeaningfulMetadataValue(value: Any?): Boolean = when (value) {
+ null -> false
+ is String -> value.isNotBlank()
+ is Collection<*> -> value.isNotEmpty()
+ is Map<*, *> -> value.isNotEmpty()
+ is Array<*> -> value.isNotEmpty()
+ else -> true
+ }
+
private fun generateStream(textData: KrillTextData, includeNonWordTokens: Boolean): List<String> {
val result = mutableListOf<String>()
forEachStreamItem(textData, includeNonWordTokens) { result.add(it) }
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index dd607fa..b121a44 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -120,6 +120,26 @@
return tool.krillData.getValue(docId).headerMetadata
}
+ private fun collectDocMetadata(tool: KorapXmlTool, docSigle: String, headerRoot: Element): Map<String, Any> {
+ val method = KorapXmlTool::class.java.getDeclaredMethod("collectDocMetadata", String::class.java, Element::class.java)
+ method.isAccessible = true
+ method.invoke(tool, docSigle, headerRoot)
+ return tool.docMetadata.getValue(docSigle)
+ }
+
+ private fun collectCorpusMetadata(tool: KorapXmlTool, corpusSigle: String, headerRoot: Element): Map<String, Any> {
+ val method = KorapXmlTool::class.java.getDeclaredMethod("collectCorpusMetadata", String::class.java, Element::class.java)
+ method.isAccessible = true
+ method.invoke(tool, corpusSigle, headerRoot)
+ return tool.corpusMetadata.getValue(corpusSigle)
+ }
+
+ private fun krillFieldValue(json: String, fieldName: String): String? =
+ Regex(
+ """"key"\s*:\s*"$fieldName".*?"value"\s*:\s*"([^"]*)"""",
+ setOf(RegexOption.DOT_MATCHES_ALL)
+ ).find(json)?.groupValues?.getOrNull(1)
+
@Test
fun krillOutputMatchesExpectedStructure() {
val baseZip = loadResource("wud24_sample.zip").path
@@ -519,6 +539,107 @@
}
@Test
+ fun krillInheritedDatesIgnoreEmptyTextValuesAndBackfillEachOther() {
+ val tool = KorapXmlTool()
+ val textMetadata = collectKrillMetadata(
+ tool,
+ "TEST_DOC.1",
+ headerElement(
+ """
+ <idsHeader>
+ <analytic/>
+ <monogr/>
+ <textDesc/>
+ <creatDate/>
+ <pubDate/>
+ </idsHeader>
+ """
+ )
+ )
+
+ val json = KrillJsonGenerator.generate(
+ KrillJsonGenerator.KrillTextData(
+ textId = "TEST_DOC.1",
+ headerMetadata = textMetadata.toMutableMap()
+ ),
+ mapOf("TEST" to mutableMapOf<String, Any>("creationDate" to "1960-07-01")),
+ emptyMap<String, MutableMap<String, Any>>(),
+ includeNonWordTokens = false
+ )
+
+ assertEquals("1960-07-01", krillFieldValue(json, "creationDate"))
+ assertEquals("1960-07-01", krillFieldValue(json, "pubDate"))
+ }
+
+ @Test
+ fun krillMetadataResolutionUsesHierarchyPrecedence() {
+ val resolvedFromDoc = KrillJsonGenerator.resolveHeaderMetadata(
+ textId = "TEST_DOC.1",
+ textHeaderMetadata = mutableMapOf<String, Any>("pubPlace" to " "),
+ corpusMetadata = mapOf("TEST" to mutableMapOf<String, Any>("pubPlace" to "Corpus Place")),
+ docMetadata = mapOf("TEST/DOC" to mutableMapOf<String, Any>("pubPlace" to "Doc Place"))
+ )
+ assertEquals("Doc Place", resolvedFromDoc["pubPlace"])
+
+ val resolvedFromText = KrillJsonGenerator.resolveHeaderMetadata(
+ textId = "TEST_DOC.1",
+ textHeaderMetadata = mutableMapOf<String, Any>("pubPlace" to "Text Place"),
+ corpusMetadata = mapOf("TEST" to mutableMapOf<String, Any>("pubPlace" to "Corpus Place")),
+ docMetadata = mapOf("TEST/DOC" to mutableMapOf<String, Any>("pubPlace" to "Doc Place"))
+ )
+ assertEquals("Text Place", resolvedFromText["pubPlace"])
+ }
+
+ @Test
+ fun docAndCorpusMetadataCollectorsExposeCommonInheritedFields() {
+ val tool = KorapXmlTool()
+
+ val docMetadata = collectDocMetadata(
+ tool,
+ "TEST/DOC",
+ headerElement(
+ """
+ <idsHeader>
+ <d.title>Document Level Title</d.title>
+ <h.author>Document Author</h.author>
+ <publisher>Document Publisher</publisher>
+ <creatDate>1984-01-02</creatDate>
+ </idsHeader>
+ """
+ )
+ )
+ assertEquals("Document Level Title", docMetadata["title"])
+ assertEquals("Document Level Title", docMetadata["docTitle"])
+ assertEquals("Document Author", docMetadata["author"])
+ assertEquals("Document Author", docMetadata["docAuthor"])
+ assertEquals("Document Publisher", docMetadata["publisher"])
+ assertEquals("1984-01-02", docMetadata["creationDate"])
+ assertEquals("1984-01-02", docMetadata["pubDate"])
+
+ val corpusMetadata = collectCorpusMetadata(
+ tool,
+ "TEST",
+ headerElement(
+ """
+ <idsHeader>
+ <c.title>Corpus Level Title</c.title>
+ <h.author>Corpus Author</h.author>
+ <pubPlace>Corpus Place</pubPlace>
+ <pubDate type="year">1999</pubDate>
+ </idsHeader>
+ """
+ )
+ )
+ assertEquals("Corpus Level Title", corpusMetadata["title"])
+ assertEquals("Corpus Level Title", corpusMetadata["corpusTitle"])
+ assertEquals("Corpus Author", corpusMetadata["author"])
+ assertEquals("Corpus Author", corpusMetadata["corpusAuthor"])
+ assertEquals("Corpus Place", corpusMetadata["pubPlace"])
+ assertEquals("1999", corpusMetadata["pubDate"])
+ assertEquals("1999", corpusMetadata["creationDate"])
+ }
+
+ @Test
fun testCorrectTextCount() {
val baseZip = loadResource("wud24_sample.zip").path