Write pubDate to Krill JSON even when incomplete
Resolves #39
Change-Id: I49a03a3a9e9caf450af77b8c468db666d5ab7aec
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb634a5..b6cd32d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog
+## [v3.3.2] - 2026-04-06
+
+### Fixed
+
+- Krill output now writes `pubDate` even when only partial `pubDate` information is available in `header.xml`, for example year-only values such as `1960` ([#39](https://github.com/KorAP/korapxmltool/issues/39))
+
## [v3.3.1] - 2026-04-05
### Fixed
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index ae12b93..b181a0d 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -5440,17 +5440,24 @@
var year: String? = null
var month: String? = null
var day: String? = null
+ var plainPubDate: String? = null
headerRoot.childElements("pubDate").forEach { element ->
- val value = element.textContent?.trim()?.takeIf { it.isNotEmpty() } ?: return@forEach
- when (element.getAttribute("type")) {
+ val value = element.textContent?.trim()?.takeIf { it.isNotEmpty() }
+ val type = element.getAttribute("type")
+ if (type.isBlank()) {
+ if (plainPubDate == null && value != null) {
+ plainPubDate = value
+ }
+ return@forEach
+ }
+ if (value == null) return@forEach
+ when (type) {
"year" -> year = value
"month" -> month = value
"day" -> day = value
}
}
- if (year != null && month != null && day != null) {
- metadata["pubDate"] = "${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}"
- }
+ composeKrillPubDate(year, month, day, plainPubDate)?.let { metadata["pubDate"] = it }
headerRoot.firstElement("ref") { it.getAttribute("type") == "page_url" }
?.getAttribute("target")?.takeIf { it.isNotBlank() }?.let { metadata["externalLink"] = it }
@@ -5481,6 +5488,30 @@
}
}
+ private fun composeKrillPubDate(
+ year: String?,
+ month: String?,
+ day: String?,
+ plainPubDate: String? = null
+ ): String? {
+ val normalizedYear = year?.trim()?.takeIf { it.isNotEmpty() }
+ val normalizedMonth = month?.trim()?.takeIf { it.isNotEmpty() }
+ val normalizedDay = day?.trim()?.takeIf { it.isNotEmpty() }
+
+ if (normalizedYear != null) {
+ val parts = mutableListOf(normalizedYear)
+ if (normalizedMonth != null) {
+ parts.add(normalizedMonth.padStart(2, '0'))
+ if (normalizedDay != null) {
+ parts.add(normalizedDay.padStart(2, '0'))
+ }
+ }
+ return parts.joinToString("-")
+ }
+
+ return plainPubDate?.trim()?.takeIf { it.isNotEmpty() }
+ }
+
// Collect corpus-level metadata from corpus header
private fun collectCorpusMetadata(corpusSigle: String, headerRoot: Element) {
val metadata = corpusMetadata.getOrPut(corpusSigle) { mutableMapOf() }
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index 94a0af7..dd607fa 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -5,11 +5,14 @@
import org.junit.After
import org.junit.AfterClass
import org.junit.Before
+import org.w3c.dom.Element
import java.io.ByteArrayOutputStream
+import java.io.ByteArrayInputStream
import java.io.File
import java.io.PrintStream
import java.net.URL
import java.util.zip.GZIPInputStream
+import javax.xml.parsers.DocumentBuilderFactory
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertTrue
@@ -102,6 +105,21 @@
return resource
}
+ private fun headerElement(xml: String): Element {
+ val dbFactory = DocumentBuilderFactory.newInstance()
+ val builder = dbFactory.newDocumentBuilder()
+ val doc = builder.parse(ByteArrayInputStream(xml.trimIndent().toByteArray()))
+ return doc.documentElement
+ }
+
+ private fun collectKrillMetadata(tool: KorapXmlTool, docId: String, headerRoot: Element): Map<String, Any> {
+ tool.krillData[docId] = KrillJsonGenerator.KrillTextData(textId = docId)
+ val method = KorapXmlTool::class.java.getDeclaredMethod("collectKrillMetadata", String::class.java, Element::class.java)
+ method.isAccessible = true
+ method.invoke(tool, docId, headerRoot)
+ return tool.krillData.getValue(docId).headerMetadata
+ }
+
@Test
fun krillOutputMatchesExpectedStructure() {
val baseZip = loadResource("wud24_sample.zip").path
@@ -460,6 +478,47 @@
}
@Test
+ fun krillPubDateFallsBackToAvailableDateParts() {
+ val tool = KorapXmlTool()
+
+ val yearOnlyMetadata = collectKrillMetadata(
+ tool,
+ "TEST_DOC.1",
+ headerElement(
+ """
+ <idsHeader>
+ <analytic/>
+ <monogr/>
+ <textDesc/>
+ <pubDate type="year">1960</pubDate>
+ <pubDate type="month"/>
+ <pubDate type="day"/>
+ </idsHeader>
+ """
+ )
+ )
+ assertEquals("1960", yearOnlyMetadata["pubDate"])
+
+ val yearMonthMetadata = collectKrillMetadata(
+ KorapXmlTool(),
+ "TEST_DOC.2",
+ headerElement(
+ """
+ <idsHeader>
+ <analytic/>
+ <monogr/>
+ <textDesc/>
+ <pubDate type="year">1960</pubDate>
+ <pubDate type="month">7</pubDate>
+ <pubDate type="day"/>
+ </idsHeader>
+ """
+ )
+ )
+ assertEquals("1960-07", yearMonthMetadata["pubDate"])
+ }
+
+ @Test
fun testCorrectTextCount() {
val baseZip = loadResource("wud24_sample.zip").path