Krill output: introduce month-aware sorting
Change-Id: I830a6e0b618a2cdd9b325db11b6b72463f3297e3
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index e7399cb..dc82a41 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -317,12 +317,14 @@
LOGGER.removeHandler(handler)
}
LOGGER.addHandler(handler)
- LOGGER.level = try {
+ val level = try {
Level.parse(logLevel.uppercase(Locale.getDefault()))
} catch (e: IllegalArgumentException) {
LOGGER.warning("Invalid log level: $logLevel. Defaulting to WARNING.")
Level.WARNING
}
+ LOGGER.level = level
+ handler.level = level // Handler also needs to be set to the same level
if (lemmaOnly) {
useLemma = true
@@ -383,6 +385,9 @@
// Track the next text ID (watermark) each foundry needs to process for priority scheduling
// The foundry with the lexicographically smallest next text ID gets priority
private val foundryWatermarks: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+ private var scanOrderLogged = false
+ private var expectedTextOrder: List<String> = emptyList()
+ private var nextTextOrderIndex: Int = 0
// Priority-based task for foundry-aware scheduling
private inner class PrioritizedTask(
@@ -398,7 +403,7 @@
// and handles sparse foundries naturally (they won't block on non-existent texts)
// First, compare text IDs lexicographically
- val textIdDiff = textId.compareTo(other.textId)
+ val textIdDiff = compareTextIds(textId, other.textId)
if (textIdDiff != 0) return textIdDiff
// If same text ID, prefer base foundry (it should be processed first)
@@ -421,6 +426,45 @@
// Single priority-based executor for all entry processing
private var entryExecutor: java.util.concurrent.ExecutorService? = null
+ private val MONTH_ORDER = mapOf(
+ "JAN" to 1, "FEB" to 2, "MAR" to 3, "MRZ" to 3, "APR" to 4,
+ "MAY" to 5, "MAI" to 5, "JUN" to 6, "JUL" to 7, "AUG" to 8,
+ "SEP" to 9, "OCT" to 10, "OKT" to 10, "NOV" to 11, "DEC" to 12, "DEZ" to 12
+ )
+
+ private data class TextIdSortKey(
+ val prefix: String,
+ val monthRank: Int?,
+ val mid: String,
+ val number: Long,
+ val fallback: String
+ ) : Comparable<TextIdSortKey> {
+ override fun compareTo(other: TextIdSortKey): Int {
+ // First compare by prefix
+ val prefixCmp = prefix.compareTo(other.prefix)
+ if (prefixCmp != 0) return prefixCmp
+
+ // Then compare by month rank (if both have months, use rank; otherwise fall back to mid)
+ val thisRank = monthRank ?: Int.MAX_VALUE
+ val otherRank = other.monthRank ?: Int.MAX_VALUE
+ val rankCmp = thisRank.compareTo(otherRank)
+ if (rankCmp != 0) return rankCmp
+
+ // If both have no month rank (both MAX_VALUE), compare mid alphabetically
+ if (monthRank == null && other.monthRank == null) {
+ val midCmp = mid.compareTo(other.mid)
+ if (midCmp != 0) return midCmp
+ }
+
+ // Then compare by number
+ val numberCmp = number.compareTo(other.number)
+ if (numberCmp != 0) return numberCmp
+
+ // Finally fallback to full ID
+ return fallback.compareTo(other.fallback)
+ }
+ }
+
// Extract text ID from ZIP entry path (e.g., "ZGE24/JAN/00001/base/data.xml" -> "ZGE24_JAN.00001")
private fun getTextIdFromPath(path: String): String {
val parts = path.split('/')
@@ -431,6 +475,15 @@
}
}
+ private fun monthAwareSortKey(textId: String): TextIdSortKey {
+ val parts = Regex("[-_.]").split(textId)
+ val prefix = parts.getOrNull(0) ?: textId
+ val mid = parts.getOrNull(1) ?: ""
+ val tailNumber = parts.getOrNull(2)?.toLongOrNull() ?: Long.MAX_VALUE
+ val monthRank = if (mid.length == 3) MONTH_ORDER[mid.uppercase(Locale.ROOT)] else null
+ return TextIdSortKey(prefix, monthRank, mid, tailNumber, textId)
+ }
+
val texts: ConcurrentHashMap<String, NonBmpString> = ConcurrentHashMap()
val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
@@ -909,7 +962,7 @@
// Continue using the same progress bar for remaining texts (no separate bar)
// Output remaining texts (these weren't output incrementally, possibly incomplete)
// Copy keys to avoid ConcurrentModificationException if scanner is still running
- val remainingKeys = krillData.keys.sorted()
+ val remainingKeys = krillData.keys.sortedWith(this::compareTextIds)
remainingKeys.forEach { textId ->
val textData = krillData.remove(textId) ?: return@forEach // Skip if already removed by scanner
val textFoundries = textData.morphoByFoundry.keys.toSet() + setOf("base")
@@ -1063,6 +1116,9 @@
}.toTypedArray()
}
+ private fun compareTextIds(a: String, b: String): Int =
+ monthAwareSortKey(a).compareTo(monthAwareSortKey(b))
+
private fun openZipFile(path: String): ApacheZipFile =
ApacheZipFile.builder()
.setFile(File(path))
@@ -1311,10 +1367,10 @@
// Sort entries by text ID to ensure texts complete as early as possible
// This is crucial for incremental output - all ZIPs will process texts in the same order
- entries.sortBy { entry ->
+ entries.sortWith(compareBy { entry ->
// Extract text ID from path like "ZGE24/JAN/00001/base/data.xml" -> "ZGE24_JAN.00001"
- getTextIdFromPath(entry.name)
- }
+ monthAwareSortKey(getTextIdFromPath(entry.name))
+ })
LOGGER.fine("Sorted entries by text ID for incremental processing")
// Determine document count for progress: prefer data.xml, fallback to tokens.xml
@@ -1354,7 +1410,7 @@
// Group entries by text ID to ensure all files for a text are processed together
val entriesByTextId = entries.groupBy { getTextIdFromPath(it.name) }
- val textIds = entriesByTextId.keys.sorted() // Process text IDs in lexicographic order
+ val textIds = entriesByTextId.keys.sortedWith(this::compareTextIds) // Process text IDs in month-aware order
// Initialize watermark for this foundry if not exists (set to first text ID)
if (!foundryWatermarks.containsKey(foundry) && textIds.isNotEmpty()) {
@@ -3533,12 +3589,12 @@
private fun scanAndOutputCompleteTexts(forceScan: Boolean = false) {
if ((shutdownIncrementalWriter && !forceScan) || !tarStreamOpen) return
- // Get all texts that we know about (from zipInventory), sorted to match processing order
- // This ensures we check texts in the same order they're being processed
- val allTexts = zipInventory.values.flatten().toSet().sorted()
+ if (expectedTextOrder.isEmpty()) return
+ val readyTexts = mutableListOf<String>()
var outputCount = 0
- for (textId in allTexts) {
+ while (nextTextOrderIndex < expectedTextOrder.size) {
+ val textId = expectedTextOrder[nextTextOrderIndex]
// Check if shutdown was requested, thread was interrupted, or stream was closed
if (shutdownIncrementalWriter || Thread.currentThread().isInterrupted || !tarStreamOpen) break
@@ -3553,41 +3609,48 @@
processedTextsPerZip[path]?.contains(textId) == true
}
- if (allProcessed && krillData.containsKey(textId)) {
- // Atomically claim this text for output (add returns false if already present)
- if (outputTexts.add(textId)) {
- // We successfully claimed it - output now
- val textData = krillData.remove(textId)
- if (textData != null) {
- // Acquire lock to prevent concurrent access with stream closing
- tarStreamLock.lock()
+ if (!allProcessed || !krillData.containsKey(textId)) {
+ // Maintain global ordering across foundries: stop at first not-ready text
+ break
+ }
+
+ readyTexts.add(textId)
+ nextTextOrderIndex++
+ }
+
+ readyTexts.sortWith { a, b -> compareTextIds(a, b) }
+
+ // Output ready texts in month-aware order (already in order because allTexts is sorted month-aware)
+ for (textId in readyTexts) {
+ val textData = krillData[textId] ?: continue
+ val relevantZips = zipInventory.filter { (_, texts) -> texts.contains(textId) }.keys
+ if (outputTexts.add(textId)) {
+ tarStreamLock.lock()
+ try {
+ if (tarStreamOpen) {
try {
- // Double-check stream is still open while holding the lock
- if (tarStreamOpen) {
- try {
- outputKrillText(textId, textData)
- incrementalProgressBar?.step()
- outputCount++
- LOGGER.fine("Output text $textId (processed by ${relevantZips.size} ZIPs, ${krillData.size} still pending)")
- } catch (e: IOException) {
- // Stream may have been closed - stop trying to output
- LOGGER.warning("Cannot output text $textId: stream closed")
- tarStreamOpen = false
- break
- }
- }
- } finally {
- tarStreamLock.unlock()
+ outputKrillText(textId, textData)
+ incrementalProgressBar?.step()
+ outputCount++
+ LOGGER.fine("Output text $textId (processed by ${relevantZips.size} ZIPs, ${krillData.size} still pending)")
+ } catch (e: IOException) {
+ LOGGER.warning("Cannot output text $textId: stream closed")
+ tarStreamOpen = false
+ break
}
}
-
- // Clean up tracking data for this text
- relevantZips.forEach { path ->
- zipInventory[path]?.remove(textId)
- processedTextsPerZip[path]?.remove(textId)
- }
+ } finally {
+ tarStreamLock.unlock()
}
}
+
+ krillData.remove(textId)
+
+ // Clean up tracking data for this text
+ relevantZips.forEach { path ->
+ zipInventory[path]?.remove(textId)
+ processedTextsPerZip[path]?.remove(textId)
+ }
}
if (outputCount > 0) {
@@ -3730,7 +3793,11 @@
LOGGER.info("ZIP inventory built: ${zipPaths.size} ZIPs scanned")
// Calculate total unique texts
val allTexts = zipInventory.values.flatten().toSet()
+ expectedTextOrder = allTexts.sortedWith(this::compareTextIds)
+ nextTextOrderIndex = 0
+ scanOrderLogged = false
LOGGER.info(" Total unique texts across all ZIPs: ${allTexts.size}")
+ LOGGER.info(" Text processing order (first 20): ${expectedTextOrder.take(20)}")
}
// Output a single text to Krill TAR (thread-safe)
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 0a322aa..7d67da5 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -490,28 +490,76 @@
assertTrue(logFile.exists(), "Log file should exist at ${logFile.path}")
assertTrue(logFile.length() > 0, "Log file should not be empty")
- // Check that texts are processed in alphabetical order for each foundry
- val logContent = logFile.readText()
- val foundries = listOf("spacy", "marmot", "opennlp", "treetagger")
+ // Check that texts are output in month-aware order in the TAR file
+ // Note: We check TAR order instead of log order because parallel processing means
+ // log completion order can differ from submission order, but TAR output follows sorted order
+ val monthOrder = mapOf(
+ "JAN" to 1, "FEB" to 2, "MAR" to 3, "MRZ" to 3, "APR" to 4,
+ "MAY" to 5, "MAI" to 5, "JUN" to 6, "JUL" to 7, "AUG" to 8,
+ "SEP" to 9, "OCT" to 10, "OKT" to 10, "NOV" to 11, "DEC" to 12, "DEZ" to 12
+ )
+ data class MonthKey(
+ val prefix: String,
+ val monthRank: Int,
+ val mid: String,
+ val num: Long,
+ val fallback: String
+ ) : Comparable<MonthKey> {
+ override fun compareTo(other: MonthKey): Int {
+ // First compare by prefix
+ val prefixCmp = prefix.compareTo(other.prefix)
+ if (prefixCmp != 0) return prefixCmp
- foundries.forEach { foundry ->
- // Extract text IDs for this foundry from log using regex
- val pattern = Regex("Processing.*for ([^ :]+).*foundry=$foundry")
- val textIds = pattern.findAll(logContent)
- .map { it.groupValues[1] }
- .toList()
+ // Then compare by month rank
+ val rankCmp = monthRank.compareTo(other.monthRank)
+ if (rankCmp != 0) return rankCmp
- if (textIds.isNotEmpty()) {
- // Check if text IDs are in alphabetical order
- val sortedTextIds = textIds.sorted()
- assertEquals(
- sortedTextIds,
- textIds,
- "Text IDs for foundry '$foundry' should be processed in alphabetical order. Expected: $sortedTextIds, but got: $textIds"
- )
+ // If both have no month rank (both MAX_VALUE), compare mid alphabetically
+ if (monthRank == Int.MAX_VALUE && other.monthRank == Int.MAX_VALUE) {
+ val midCmp = mid.compareTo(other.mid)
+ if (midCmp != 0) return midCmp
+ }
+
+ // Then compare by number
+ val numCmp = num.compareTo(other.num)
+ if (numCmp != 0) return numCmp
+
+ // Finally fallback to full ID
+ return fallback.compareTo(other.fallback)
}
}
+ fun monthAwareKey(textId: String): MonthKey {
+ val tokens = textId.split('_', '.', '-')
+ val prefix = tokens.getOrNull(0) ?: textId
+ val mid = tokens.getOrNull(1) ?: ""
+ val num = tokens.getOrNull(2)?.toLongOrNull() ?: Long.MAX_VALUE
+ val monthRank = if (mid.length == 3) monthOrder[mid] else null
+ return MonthKey(prefix, monthRank ?: Int.MAX_VALUE, mid, num, textId)
+ }
+
+ // Extract text IDs from TAR file (these are written in sorted order)
+ val tarListProcess = ProcessBuilder("tar", "-tf", generatedTar.path)
+ .redirectErrorStream(true)
+ .start()
+ val tarFiles = tarListProcess.inputStream.bufferedReader().readLines()
+ assertTrue(tarListProcess.waitFor() == 0, "tar -tf should succeed")
+
+ // Extract text IDs from JSON filenames in TAR
+ val textIdsInTar = tarFiles
+ .filter { it.endsWith(".json.gz") }
+ .map { it.substringAfterLast('/').removeSuffix(".json.gz").replace('-', '_').replace('.', '_') }
+
+ if (textIdsInTar.isNotEmpty()) {
+ // Check if text IDs in TAR follow month-aware ordering
+ val sortedTextIds = textIdsInTar.sortedWith(compareBy { monthAwareKey(it) })
+ assertEquals(
+ sortedTextIds,
+ textIdsInTar,
+ "Text IDs in TAR should be in month-aware order. Expected: $sortedTextIds, but got: $textIdsInTar"
+ )
+ }
+
// Extract tar to verify it contains JSON files
val extractDir = File.createTempFile("extract", "").let {
it.delete()
@@ -932,6 +980,51 @@
)
}
+ private fun KorapXmlTool.compareTextIds(a: String, b: String): Int {
+ val m = KorapXmlTool::class.java.getDeclaredMethod("compareTextIds", String::class.java, String::class.java)
+ m.isAccessible = true
+ return m.invoke(this, a, b) as Int
+ }
+
+ @Test
+ fun monthAwareComparatorOrdersCalendarMonths() {
+ val tool = KorapXmlTool()
+ assertTrue(tool.compareTextIds("ZGE24_JAN.00001", "ZGE24_MAR.00001") < 0, "JAN should sort before MAR")
+ assertTrue(tool.compareTextIds("ZGE24_MRZ.00001", "ZGE24_APR.00001") < 0, "MRZ should sort before APR")
+ assertTrue(tool.compareTextIds("ZGE24_OKT.00001", "ZGE24_SEP.00001") > 0, "OKT should sort after SEP")
+ assertTrue(tool.compareTextIds("ZGE24_DEZ.00001", "ZGE24_NOV.00001") > 0, "DEZ should sort after NOV")
+ assertTrue(tool.compareTextIds("ZGE24_MAI.00001", "ZGE24_JUL.00001") < 0, "MAI should sort before JUL")
+ }
+
+ @Test
+ fun monthAwareComparatorFallsBackToAlphabeticalWhenNoMonth() {
+ val tool = KorapXmlTool()
+ val ids = listOf("WUD24_I0083.95367", "WUD24_Z0087.65594", "WUD24_K0086.98010")
+ val sorted = ids.sortedWith { a, b -> tool.compareTextIds(a, b) }
+ assertEquals(listOf("WUD24_I0083.95367", "WUD24_K0086.98010", "WUD24_Z0087.65594"), sorted, "Non-month IDs should sort alphabetically")
+ }
+
+ @Test
+ fun monthAwareComparatorSortsMixedMonthsInCalendarOrder() {
+ val tool = KorapXmlTool()
+ val ids = listOf(
+ "ZGE24_OKT.00002",
+ "ZGE24_JAN.00003",
+ "ZGE24_DEZ.00001",
+ "ZGE24_SEP.00005",
+ "ZGE24_MAR.00001"
+ )
+ val expected = listOf(
+ "ZGE24_JAN.00003",
+ "ZGE24_MAR.00001",
+ "ZGE24_SEP.00005",
+ "ZGE24_OKT.00002",
+ "ZGE24_DEZ.00001"
+ )
+ val sorted = ids.sortedWith { a, b -> tool.compareTextIds(a, b) }
+ assertEquals(expected, sorted, "Mixed month IDs should follow calendar order")
+ }
+
private fun readKrillJson(tarFile: File): Map<String, String> {
val extractDir = File.createTempFile("krill_extract", "").let {
it.delete()