Test that annotations are processed in order
Change-Id: I370bbbc69c9c8970824a53858f97a6e77c314629
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 41f3361..5cd2a43 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -2898,7 +2898,7 @@
// Skip if already output (thread-safe check with ConcurrentHashMap.KeySet)
if (outputTexts.contains(docId)) return
- LOGGER.info("Collecting krill base data for $docId: text=${texts[docId] != null}, tokens=${tokens[docId] != null}, sentences=${sentences[docId] != null}")
+ LOGGER.info("Processing base data for $docId: text=${texts[docId] != null}, tokens=${tokens[docId] != null}, sentences=${sentences[docId] != null}, foundry=base")
val textData = krillData.getOrPut(docId) {
KrillTextData(textId = docId)
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 7fd3154..9b22a26 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -473,6 +473,7 @@
val generatedTar = ensureKrillTar("wud24_full_foundries") { outputDir ->
arrayOf(
"-f", "krill",
+ "-l", "info",
"-D", outputDir.path,
baseZip,
spacyZip,
@@ -484,6 +485,33 @@
assertTrue(generatedTar.exists(), "Generated krill tar should exist at ${generatedTar.path}")
assertTrue(generatedTar.length() > 0, "Generated tar should not be empty")
+ // Check that log file exists
+ val logFile = File(generatedTar.path.replace(Regex("\\.tar$"), ".log"))
+ assertTrue(logFile.exists(), "Log file should exist at ${logFile.path}")
+ assertTrue(logFile.length() > 0, "Log file should not be empty")
+
+ // Check that texts are processed in alphabetical order for each foundry
+ val logContent = logFile.readText()
+ val foundries = listOf("spacy", "marmot", "opennlp", "treetagger")
+
+ foundries.forEach { foundry ->
+ // Extract text IDs for this foundry from log using regex
+ val pattern = Regex("Processing.*for ([^ :]+).*foundry=$foundry")
+ val textIds = pattern.findAll(logContent)
+ .map { it.groupValues[1] }
+ .toList()
+
+ if (textIds.isNotEmpty()) {
+ // Check if text IDs are in alphabetical order
+ val sortedTextIds = textIds.sorted()
+ assertEquals(
+ sortedTextIds,
+ textIds,
+ "Text IDs for foundry '$foundry' should be processed in alphabetical order. Expected: $sortedTextIds, but got: $textIds"
+ )
+ }
+ }
+
// Extract tar to verify it contains JSON files
val extractDir = File.createTempFile("extract", "").let {
it.delete()