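Make sure the foundry is picked up from external annotators (-A)

When the CoNLL-U output of an external annotator contains a
"# foundry = <name>" comment line, use <name> instead of the default
foundry for the annotation paths inside the output ZIP
(<doc>/<name>/morpho.xml and <doc>/<name>/dependency.xml), and rename
the output ZIP and its log file to <base>.<name>.zip / <base>.<name>.log
after annotation processing has finished.

Resolves #8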
Change-Id: I29a284ef24b90c01ada84eee7bfb8fedf7169d7f
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 6a5ff6b..56852ca 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1228,6 +1228,31 @@
morphoZipOutputStream!!.flush()
morphoZipOutputStream!!.close()
LOGGER.info("Closed output ZIP file after annotation processing")
+
+ // Rename ZIP file if foundry was detected from CoNLL-U output
+ if (targetZipFileName != null && externalFoundry != null) {
+ val currentFile = File(targetZipFileName!!)
+ val baseZipName = File(args[0]).name.replace(Regex("\\.zip$"), "")
+ val newFileName = File(outputDir, "$baseZipName.$externalFoundry.zip").absolutePath
+
+ if (currentFile.absolutePath != newFileName) {
+ val newFile = File(newFileName)
+ if (currentFile.renameTo(newFile)) {
+ LOGGER.info("Renamed output ZIP from ${currentFile.name} to ${newFile.name} based on detected foundry")
+
+ // Also rename the log file
+ val oldLogFile = File(targetZipFileName!!.replace(Regex("\\.zip$"), ".log"))
+ val newLogFile = File(newFileName.replace(Regex("\\.zip$"), ".log"))
+ if (oldLogFile.exists() && oldLogFile.renameTo(newLogFile)) {
+ LOGGER.info("Renamed log file from ${oldLogFile.name} to ${newLogFile.name}")
+ }
+
+ targetZipFileName = newFileName
+ } else {
+ LOGGER.warning("Failed to rename ZIP file from ${currentFile.absolutePath} to $newFileName")
+ }
+ }
+ }
} catch (e: Exception) {
LOGGER.severe("ERROR closing ZIP file: ${e.message}")
e.printStackTrace()
@@ -3643,9 +3668,17 @@
val sentenceSpans = mutableListOf<Span>()
var sentenceStartOffset: Int? = null
var sentenceEndOffset: Int? = null
+ var extractedFoundry: String? = null
for (line in lines) {
when {
+ line.startsWith("# foundry =") -> {
+ val foundryStr = line.substring("# foundry =".length).trim()
+ if (foundryStr.isNotEmpty()) {
+ extractedFoundry = foundryStr
+ LOGGER.fine("Extracted foundry from CoNLL-U output: $extractedFoundry")
+ }
+ }
line.startsWith("# start_offsets =") -> {
val offsetsStr = line.substring("# start_offsets =".length).trim()
val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
@@ -3740,10 +3773,20 @@
}
}
+ // Use extracted foundry from CoNLL-U output if available
+ val actualFoundry = if (extractedFoundry != null) {
+ LOGGER.info("Using foundry from CoNLL-U output: $extractedFoundry (was: $foundry)")
+ // Update the global externalFoundry variable for consistent naming
+ externalFoundry = extractedFoundry
+ extractedFoundry
+ } else {
+ foundry
+ }
+
try {
val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
docId = tempDocId,
- foundry = foundry,
+ foundry = actualFoundry,
tokens = tokens[tempDocId],
sentences = sentences[tempDocId],
text = texts[tempDocId],
@@ -3766,7 +3809,7 @@
"docid=\"$docId\""
)
- val morphoEntryPath = docId.replace(Regex("[_.]"), "/") + "/$foundry/morpho.xml"
+ val morphoEntryPath = docId.replace(Regex("[_.]"), "/") + "/$actualFoundry/morpho.xml"
val morphoZipEntry = ZipArchiveEntry(morphoEntryPath)
morphoZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
@@ -3786,7 +3829,7 @@
try {
val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
docId = tempDocId,
- foundry = foundry,
+ foundry = actualFoundry,
tokens = tokens[tempDocId],
sentences = sentences[tempDocId],
text = texts[tempDocId],
@@ -3809,7 +3852,7 @@
"docid=\"$docId\""
)
- val dependencyEntryPath = docId.replace(Regex("[_.]"), "/") + "/$foundry/dependency.xml"
+ val dependencyEntryPath = docId.replace(Regex("[_.]"), "/") + "/$actualFoundry/dependency.xml"
val dependencyZipEntry = ZipArchiveEntry(dependencyEntryPath)
dependencyZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlFormatterTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlFormatterTest.kt
index 860aa22..1a2dbb6 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlFormatterTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlFormatterTest.kt
@@ -8,6 +8,7 @@
import java.net.URL
import kotlin.test.Test
import kotlin.test.assertEquals
+import kotlin.test.assertNotNull
import kotlin.test.assertTrue
/**
@@ -99,20 +100,15 @@
val outputZip = File(outputDir, "wud24_sample.corenlp.zip")
assertTrue(outputZip.exists(), "Output ZIP should exist at ${outputZip.path}")
- val constituencyFiles = mutableListOf<String>()
- ProcessBuilder("unzip", "-l", outputZip.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .useLines { lines ->
- lines.forEach { line ->
- if (line.contains("constituency.xml")) {
- constituencyFiles.add(line.trim())
- }
- }
+ // Get all ZIP entries
+ val zipEntries = org.apache.commons.compress.archivers.zip.ZipFile.builder()
+ .setFile(outputZip)
+ .get()
+ .use { zip ->
+ zip.entries.asSequence().map { it.name }.toList()
}
+ val constituencyFiles = zipEntries.filter { it.contains("constituency.xml") }
assertTrue(constituencyFiles.isNotEmpty(), "Should have constituency.xml files in output")
val expectedDocs = listOf(
@@ -126,21 +122,70 @@
assertTrue(found, "Should have constituency.xml for $docPath")
}
- val morphoFiles = mutableListOf<String>()
- ProcessBuilder("unzip", "-l", outputZip.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .useLines { lines ->
- lines.forEach { line ->
- if (line.contains("/corenlp/morpho.xml")) {
- morphoFiles.add(line.trim())
- }
- }
+ val morphoFiles = zipEntries.filter { it.contains("/corenlp/morpho.xml") }
+ assertTrue(morphoFiles.size >= 3, "Should have morpho.xml files for at least 3 documents")
+
+ } finally {
+ outputDir.deleteRecursively()
+ }
+ }
+
+ @Test
+ fun externalFoundryDetection() {
+ val baseZip = loadResource("wdd17sample.zip").path
+ val cmcConlluAnnotation = loadResource("wdd17sample.cmc.conllu").path
+ val outputDir = File.createTempFile("external_foundry_test", "").apply {
+ delete()
+ mkdirs()
+ }
+
+ try {
+ val args = arrayOf(
+ "-f",
+ "-t", "zip",
+ "-q",
+ "-D", outputDir.path,
+ "-j", "1",
+ "-A", "cat > /dev/null; cat $cmcConlluAnnotation",
+ baseZip
+ )
+
+ val exitCode = debug(args)
+ assertEquals(0, exitCode, "External annotation processing should succeed")
+
+ // Check that output ZIP has "cmc" in filename, not "annotated"
+ val outputZip = File(outputDir, "wdd17sample.cmc.zip")
+ assertTrue(outputZip.exists(), "Output ZIP should exist at ${outputZip.path} with 'cmc' foundry name")
+
+ // Verify internal structure contains cmc folders
+ val zipEntries = org.apache.commons.compress.archivers.zip.ZipFile.builder()
+ .setFile(outputZip)
+ .get()
+ .use { zip ->
+ zip.entries.asSequence().map { it.name }.toList()
}
- assertTrue(morphoFiles.size >= 3, "Should have morpho.xml files for at least 3 documents")
+ val cmcFolders = zipEntries.filter { it.contains("/cmc/") }
+ assertTrue(cmcFolders.isNotEmpty(), "Should have cmc folders in output ZIP structure")
+
+ // Verify no "annotated" folders exist
+ val annotatedFolders = zipEntries.filter { it.contains("/annotated/") }
+ assertTrue(annotatedFolders.isEmpty(), "Should NOT have 'annotated' folders in output ZIP structure")
+
+ // Verify morpho.xml contains CMC annotations (EMOASC, EMOIMG, URL)
+ val morphoXmlPath = "WDD17/B06/45592/cmc/morpho.xml"
+ val morphoXml = org.apache.commons.compress.archivers.zip.ZipFile.builder()
+ .setFile(outputZip)
+ .get()
+ .use { zip ->
+ val entry = zip.getEntry(morphoXmlPath)
+ assertNotNull(entry, "Should contain $morphoXmlPath")
+ zip.getInputStream(entry).bufferedReader(java.nio.charset.StandardCharsets.UTF_8).use { it.readText() }
+ }
+
+ assertTrue(morphoXml.contains("EMOASC"), "morpho.xml should contain EMOASC annotations")
+ assertTrue(morphoXml.contains("EMOIMG"), "morpho.xml should contain EMOIMG annotations")
+ assertTrue(morphoXml.contains("URL"), "morpho.xml should contain URL annotations")
} finally {
outputDir.deleteRecursively()