Add conllu2korapxml part
Change-Id: Ic74a2e68e0a6c73a8d3e16ee8bf1b787d51219e2
diff --git a/app/build.gradle b/app/build.gradle
index fa09110..ac3cd4f 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -151,9 +151,11 @@
def targetExec = new File(binDir, "korapxmltool")
def krillExec = new File(binDir, "korapxml2krill")
def conlluExec = new File(binDir, "korapxml2conllu")
+ def conllu2korapxmlExec = new File(binDir, "conllu2korapxml")
outputs.file(targetExec)
outputs.file(krillExec)
outputs.file(conlluExec)
+ outputs.file(conllu2korapxmlExec)
doLast {
def shebang = rootProject.file("korapxmltool.shebang")
@@ -205,6 +207,22 @@
java.nio.file.Files.copy(targetExec.toPath(), conlluExec.toPath())
conlluExec.setExecutable(true, false)
}
+
+ // Create conllu2korapxml symlink for CoNLL-U to KorAP XML ZIP conversion.
+ // Files.deleteIfExists does not follow links, so it also removes a dangling
+ // symlink, which File.exists() would report as absent and leave in place.
+ java.nio.file.Files.deleteIfExists(conllu2korapxmlExec.toPath())
+ try {
+     // Relative target: the link resolves to the korapxmltool file in the same directory
+     java.nio.file.Files.createSymbolicLink(
+         conllu2korapxmlExec.toPath(),
+         java.nio.file.Paths.get("korapxmltool")
+     )
+     println "Created symlink: conllu2korapxml -> korapxmltool"
+ } catch (Exception e) {
+     // Fall back to a plain copy when the link cannot be created
+     println "Warning: Could not create conllu2korapxml symlink (${e.message}), copying instead"
+     java.nio.file.Files.copy(
+         targetExec.toPath(),
+         conllu2korapxmlExec.toPath(),
+         java.nio.file.StandardCopyOption.REPLACE_EXISTING
+     )
+     conllu2korapxmlExec.setExecutable(true, false)
+ }
}
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 149b248..2b391b5 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -71,6 +71,13 @@
" Basic conversion to CoNLL-U format:",
" ./build/bin/korapxmltool app/src/test/resources/wdf19.tree_tagger.zip | head -10",
"",
+ " CoNLL-U to KorAP XML ZIP conversion (auto-detects foundry from comments):",
+ " ./build/bin/conllu2korapxml file.conllu",
+ " cat file.conllu | ./build/bin/conllu2korapxml -o output.zip",
+ " ./build/bin/korapxmltool -t zip -F custom file.conllu",
+ " # Note: Foundry auto-detected from '# foundry = <name>' comment; override with -F",
+ " # Note: Output path auto-inferred (file.conllu → file.zip) or specify with -o",
+ "",
" Word2Vec style output:",
" ./build/bin/korapxmltool -t w2v app/src/test/resources/wud24_sample.zip",
"",
@@ -107,7 +114,7 @@
private var targetZipFileName: String? = null
// Locale is now globally forced to ROOT at startup (see main())
- @Parameters(arity = "1..*", description = ["At least one zip file name"])
+ @Parameters(arity = "0..*", description = ["Input files: KorAP-XML ZIP files or CoNLL-U files (.conllu). If omitted, reads from stdin (requires -o for output path)."])
var zipFileNames: Array<String>? = null
@Option(
@@ -263,6 +270,20 @@
var outputDir: String = "."
@Option(
+ names = ["-o", "--output"],
+ paramLabel = "FILE",
+ description = ["Output file path (for CoNLL-U to ZIP conversion). Required when reading from stdin."]
+ )
+ var outputFile: String? = null
+
+ @Option(
+ names = ["-F", "--foundry"],
+ paramLabel = "FOUNDRY",
+ description = ["Override foundry name for CoNLL-U input (default: auto-detect from '# foundry = <name>' comment)"]
+ )
+ var foundryOverride: String? = null
+
+ @Option(
names = ["--mem-stats-interval"],
paramLabel = "N",
description = ["Log memory and cache statistics every N processed documents (0 disables; default: 0)"]
@@ -504,6 +525,73 @@
})
}
+ // CoNLL-U to KorAP XML ZIP conversion mode: entered when no input files are
+ // given (stdin) or when at least one input name ends in ".conllu".
+ val inputNames = zipFileNames.orEmpty()
+ val isConlluInput = inputNames.isEmpty() || inputNames.any { it.endsWith(".conllu") }
+
+ if (isConlluInput) {
+     // Validate: CoNLL-U mode requires -t zip (default or explicit)
+     if (outputFormat != OutputFormat.KORAP_XML) {
+         throw ParameterException(spec.commandLine(),
+             "CoNLL-U input requires output format 'zip' (use -t zip or invoke as 'conllu2korapxml')")
+     }
+
+     // Explicit -o target; when an output directory other than "." is set,
+     // only the file name of -o is kept and placed inside that directory.
+     val explicitOutput = outputFile?.let { requested ->
+         if (outputDir != ".") File(outputDir, File(requested).name).path else requested
+     }
+
+     when {
+         // Case 1: stdin input (no files specified)
+         inputNames.isEmpty() -> {
+             val finalOutputPath = explicitOutput ?: throw ParameterException(spec.commandLine(),
+                 "Reading from stdin requires -o/--output to specify output file path")
+             LOGGER.info("Converting CoNLL-U from stdin to: $finalOutputPath")
+             convertConlluToZip(System.`in`, finalOutputPath)
+             return 0
+         }
+
+         // Case 2: CoNLL-U file(s) specified
+         inputNames.all { it.endsWith(".conllu") } -> {
+             // A single -o target cannot receive several inputs: every conversion
+             // would write to the same path and fail on, or overwrite, the previous one.
+             if (explicitOutput != null && inputNames.size > 1) {
+                 throw ParameterException(spec.commandLine(),
+                     "-o/--output can only be used with a single CoNLL-U input file " +
+                         "(omit -o to derive one ZIP per input)")
+             }
+             inputNames.forEach { conlluFile ->
+                 val outputPath = explicitOutput ?: run {
+                     // Auto-infer from input filename: file.conllu → file.zip
+                     val inferred = conlluFile.removeSuffix(".conllu") + ".zip"
+                     if (outputDir != ".") File(outputDir, File(inferred).name).path else inferred
+                 }
+                 LOGGER.info("Converting CoNLL-U file: $conlluFile → $outputPath")
+                 FileInputStream(conlluFile).use { inputStream ->
+                     convertConlluToZip(inputStream, outputPath)
+                 }
+             }
+             return 0
+         }
+
+         // Case 3: Mixed input (some .conllu, some .zip) - not supported
+         else -> {
+             throw ParameterException(spec.commandLine(),
+                 "Cannot mix CoNLL-U (.conllu) and ZIP files in the same invocation")
+         }
+     }
+ }
+
+ // Normal ZIP processing mode
LOGGER.info("Processing zip files: " + zipFileNames!!.joinToString(", "))
korapxml2conllu(zipFileNames!!)
@@ -3265,6 +3353,298 @@
sentences.remove(tempDocId)
}
+ /**
+  * Converts CoNLL-U input into a KorAP XML ZIP archive written to [outputPath].
+  *
+  * - The input is split into documents at every "# text_id = <id>" comment.
+  * - The foundry is the -F/--foundry override when given, otherwise the value of the
+  *   most recent "# foundry = <name>" comment. With an override, the comment is optional.
+  * - Per document a "<foundry>/morpho.xml" entry is written, plus
+  *   "<foundry>/dependency.xml" when at least one token has both a head and a deprel.
+  *   Foundry names per layer are resolved through getFoundryForLayer
+  *   (combined foundries, e.g. "marmot-malt" → marmot/morpho.xml + malt/dependency.xml).
+  * - Entry paths derive from the text id: WUD24_I0083.95367 → WUD24/I0083/95367.
+  * - Offsets come from "# start_offsets =" / "# end_offsets =" comments: the first
+  *   number is the sentence offset, the remaining numbers are per-token offsets.
+  *
+  * @param inputStream CoNLL-U text, decoded as UTF-8; the stream is closed after reading
+  * @param outputPath path of the ZIP file to create
+  * @throws IllegalArgumentException if no document can be identified, an offset comment
+  *   contains no numbers, or a token line precedes its offset comments
+  * @throws IOException if [outputPath] already exists and overwriting is not enabled
+  */
+ private fun convertConlluToZip(inputStream: InputStream, outputPath: String) {
+     LOGGER.info("Converting CoNLL-U to KorAP XML ZIP: $outputPath")
+
+     // Initialize DocumentBuilder for XML generation
+     if (dBuilder == null) {
+         dbFactory = DocumentBuilderFactory.newInstance()
+         dBuilder = dbFactory!!.newDocumentBuilder()
+     }
+
+     // Parse text_id to derive directory path: WUD24_I0083.95367 → WUD24/I0083/95367
+     fun textIdToPath(textId: String): String {
+         val parts = textId.split('_', limit = 2)
+         return if (parts.size < 2) {
+             textId.replace('.', '/')
+         } else {
+             "${parts[0]}/${parts[1].replace('.', '/')}"
+         }
+     }
+
+     data class ConlluDocument(
+         val textId: String,
+         val foundry: String,
+         val lines: List<String>
+     )
+
+     val documents = mutableListOf<ConlluDocument>()
+     var currentTextId: String? = null
+     // Seeded with the -F override so input without a "# foundry" comment still converts
+     var currentFoundry: String? = foundryOverride
+     var currentLines = mutableListOf<String>()
+
+     // Stores the document collected so far, if it is complete, and starts a fresh line buffer
+     fun flushDocument() {
+         val textId = currentTextId
+         val foundry = currentFoundry
+         if (textId != null && foundry != null && currentLines.isNotEmpty()) {
+             documents.add(ConlluDocument(textId, foundry, currentLines.toList()))
+             currentLines = mutableListOf()
+         }
+     }
+
+     // Read all input and split into documents
+     BufferedReader(InputStreamReader(inputStream, StandardCharsets.UTF_8)).forEachLine { line ->
+         when {
+             line.startsWith("# text_id = ") -> {
+                 flushDocument()
+                 currentTextId = line.substring("# text_id = ".length).trim()
+             }
+             line.startsWith("# foundry = ") -> {
+                 currentFoundry = foundryOverride ?: line.substring("# foundry = ".length).trim()
+             }
+             else -> {
+                 currentLines.add(line)
+             }
+         }
+     }
+     flushDocument()
+
+     if (documents.isEmpty()) {
+         LOGGER.severe("No documents found in CoNLL-U input (missing '# text_id' and '# foundry' comments)")
+         throw IllegalArgumentException("Invalid CoNLL-U format: missing required comments '# text_id' and '# foundry'")
+     }
+
+     LOGGER.info("Found ${documents.size} document(s) in CoNLL-U input")
+
+     // Named zipFile (not outputFile) to avoid shadowing the -o/--output option property
+     val zipFile = File(outputPath)
+     if (zipFile.exists() && !overwrite) {
+         LOGGER.severe("Output file already exists: $outputPath (use -f to overwrite)")
+         throw IOException("Output file already exists: $outputPath")
+     }
+
+     val whitespace = Regex("\\s+")
+
+     // Integers following the given comment prefix; non-numeric fields are ignored
+     fun parseOffsets(line: String, prefix: String): List<Int> =
+         line.substring(prefix.length).trim().split(whitespace).mapNotNull { it.toIntOrNull() }
+
+     var writtenDocuments = 0
+     ZipArchiveOutputStream(BufferedOutputStream(FileOutputStream(zipFile))).use { zipOutputStream ->
+         zipOutputStream.setUseZip64(Zip64Mode.AsNeeded)
+
+         for (doc in documents) {
+             LOGGER.fine("Processing document: ${doc.textId}, foundry: ${doc.foundry}")
+
+             // Parse CoNLL-U content
+             val morphoSpans = mutableMapOf<String, MorphoSpan>()
+             val sentenceSpans = mutableListOf<Span>()
+             var currentStartOffsets: List<Int>? = null
+             var currentEndOffsets: List<Int>? = null
+             var sentenceStartOffset: Int? = null
+             var sentenceEndOffset: Int? = null
+             var tokenIndexInSentence = 0
+             var tokensWithoutOffsets = 0
+
+             for (line in doc.lines) {
+                 when {
+                     line.startsWith("# start_offsets =") -> {
+                         val allOffsets = parseOffsets(line, "# start_offsets =")
+                         if (allOffsets.isEmpty()) {
+                             LOGGER.severe("Missing start_offsets for text ${doc.textId}")
+                             throw IllegalArgumentException("CoNLL-U format error: missing start_offsets for text ${doc.textId}")
+                         }
+                         sentenceStartOffset = allOffsets.first()
+                         currentStartOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else allOffsets
+                         tokenIndexInSentence = 0
+                     }
+                     line.startsWith("# end_offsets =") -> {
+                         val allOffsets = parseOffsets(line, "# end_offsets =")
+                         if (allOffsets.isEmpty()) {
+                             LOGGER.severe("Missing end_offsets for text ${doc.textId}")
+                             throw IllegalArgumentException("CoNLL-U format error: missing end_offsets for text ${doc.textId}")
+                         }
+                         sentenceEndOffset = allOffsets.first()
+                         // NOTE(review): a lone number yields no token end offsets here, whereas a lone
+                         // start offset is reused as token offset above - confirm the asymmetry is intended
+                         currentEndOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else emptyList()
+                     }
+                     line.isEmpty() -> {
+                         // Sentence boundary
+                         val from = sentenceStartOffset
+                         val to = sentenceEndOffset
+                         if (from != null && to != null) {
+                             sentenceSpans.add(Span(from, to))
+                         }
+                         sentenceStartOffset = null
+                         sentenceEndOffset = null
+                         currentStartOffsets = null
+                         currentEndOffsets = null
+                         tokenIndexInSentence = 0
+                     }
+                     !line.startsWith("#") -> {
+                         val fields = line.split("\t")
+                         // Lines with fewer than the 10 CoNLL-U columns are ignored
+                         if (fields.size < 10) continue
+
+                         val starts = currentStartOffsets
+                         val ends = currentEndOffsets
+                         if (starts == null || ends == null) {
+                             LOGGER.severe("Token found before offset comments in text ${doc.textId}")
+                             throw IllegalArgumentException("CoNLL-U format error: tokens found before offset comments in text ${doc.textId}")
+                         }
+
+                         if (tokenIndexInSentence < starts.size && tokenIndexInSentence < ends.size) {
+                             val spanFrom = starts[tokenIndexInSentence]
+                             val spanTo = ends[tokenIndexInSentence]
+                             // Columns 2..9: lemma, upos, xpos, feats, head, deprel, deps, misc
+                             morphoSpans["$spanFrom-$spanTo"] = MorphoSpan(
+                                 fields[2], fields[3], fields[4], fields[5],
+                                 fields[6], fields[7], fields[8], fields[9]
+                             )
+                             tokenIndexInSentence++
+                         } else {
+                             // More token lines than offsets: the token cannot be placed
+                             tokensWithoutOffsets++
+                         }
+                     }
+                 }
+             }
+
+             if (tokensWithoutOffsets > 0) {
+                 LOGGER.warning("Ignored $tokensWithoutOffsets token(s) without matching offsets in text ${doc.textId}")
+             }
+
+             // Capture final sentence if not ended with empty line
+             val lastFrom = sentenceStartOffset
+             val lastTo = sentenceEndOffset
+             if (lastFrom != null && lastTo != null) {
+                 sentenceSpans.add(Span(lastFrom, lastTo))
+             }
+
+             if (morphoSpans.isEmpty()) {
+                 LOGGER.warning("No morpho spans found for text ${doc.textId}, skipping")
+                 continue
+             }
+
+             // Determine which layers to generate based on foundry and content
+             val hasDependencies = morphoSpans.values.any { span ->
+                 span.head != null && span.head != "_" && span.deprel != null && span.deprel != "_"
+             }
+
+             // Get foundry names for each layer (handles combined foundries like "marmot-malt")
+             val morphoFoundry = getFoundryForLayer(doc.foundry, "morpho")
+             val dependencyFoundry = if (hasDependencies) getFoundryForLayer(doc.foundry, "dependency") else null
+
+             val tempDocId = "_temp_conllu_${doc.textId}"
+             try {
+                 // Store data in temp maps for XML generation
+                 morpho[tempDocId] = morphoSpans
+                 sentences[tempDocId] = if (sentenceSpans.isNotEmpty()) {
+                     sentenceSpans.toTypedArray()
+                 } else {
+                     // Fallback: create single sentence spanning all tokens
+                     val minOffset = morphoSpans.keys.minOfOrNull { it.split("-")[0].toInt() } ?: 0
+                     val maxOffset = morphoSpans.keys.maxOfOrNull { it.split("-")[1].toInt() } ?: 0
+                     arrayOf(Span(minOffset, maxOffset))
+                 }
+
+                 // morpho.xml is always written; dependency.xml only if dependencies are present
+                 val layers = listOfNotNull(
+                     morphoFoundry to "morpho",
+                     dependencyFoundry?.let { it to "dependency" }
+                 )
+
+                 for ((layerFoundry, layerName) in layers) {
+                     try {
+                         val entryPath = "${textIdToPath(doc.textId)}/$layerFoundry/$layerName.xml"
+
+                         val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
+                             docId = tempDocId,
+                             foundry = layerFoundry,
+                             tokens = getTokenSpansFromMorho(morphoSpans),
+                             sentences = sentences[tempDocId],
+                             text = null,
+                             morpho = morpho[tempDocId],
+                             metadata = null,
+                             extraFeatures = null,
+                             fileName = null,
+                             useLemma = useLemma,
+                             extractMetadataRegex = extractMetadataRegex,
+                             extractAttributesRegex = extractAttributesRegex,
+                             columns = columns,
+                             constituencyTrees = null,
+                             includeOffsetsInMisc = false,
+                             compatibilityMode = COMPATIBILITY_MODE,
+                             tokenSeparator = tokenSeparator
+                         )
+
+                         val renderedXml = if (layerName == "morpho") {
+                             KorapXmlFormatter.formatMorpho(context, dBuilder!!).toString()
+                         } else {
+                             KorapXmlFormatter.formatDependency(context, dBuilder!!).toString()
+                         }
+                         // The formatter saw the temporary id; restore the real text id
+                         val xmlBytes = renderedXml
+                             .replace("docid=\"$tempDocId\"", "docid=\"${doc.textId}\"")
+                             .toByteArray(StandardCharsets.UTF_8)
+
+                         val zipEntry = ZipArchiveEntry(entryPath)
+                         zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
+                         zipOutputStream.putArchiveEntry(zipEntry)
+                         zipOutputStream.write(xmlBytes)
+                         zipOutputStream.closeArchiveEntry()
+
+                         LOGGER.fine("Wrote $entryPath (${xmlBytes.size} bytes)")
+                     } catch (e: Exception) {
+                         LOGGER.severe("ERROR generating $layerName.xml for ${doc.textId}: ${e.message}")
+                         throw e
+                     }
+                 }
+             } finally {
+                 // Cleanup temp data, also when generation failed
+                 morpho.remove(tempDocId)
+                 sentences.remove(tempDocId)
+             }
+             writtenDocuments++
+         }
+     }
+
+     LOGGER.info("Successfully wrote $writtenDocuments document(s) to $outputPath")
+ }
+
// Collect structural spans from structure.xml for krill format
private fun collectKrillStructureSpans(docId: String, spans: NodeList) {
// Skip if already output (thread-safe check with ConcurrentHashMap.KeySet)
@@ -4468,6 +4848,35 @@
System.err.println("korapxml2conllu compatibility mode: using conllu format")
newArgs.toTypedArray()
}
+ "conllu2korapxml" -> {
+     // conllu2korapxml (CoNLL-U → KorAP XML ZIP) always runs with output format "zip"
+     val formatFlags = setOf("-t", "--to")
+     val rewritten = mutableListOf<String>()
+
+     // No format flag given: put "-t zip" in front of the user's arguments
+     if (args.none { it in formatFlags }) {
+         rewritten += "-t"
+         rewritten += "zip"
+     }
+
+     var index = 0
+     while (index < args.size) {
+         val current = args[index]
+         rewritten += current
+         // A format flag followed by a value: drop the user's value and force "zip"
+         if (current in formatFlags && index + 1 < args.size) {
+             index++
+             rewritten += "zip"
+         }
+         index++
+     }
+
+     System.err.println("conllu2korapxml mode: converting CoNLL-U to KorAP XML ZIP")
+     rewritten.toTypedArray()
+ }
else -> args
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt
new file mode 100644
index 0000000..c95e6ee
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt
@@ -0,0 +1,388 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.After
+import org.junit.Before
+import java.io.ByteArrayOutputStream
+import java.io.File
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFalse
+import kotlin.test.assertTrue
+
+/**
+ * Tests for CoNLL-U to KorAP XML ZIP conversion functionality.
+ *
+ * Each test runs the tool through debug(args) and inspects the produced archive
+ * with java.util.zip, so no external "unzip" binary has to be installed.
+ */
+class ConlluConversionTest {
+    private val spacySample = "wud24_sample.spacy.conllu"
+    private val outContent = ByteArrayOutputStream(10000000)
+    private val errContent = ByteArrayOutputStream()
+    private val originalOut: PrintStream = System.out
+    private val originalErr: PrintStream = System.err
+
+    @Before
+    fun setUpStreams() {
+        System.setOut(PrintStream(outContent))
+        System.setErr(PrintStream(errContent))
+    }
+
+    @After
+    fun restoreStreams() {
+        System.setOut(originalOut)
+        System.setErr(originalErr)
+    }
+
+    private fun loadResource(path: String): URL =
+        requireNotNull(Thread.currentThread().contextClassLoader.getResource(path)) {
+            "Resource $path not found"
+        }
+
+    /** File-system path of a test resource; going through the URI decodes escapes such as %20. */
+    private fun resourcePath(name: String): String = File(loadResource(name).toURI()).path
+
+    private fun createTempDir(prefix: String): File =
+        java.nio.file.Files.createTempDirectory(prefix).toFile()
+
+    /** Runs [body] with a fresh temporary directory that is removed afterwards. */
+    private fun withTempDir(prefix: String, body: (File) -> Unit) {
+        val dir = createTempDir(prefix)
+        try {
+            body(dir)
+        } finally {
+            dir.deleteRecursively()
+        }
+    }
+
+    /** Converts a test resource to [outputZip] with "-t zip [extraArgs] -o <zip> <input>"; returns the exit code. */
+    private fun convert(outputZip: File, resource: String, vararg extraArgs: String): Int =
+        debug(arrayOf("-t", "zip", *extraArgs, "-o", outputZip.path, resourcePath(resource)))
+
+    @Test
+    fun canConvertBasicConlluToZip() {
+        withTempDir("conllu_basic") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            val exitCode = convert(outputZip, spacySample)
+            assertEquals(0, exitCode, "CoNLL-U conversion should succeed")
+            assertTrue(outputZip.exists(), "Output ZIP should be created")
+            assertTrue(outputZip.length() > 0, "Output ZIP should not be empty")
+        }
+    }
+
+    @Test
+    fun conlluZipContainsMorphoXml() {
+        withTempDir("conllu_morpho") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            convert(outputZip, spacySample)
+
+            val zipEntries = extractZipFileList(outputZip)
+            assertTrue(zipEntries.any { it.contains("spacy/morpho.xml") },
+                "ZIP should contain morpho.xml files")
+        }
+    }
+
+    @Test
+    fun conlluZipContainsDependencyXml() {
+        withTempDir("conllu_dependency") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            convert(outputZip, spacySample)
+
+            val zipEntries = extractZipFileList(outputZip)
+            assertTrue(zipEntries.any { it.contains("spacy/dependency.xml") },
+                "ZIP should contain dependency.xml files when dependencies present")
+        }
+    }
+
+    @Test
+    fun canAutoInferOutputFilename() {
+        withTempDir("conllu_autoinfer") { outputDir ->
+            // Copy test file to temp dir
+            val inputFile = File(outputDir, "test.conllu")
+            File(resourcePath(spacySample)).copyTo(inputFile)
+
+            val exitCode = debug(arrayOf("-t", "zip", "-D", outputDir.path, inputFile.path))
+            assertEquals(0, exitCode, "CoNLL-U conversion should succeed")
+
+            val outputZip = File(outputDir, "test.zip")
+            assertTrue(outputZip.exists(), "Output ZIP should be auto-inferred as test.zip")
+        }
+    }
+
+    @Test
+    fun respectsOutputDirOption() {
+        withTempDir("conllu_output_dir") { outputDir ->
+            val inputFile = File(outputDir, "input.conllu")
+            File(resourcePath(spacySample)).copyTo(inputFile)
+
+            debug(arrayOf("-t", "zip", "-D", outputDir.path, inputFile.path))
+
+            val outputZip = File(outputDir, "input.zip")
+            assertTrue(outputZip.exists(), "Output should be in specified directory")
+        }
+    }
+
+    @Test
+    fun canHandleCombinedFoundries() {
+        withTempDir("conllu_combined") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            val exitCode = convert(outputZip, "wud24_sample.marmot-malt.conllu")
+            assertEquals(0, exitCode, "Combined foundry conversion should succeed")
+
+            val zipEntries = extractZipFileList(outputZip)
+            assertTrue(zipEntries.any { it.contains("marmot/morpho.xml") },
+                "ZIP should contain marmot morpho.xml")
+            assertTrue(zipEntries.any { it.contains("malt/dependency.xml") },
+                "ZIP should contain malt dependency.xml")
+        }
+    }
+
+    @Test
+    fun canOverrideFoundryName() {
+        withTempDir("conllu_override") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            val exitCode = convert(outputZip, spacySample, "-F", "custom")
+            assertEquals(0, exitCode, "Foundry override conversion should succeed")
+
+            val zipEntries = extractZipFileList(outputZip)
+            assertTrue(zipEntries.any { it.contains("custom/morpho.xml") },
+                "ZIP should contain custom foundry morpho.xml")
+            assertFalse(zipEntries.any { it.contains("spacy/morpho.xml") },
+                "ZIP should not contain original spacy foundry")
+        }
+    }
+
+    @Test
+    fun canConvertFromStdin() {
+        withTempDir("conllu_stdin") { outputDir ->
+            val outputZip = File(outputDir, "stdin_output.zip")
+            val originalIn = System.`in`
+
+            // Feed the sample through a redirected stdin; no input file argument is passed
+            File(resourcePath(spacySample)).inputStream().use { inputStream ->
+                try {
+                    System.setIn(inputStream)
+                    val exitCode = debug(arrayOf("-t", "zip", "-o", outputZip.path))
+                    assertEquals(0, exitCode, "Stdin conversion should succeed")
+                    assertTrue(outputZip.exists(), "Output ZIP should be created from stdin")
+                } finally {
+                    System.setIn(originalIn)
+                }
+            }
+        }
+    }
+
+    @Test
+    fun validatesRequiredTextId() {
+        withTempDir("conllu_validation") { outputDir ->
+            // Create invalid CoNLL-U without text_id; the token columns are joined with
+            // explicit tabs so the fixture does not depend on invisible whitespace
+            val tokenLine = listOf("1", "Test", "test", "NOUN", "NN", "_", "0", "ROOT", "_", "_")
+                .joinToString("\t")
+            val invalidConllu = File(outputDir, "invalid.conllu")
+            invalidConllu.writeText(
+                listOf(
+                    "# foundry = test",
+                    "# start_offsets = 0 5",
+                    "# end_offsets = 4 10",
+                    tokenLine,
+                    ""
+                ).joinToString("\n")
+            )
+
+            val outputZip = File(outputDir, "output.zip")
+            val exitCode = debug(arrayOf("-t", "zip", "-o", outputZip.path, invalidConllu.path))
+            assertTrue(exitCode != 0, "Should fail without text_id")
+        }
+    }
+
+    @Test
+    fun handlesMultipleDocuments() {
+        withTempDir("conllu_multidoc") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            val exitCode = convert(outputZip, spacySample)
+            assertEquals(0, exitCode, "Multi-document conversion should succeed")
+
+            val zipEntries = extractZipFileList(outputZip)
+            // The sample file has 3 documents: WUD24_I0083.95367, WUD24_K0086.98010, WUD24_Z0087.65594
+            assertTrue(zipEntries.any { it.contains("WUD24/I0083/95367") },
+                "Should contain first document")
+            assertTrue(zipEntries.any { it.contains("WUD24/K0086/98010") },
+                "Should contain second document")
+            assertTrue(zipEntries.any { it.contains("WUD24/Z0087/65594") },
+                "Should contain third document")
+        }
+    }
+
+    @Test
+    fun createsValidKorapXmlStructure() {
+        withTempDir("conllu_xml_validation") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            val exitCode = convert(outputZip, spacySample)
+            assertEquals(0, exitCode)
+
+            // Check ZIP contains expected files
+            val zipEntries = extractZipFileList(outputZip)
+            assertTrue(zipEntries.any { it.contains("WUD24/I0083/95367/spacy/morpho.xml") },
+                "ZIP should contain morpho.xml")
+            assertTrue(zipEntries.any { it.contains("WUD24/I0083/95367/spacy/dependency.xml") },
+                "ZIP should contain dependency.xml")
+        }
+    }
+
+    @Test
+    fun morphoXmlContainsLexicalFeatures() {
+        withTempDir("conllu_morpho_features") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            convert(outputZip, spacySample)
+
+            val morphoXml = extractFileFromZip(outputZip, "WUD24/I0083/95367/spacy/morpho.xml")
+            assertTrue(morphoXml.length > 100, "Morpho XML should have substantial content")
+            assertTrue(morphoXml.contains("lemma") && morphoXml.contains("upos"),
+                "Morpho XML should contain lemma and upos fields")
+        }
+    }
+
+    @Test
+    fun dependencyXmlContainsDependencyRelations() {
+        withTempDir("conllu_dep_relations") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            convert(outputZip, spacySample)
+
+            val depXml = extractFileFromZip(outputZip, "WUD24/I0083/95367/spacy/dependency.xml")
+            assertTrue(depXml.length > 100, "Dependency XML should have substantial content")
+            assertTrue(depXml.contains("deprel") || depXml.contains("label"),
+                "Dependency XML should contain dependency relations")
+        }
+    }
+
+    @Test
+    fun xmlContainsCorrectDocumentId() {
+        withTempDir("conllu_docid") { outputDir ->
+            val outputZip = File(outputDir, "output.zip")
+            convert(outputZip, spacySample)
+
+            val morphoXml = extractFileFromZip(outputZip, "WUD24/I0083/95367/spacy/morpho.xml")
+            assertTrue(morphoXml.contains("WUD24_I0083.95367"),
+                "XML should contain correct document ID")
+        }
+    }
+
+    /** Names of all entries in [zipFile]. */
+    private fun extractZipFileList(zipFile: File): List<String> =
+        java.util.zip.ZipFile(zipFile).use { zip ->
+            zip.entries().asSequence().map { it.name }.toList()
+        }
+
+    /** Text content of entry [filePath] in [zipFile]; throws RuntimeException if the entry is missing. */
+    private fun extractFileFromZip(zipFile: File, filePath: String): String =
+        java.util.zip.ZipFile(zipFile).use { zip ->
+            val entry = zip.getEntry(filePath)
+                ?: throw RuntimeException("Failed to extract $filePath from $zipFile: entry not found")
+            zip.getInputStream(entry).bufferedReader().use { it.readText() }
+        }
+}
diff --git a/app/src/test/resources/wud24_sample.spacy.zip b/app/src/test/resources/wud24_sample.spacy.zip
index d65720a..8e88c44 100644
--- a/app/src/test/resources/wud24_sample.spacy.zip
+++ b/app/src/test/resources/wud24_sample.spacy.zip
Binary files differ