Split tests into format- and feature-specific classes

Break the monolithic KorapXmlToolTest up into focused test classes,
among them ConlluFormatterTest, GeneralFeaturesTest,
KorapXmlFormatterTest and KrillJsonGeneratorTest.

Change-Id: Ia9d88ee1fe41fd4fa55c8472c520171dafc112df
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluFormatterTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluFormatterTest.kt
new file mode 100644
index 0000000..9d300c6
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluFormatterTest.kt
@@ -0,0 +1,201 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.After
+import org.junit.Before
+import java.io.ByteArrayOutputStream
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertContains
+import kotlin.test.assertEquals
+import kotlin.test.assertFalse
+import kotlin.test.assertTrue
+
+/**
+ * Tests for CoNLL-U format output (default format)
+ */
+class ConlluFormatterTest {
+ private val outContent = ByteArrayOutputStream(10000000)
+ private val errContent = ByteArrayOutputStream()
+ private val originalOut: PrintStream = System.out
+ private val originalErr: PrintStream = System.err
+
+ @Before
+ fun setUpStreams() {
+ System.setOut(PrintStream(outContent))
+ System.setErr(PrintStream(errContent))
+ }
+
+ @After
+ fun restoreStreams() {
+ System.setOut(originalOut)
+ System.setErr(originalErr)
+ }
+
+ private fun loadResource(path: String): URL {
+ val resource = Thread.currentThread().contextClassLoader.getResource(path)
+ requireNotNull(resource) { "Resource $path not found" }
+ return resource
+ }
+
+ @Test
+ fun canConvertBasicZipToConllu() {
+ val args = arrayOf(loadResource("goe.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "# foundry = base")
+ assertContains(
+ outContent.toString(),
+ "# start_offsets = 55 55 59 63 70 75 82 87 94 102 105 111 120 124 130 134 140 144 151 153 163 175 187 191 207 209 213 218 222 239 248 255 259 264 267 271 277 283 297 307"
+ )
+ }
+
+ @Test
+ fun canConvertWithMorphoAnnotations() {
+ val args = arrayOf(loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "# foundry = tree_tagger")
+ assertContains(outContent.toString(), "9\tentzücke\tentzücken\t_\tVVFIN\t_\t_\t_\t_\t1.000000")
+ }
+
+ @Test
+ fun canInferFoundryFromFilename() {
+ val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
+ val args = arrayOf(goeTreeTagger)
+ debug(args)
+ assertContains(outContent.toString(), "# foundry = tree_tagger")
+ assertContains(outContent.toString(), "9\tentzücke\tentzücken\t_\tVVFIN\t_\t_\t_\t_\t1.000000")
+ }
+
+ @Test
+ fun canConvertWithFrenchAnnotations() {
+ val args = arrayOf(loadResource("wdf19.tree_tagger.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "# foundry = tree_tagger")
+ assertContains(outContent.toString(), "\tvraie\tvrai\t_\tADJ\t_\t_\t_\t_\t")
+ }
+
+ @Test
+ fun respectsSiglePattern() {
+ val args = arrayOf("-p", ".*7", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "# text_id = WDF19_A0000.14247")
+ assertFalse { outContent.toString().contains("WDF19_A0000.13865") }
+ }
+
+ @Test
+ fun respectsColumnsParam() {
+ val args = arrayOf("-c", "5", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "42\tparfaitement\t_\t_\t_\n")
+ }
+
+ @Test
+ fun respectsSpecial1ColumnsParam() {
+ val args = arrayOf("-c", "1", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "\nparfaitement\n")
+ }
+
+ @Test
+ fun canConvertMultipleZips() {
+ val wdf19 = loadResource("wdf19.zip").path
+ val goe = loadResource("goe.zip").path
+ val args = arrayOf(wdf19, goe)
+ debug(args)
+ assertContains(outContent.toString(), "6\tautomatique\t_\t_\t_\t_\t_\t_\t_\t_\n")
+ assertContains(outContent.toString(), "36\tGedanken\t_\t_\t_\t_\t_\t_\t_\t_\n")
+ }
+
+ @Test
+ fun canConvertMorphoFeatureAnnotations() {
+ val goeMarmot = loadResource("goe.marmot.zip").path
+ val args = arrayOf(goeMarmot)
+ debug(args)
+ assertContains(
+ outContent.toString(),
+ "9\tentzücke\t_\t_\tVVFIN\tnumber=sg|person=3|tense=pres|mood=subj\t_\t_\t_\t_\n"
+ )
+ }
+
+ @Test
+ fun dependencyColumnsArePopulatedFromSpacyZip() {
+ val goeSpacy = loadResource("goe.spacy.zip").path
+ val args = arrayOf(goeSpacy)
+ debug(args)
+ val out = outContent.toString()
+
+ assertContains(out, "# foundry = spacy")
+ assertContains(out, "# text_id = GOE_AGA.00000")
+
+ val dataLines = out.lines().filter { !it.startsWith("#") && it.isNotBlank() }
+ assertTrue(dataLines.isNotEmpty(), "Should have data lines in output")
+
+ var tokensWithHead = 0
+ var tokensWithDeprel = 0
+ var totalTokens = 0
+
+ for (line in dataLines) {
+ val columns = line.split(Regex("\\s+"))
+ if (columns.size >= 8) {
+ totalTokens++
+ val head = columns[6]
+ val deprel = columns[7]
+ if (head != "_") tokensWithHead++
+ if (deprel != "_") tokensWithDeprel++
+ }
+ }
+
+ assertTrue(totalTokens > 0, "Should have parsed at least some tokens")
+
+ val headCoverage = (tokensWithHead.toDouble() / totalTokens) * 100
+ assertTrue(
+ headCoverage > 40.0,
+ "HEAD column should be populated for significant portion of tokens. Found: $tokensWithHead/$totalTokens (${headCoverage}%)"
+ )
+
+ val deprelCoverage = (tokensWithDeprel.toDouble() / totalTokens) * 100
+ assertTrue(
+ deprelCoverage > 40.0,
+ "DEPREL column should be populated for significant portion of tokens. Found: $tokensWithDeprel/$totalTokens (${deprelCoverage}%)"
+ )
+
+ assertTrue(
+ out.contains(Regex("\\n\\d+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\d+\\t\\S+\\t")),
+ "Should find tokens with numeric HEAD values in column 7"
+ )
+ }
+
+ @Test
+ fun conlluIncludesConstituencyCommentsWhenAvailable() {
+ outContent.reset()
+ errContent.reset()
+
+ val wud24Corenlp = loadResource("wud24_sample.corenlp.zip").path
+ val args = arrayOf(wud24Corenlp)
+ val exitCode = debug(args)
+ assertEquals(0, exitCode, "CoNLL-U conversion should succeed when constituency annotations are present")
+
+ val output = outContent.toString("UTF-8")
+ val constituencyLines = output.lineSequence().filter { it.startsWith("# constituency =") }.toList()
+
+ assertTrue(constituencyLines.isNotEmpty(), "CoNLL-U output should include constituency comment lines")
+ assertTrue(
+ constituencyLines.first().contains("("),
+ "Constituency comment should contain bracketed structure"
+ )
+ }
+
+ @Test
+ fun canExtractExtraFeaturesByRegex() {
+ val args = arrayOf("-e", "(posting/id|div/id)", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(
+ outContent.toString(),
+ "12\t)\t_\t_\t_\t_\t_\t_\t_\t_\n" +
+ "# div/id = i.14293_8\n" +
+ "13\tDifférentiation\t_\t_\t_\t_\t_\t_\t_\t_\n" +
+ "# posting/id = i.14293_8_1\n" +
+ "14\tAinsi\t_\t_\t_\t_\t_\t_\t_\t_\n"
+ )
+ }
+}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
new file mode 100644
index 0000000..ad43606
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
@@ -0,0 +1,113 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.After
+import org.junit.Before
+import java.io.ByteArrayOutputStream
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertContains
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * Tests for general tool features: help, logging, annotation, and text ID sorting
+ */
+class GeneralFeaturesTest {
+ private val outContent = ByteArrayOutputStream(10000000)
+ private val errContent = ByteArrayOutputStream()
+ private val originalOut: PrintStream = System.out
+ private val originalErr: PrintStream = System.err
+
+ @Before
+ fun setUpStreams() {
+ System.setOut(PrintStream(outContent))
+ System.setErr(PrintStream(errContent))
+ }
+
+ @After
+ fun restoreStreams() {
+ System.setOut(originalOut)
+ System.setErr(originalErr)
+ }
+
+ private fun loadResource(path: String): URL {
+ val resource = Thread.currentThread().contextClassLoader.getResource(path)
+ requireNotNull(resource) { "Resource $path not found" }
+ return resource
+ }
+
+ @Test
+ fun canPrintHelp() {
+ debug(arrayOf("-h"))
+ assertContains(outContent.toString(), "--s-bounds-from-morpho")
+ }
+
+ @Test
+ fun canSetLogLevel() {
+ val args = arrayOf("-l", "info", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(errContent.toString(), "Processing zip file")
+ }
+
+ @Test
+ fun canAnnotate() {
+ val args = arrayOf("-A", "sed -e 's/u/x/g'", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "axtomatiqxe")
+ assertTrue(
+ "Annotated CoNLL-U should have at least as many lines as the original, but only has ${
+ outContent.toString().count { it == '\n' }
+ } lines"
+ ) { outContent.toString().count { it == '\n' } >= 61511 }
+ }
+
+ @Test
+ fun monthAwareComparatorOrdersCalendarMonths() {
+ val tool = KorapXmlTool()
+ assertTrue(tool.compareTextIds("ZGE24_JAN.00001", "ZGE24_MAR.00001") < 0, "JAN should sort before MAR")
+ assertTrue(tool.compareTextIds("ZGE24_MRZ.00001", "ZGE24_APR.00001") < 0, "MRZ should sort before APR")
+ assertTrue(tool.compareTextIds("ZGE24_OKT.00001", "ZGE24_SEP.00001") > 0, "OKT should sort after SEP")
+ assertTrue(tool.compareTextIds("ZGE24_DEZ.00001", "ZGE24_NOV.00001") > 0, "DEZ should sort after NOV")
+ assertTrue(tool.compareTextIds("ZGE24_MAI.00001", "ZGE24_JUL.00001") < 0, "MAI should sort before JUL")
+ }
+
+ @Test
+ fun monthAwareComparatorFallsBackToAlphabeticalWhenNoMonth() {
+ val tool = KorapXmlTool()
+ val ids = listOf("WUD24_I0083.95367", "WUD24_Z0087.65594", "WUD24_K0086.98010")
+ val sorted = ids.sortedWith { a, b -> tool.compareTextIds(a, b) }
+ assertEquals(
+ listOf("WUD24_I0083.95367", "WUD24_K0086.98010", "WUD24_Z0087.65594"),
+ sorted,
+ "Non-month IDs should sort alphabetically"
+ )
+ }
+
+ @Test
+ fun monthAwareComparatorSortsMixedMonthsInCalendarOrder() {
+ val tool = KorapXmlTool()
+ val ids = listOf(
+ "ZGE24_OKT.00002",
+ "ZGE24_JAN.00003",
+ "ZGE24_DEZ.00001",
+ "ZGE24_SEP.00005",
+ "ZGE24_MAR.00001"
+ )
+ val expected = listOf(
+ "ZGE24_JAN.00003",
+ "ZGE24_MAR.00001",
+ "ZGE24_SEP.00005",
+ "ZGE24_OKT.00002",
+ "ZGE24_DEZ.00001"
+ )
+ val sorted = ids.sortedWith { a, b -> tool.compareTextIds(a, b) }
+ assertEquals(expected, sorted, "Mixed month IDs should follow calendar order")
+ }
+
+ private fun KorapXmlTool.compareTextIds(a: String, b: String): Int {
+ val m = KorapXmlTool::class.java.getDeclaredMethod("compareTextIds", String::class.java, String::class.java)
+ m.isAccessible = true
+ return m.invoke(this, a, b) as Int
+ }
+}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlFormatterTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlFormatterTest.kt
new file mode 100644
index 0000000..7f479b5
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlFormatterTest.kt
@@ -0,0 +1,148 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.After
+import org.junit.Before
+import java.io.ByteArrayOutputStream
+import java.io.File
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * Tests for KorAP XML format output (-f zip or -t zip)
+ */
+class KorapXmlFormatterTest {
+ private val outContent = ByteArrayOutputStream(10000000)
+ private val errContent = ByteArrayOutputStream()
+ private val originalOut: PrintStream = System.out
+ private val originalErr: PrintStream = System.err
+
+ @Before
+ fun setUpStreams() {
+ System.setOut(PrintStream(outContent))
+ System.setErr(PrintStream(errContent))
+ }
+
+ @After
+ fun restoreStreams() {
+ System.setOut(originalOut)
+ System.setErr(originalErr)
+ }
+
+ private fun loadResource(path: String): URL {
+ val resource = Thread.currentThread().contextClassLoader.getResource(path)
+ requireNotNull(resource) { "Resource $path not found" }
+ return resource
+ }
+
+ @Test
+ fun korapXmlOutputWorks() {
+ val sourceFile = loadResource("wdf19.zip").path
+ val tmpSourceFile = File.createTempFile("tmp", ".zip")
+ val tmpSourceFileName = tmpSourceFile.absolutePath
+ File(sourceFile).copyTo(File(tmpSourceFileName), true)
+ val outputDir = File(tmpSourceFileName).parentFile.absolutePath
+ val args = arrayOf("-D", outputDir, "-f", "-t", "zip", tmpSourceFileName)
+ debug(args)
+
+ val resultFile = tmpSourceFileName.replace(".zip", ".base.zip")
+ assertTrue(File(resultFile).exists())
+ }
+
+ @Test
+ fun overwriteWorks() {
+ val sourceFile = loadResource("wdf19.zip").path
+ val tmpSourceFile = File.createTempFile("tmp", ".zip")
+ val tmpSourceFileName = tmpSourceFile.absolutePath
+ File(sourceFile).copyTo(File(tmpSourceFileName), true)
+ val resultFile = tmpSourceFileName.replace(".zip", ".base.zip")
+ File(resultFile).createNewFile()
+ val outputDir = File(tmpSourceFileName).parentFile.absolutePath
+ val args = arrayOf("-D", outputDir, "-f", "-t", "zip", tmpSourceFileName)
+ debug(args)
+ assertTrue(File(resultFile).exists())
+ assertTrue(File(resultFile).length() > 0)
+ }
+
+ @Test
+ fun corenlpConstituencyParsing() {
+ val taggerModel = File("libs/german-fast.tagger")
+ val parserModel = File("libs/germanSR.ser.gz")
+
+ if (!taggerModel.exists() || !parserModel.exists()) {
+ System.err.println("Skipping CoreNLP test: model files not found")
+ return
+ }
+
+ val baseZip = loadResource("wud24_sample.zip").path
+ val outputDir = File.createTempFile("corenlp_test", "").apply {
+ delete()
+ mkdirs()
+ }
+
+ try {
+ val args = arrayOf(
+ "-t", "zip",
+ "-o",
+ "-D", outputDir.path,
+ "-t", "corenlp:${taggerModel.path}",
+ "-P", "corenlp:${parserModel.path}",
+ baseZip
+ )
+
+ val exitCode = debug(args)
+ assertEquals(0, exitCode, "CoreNLP processing should succeed")
+
+ val outputZip = File(outputDir, "wud24_sample.corenlp.zip")
+ assertTrue(outputZip.exists(), "Output ZIP should exist at ${outputZip.path}")
+
+ val constituencyFiles = mutableListOf<String>()
+ ProcessBuilder("unzip", "-l", outputZip.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start()
+ .inputStream
+ .bufferedReader()
+ .useLines { lines ->
+ lines.forEach { line ->
+ if (line.contains("constituency.xml")) {
+ constituencyFiles.add(line.trim())
+ }
+ }
+ }
+
+ assertTrue(constituencyFiles.isNotEmpty(), "Should have constituency.xml files in output")
+
+ val expectedDocs = listOf(
+ "WUD24/I0083/95367/corenlp/constituency.xml",
+ "WUD24/Z0087/65594/corenlp/constituency.xml",
+ "WUD24/K0086/98010/corenlp/constituency.xml"
+ )
+
+ expectedDocs.forEach { docPath ->
+ val found = constituencyFiles.any { it.contains(docPath) }
+ assertTrue(found, "Should have constituency.xml for $docPath")
+ }
+
+ val morphoFiles = mutableListOf<String>()
+ ProcessBuilder("unzip", "-l", outputZip.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start()
+ .inputStream
+ .bufferedReader()
+ .useLines { lines ->
+ lines.forEach { line ->
+ if (line.contains("/corenlp/morpho.xml")) {
+ morphoFiles.add(line.trim())
+ }
+ }
+ }
+
+ assertTrue(morphoFiles.size >= 3, "Should have morpho.xml files for at least 3 documents")
+
+ } finally {
+ outputDir.deleteRecursively()
+ }
+ }
+}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
deleted file mode 100644
index ea1860f..0000000
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ /dev/null
@@ -1,1058 +0,0 @@
-package de.ids_mannheim.korapxmltools
-
-import org.junit.After
-import org.junit.AfterClass
-import org.junit.Before
-import java.io.ByteArrayOutputStream
-import java.io.File
-import java.io.PrintStream
-import java.net.URL
-import kotlin.test.Test
-import kotlin.test.assertContains
-import kotlin.test.assertEquals
-import kotlin.test.assertFalse
-import kotlin.test.assertTrue
-
-class KorapXmlToolTest {
- companion object {
- private data class KrillTarResult(val outputDir: File, val tar: File)
-
- private val krillTarCache = mutableMapOf<String, KrillTarResult>()
-
- private fun ensureKrillTar(
- key: String,
- tarName: String = "wud24_sample.krill.tar",
- argsBuilder: (File) -> Array<String>
- ): File {
- return krillTarCache.getOrPut(key) {
- val outputDir = createTempDir("krill_cache_$key")
- val args = argsBuilder(outputDir)
- val exitCode = debug(args)
- assertTrue(exitCode == 0, "Krill conversion should succeed for cached fixture '$key'")
- val tar = File(outputDir, tarName)
- assertTrue(tar.exists(), "Expected $tarName for cached fixture '$key'")
- KrillTarResult(outputDir, tar)
- }.tar
- }
-
- private fun createTempDir(prefix: String): File {
- return File.createTempFile(prefix, "").apply {
- delete()
- mkdirs()
- }
- }
-
- @JvmStatic
- @AfterClass
- fun cleanupKrillTarCache() {
- krillTarCache.values.forEach { it.outputDir.deleteRecursively() }
- krillTarCache.clear()
- }
- }
-
- private val outContent = ByteArrayOutputStream(10000000)
- private val errContent = ByteArrayOutputStream()
- private val originalOut: PrintStream = System.out
- private val originalErr: PrintStream = System.err
-
- val goe = loadResource("goe.zip").path
- val goeSpacy = loadResource("goe.spacy.zip").path
- val goeMarmot = loadResource("goe.marmot.zip").path
- val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
- val zca20scrambled = loadResource("zca20-scrambled.zip").path
- val wdf19 = loadResource("wdf19.zip").path
- val wdd17 = loadResource("wdd17sample.zip").path
- val wud24Corenlp = loadResource("wud24_sample.corenlp.zip").path
-
- @Before
- fun setUpStreams() {
- System.setOut(PrintStream(outContent))
- System.setErr(PrintStream(errContent))
- }
-
- @After
- fun restoreStreams() {
- System.setOut(originalOut)
- System.setErr(originalErr)
- }
-
- private fun loadResource(path: String): URL {
- val resource = Thread.currentThread().contextClassLoader.getResource(path)
- requireNotNull(resource) { "Resource $path not found" }
- return resource
- }
-
- @Test
- fun canConvertGOE() {
- val args = arrayOf(loadResource("goe.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "# foundry = base"
- )
- assertContains(
- outContent.toString(),
- "# start_offsets = 55 55 59 63 70 75 82 87 94 102 105 111 120 124 130 134 140 144 151 153 163 175 187 191 207 209 213 218 222 239 248 255 259 264 267 271 277 283 297 307"
- )
- }
- @Test
- fun canConvertWithMorphoAnnotations() {
- val args = arrayOf(loadResource("goe.tree_tagger.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "# foundry = tree_tagger"
- )
- assertContains(
- outContent.toString(),
- "9\tentzücke\tentzücken\t_\tVVFIN\t_\t_\t_\t_\t1.000000"
- )
- }
- @Test
- fun canInferBaseName() {
- val args = arrayOf(goeTreeTagger)
- debug(args)
- assertContains(
- outContent.toString(),
- "# foundry = tree_tagger"
- )
- assertContains(
- outContent.toString(),
- "9\tentzücke\tentzücken\t_\tVVFIN\t_\t_\t_\t_\t1.000000"
- )
- }
-
- @Test
- fun canConvertWfdWithMorphoAnnotations() {
- val args = arrayOf(loadResource("wdf19.tree_tagger.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "# foundry = tree_tagger"
- )
- assertContains(
- outContent.toString(),
- "\tvraie\tvrai\t_\tADJ\t_\t_\t_\t_\t"
- )
- }
-
- @Test
- fun canPrintHelp() {
- debug(arrayOf("-h"))
- assertContains(
- outContent.toString(),
- "--s-bounds-from-morpho"
- )
- }
-
- @Test
- fun respectsSiglePattern() {
- val args = arrayOf("-p",".*7", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "# text_id = WDF19_A0000.14247"
- )
- assertFalse { outContent.toString().contains("WDF19_A0000.13865") }
- }
-
- @Test
- fun respectsColumnsParam() {
- val args = arrayOf("-c","5", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "42\tparfaitement\t_\t_\t_\n"
- )
- }
-
- @Test
- fun respectsSpecial1ColumnsParam() {
- val args = arrayOf("-c","1", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "\nparfaitement\n"
- )
- }
-
- @Test
- fun deprecatedW2vOptionWorks() {
- // Test that the old -w option no longer works (should fail for v3.0)
- val args = arrayOf("-w", loadResource("wdf19.zip").path)
- val exitCode = debug(args)
- // Should fail since -w was removed
- assertTrue(exitCode != 0, "Old -w option should no longer work in v3.0")
- }
-
- @Test
- fun w2vOptionWorks() {
- val args = arrayOf("-t", "w2v", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "\nje ne suis pas du tout d'accord !\n"
- )
- assertFalse { outContent.toString().contains("WDF19_A0000.13865") }
- }
-
- @Test
- fun nowOptionWorks() {
- val args = arrayOf("-t", "now", loadResource("wdf19.zip").path)
- debug(args)
- val output = outContent.toString()
- // Check that output starts with @@<text-sigle>
- assertContains(output, "@@WDF19_A0000.")
- // Check that sentence boundaries are replaced with <p> tags
- assertContains(output, " <p> ")
- // Check that it contains the expected text content
- assertContains(output, "Arts visuels Pourquoi toujours vouloir")
- // Check that it doesn't contain CoNLL-U format markers
- assertFalse(output.contains("# foundry"))
- // Check that each text is on one line (no newlines within text except at end)
- val lines = output.trim().split('\n')
- assertTrue(lines.all { it.startsWith("@@") })
- }
-
- @Test
- fun canConvertXMLwithInvalidComments() {
- val args = arrayOf("-t", "w2v", zca20scrambled)
- debug(args)
- assertContains(
- outContent.toString(),
- "\nDys est yuch dyr Grund dyfür , dyss ys schon myl myhryry Wochyn dyuyrn kynn .\n"
- )
- }
-
- @Test
- fun canSetLogLevel() {
- val args = arrayOf("-l", "info", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- errContent.toString(),
- "Processing zip file"
- )
- }
-
- @Test
- fun canAnnotate() {
- val args = arrayOf("-A", "sed -e 's/u/x/g'", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "axtomatiqxe"
- )
- assertTrue("Annotated CoNLL-U should have at least as many lines as the original, but only has ${outContent.toString().count { it == '\n'}} lines"
- ) { outContent.toString().count { it == '\n' } >= 61511 }
- }
-
- @Test
- fun canExtractMetadata() {
- val args = arrayOf("-t", "w2v", "-m" ,"<textSigle>([^<]+)", "-m", "<creatDate>([^<]+)", loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "WDF19/A0000.12006\t2011.08.11\tmerci pour l'info je suis curieux !"
- )
- }
-
- @Test
- fun canHandleNonBmpText() {
- val args = arrayOf("-t", "w2v", wdd17)
- debug(args)
- assertContains(
- outContent.toString(),
- "\n-- mach \uD83D\uDE48 \uD83D\uDE49 \uD83D\uDE4A 20 : 45 , 1. Feb .\n" // 🙈 🙉 🙊
- )
- assertContains(
- outContent.toString(),
- "\nBereinige wenigstens die allergröbsten Sachen .\n"
- )
- }
-
- @Test
- fun canExtractExtraFeaturesByRegex() {
- val args = arrayOf("-e" ,"(posting/id|div/id)",loadResource("wdf19.zip").path)
- debug(args)
- assertContains(
- outContent.toString(),
- "12\t)\t_\t_\t_\t_\t_\t_\t_\t_\n" +
- "# div/id = i.14293_8\n" +
- "13\tDifférentiation\t_\t_\t_\t_\t_\t_\t_\t_\n" +
- "# posting/id = i.14293_8_1\n" +
- "14\tAinsi\t_\t_\t_\t_\t_\t_\t_\t_\n"
- )
- }
-
- @Test
- fun canConvertMultipleZips() {
- val args = arrayOf(wdf19, goe)
- debug(args)
- assertContains(
- outContent.toString(),
- "6\tautomatique\t_\t_\t_\t_\t_\t_\t_\t_\n"
- )
- assertContains(
- outContent.toString(),
- "36\tGedanken\t_\t_\t_\t_\t_\t_\t_\t_\n"
- )
- }
-
- @Test
- fun canConvertMorphoFeatureAnnotations() {
- val args = arrayOf(goeMarmot)
- debug(args)
- assertContains(
- outContent.toString(),
- "9\tentzücke\t_\t_\tVVFIN\tnumber=sg|person=3|tense=pres|mood=subj\t_\t_\t_\t_\n"
- )
- }
-
- @Test
- fun korapXmlOutputWorks() {
- val sourceFile = loadResource("wdf19.zip").path
- val tmpSourceFile = File.createTempFile("tmp", ".zip")
- val tmpSourceFileName = tmpSourceFile.absolutePath
- File(sourceFile).copyTo(File(tmpSourceFileName), true)
- val outputDir = File(tmpSourceFileName).parentFile.absolutePath
- val args = arrayOf("-D", outputDir, "-f", "-t", "zip", tmpSourceFileName)
- debug(args)
-
- val resultFile = tmpSourceFileName.toString().replace(".zip", ".base.zip")
- assert(File(resultFile).exists())
- }
-
- @Test
- fun overwriteWorks() {
- val sourceFile = loadResource("wdf19.zip").path
- val tmpSourceFile = File.createTempFile("tmp", ".zip")
- val tmpSourceFileName = tmpSourceFile.absolutePath
- File(sourceFile).copyTo(File(tmpSourceFileName), true)
- val resultFile = tmpSourceFileName.toString().replace(".zip", ".base.zip")
- File(resultFile).createNewFile()
- val outputDir = File(tmpSourceFileName).parentFile.absolutePath
- val args = arrayOf("-D", outputDir, "-f", "-t", "zip", tmpSourceFileName)
- debug(args)
- assert(File(resultFile).exists())
- assert(File(resultFile).length() > 0)
- }
-
- @Test
- fun canWord2VecLemma() {
- val args = arrayOf("--lemma", "-t", "w2v", loadResource("goe.tree_tagger.zip").path)
- debug(args)
- val out = outContent.toString()
- // Expect lemma sequence containing "mein Ankunft" (surface would include inflected form elsewhere)
- assertContains(out, " mein Ankunft ")
- }
-
- @Test
- fun canNowLemma() {
- val args = arrayOf("--lemma", "-t", "now", loadResource("goe.tree_tagger.zip").path)
- debug(args)
- val out = outContent.toString()
- assertContains(out, "@@")
- assertContains(out, " <p> ")
- assertContains(out, " mein Ankunft ")
- }
-
- @Test
- fun lemmaOnlyWord2VecWorks() {
- val args = arrayOf("--lemma-only", "-t", "w2v", loadResource("goe.tree_tagger.zip").path)
- debug(args)
- val out = outContent.toString()
- // Should produce some lemma tokens without requiring data.xml
- assertTrue(out.contains(" mein ") || out.contains(" Ankunft "))
- }
-
- @Test
- fun lemmaOnlyNowWorks() {
- val args = arrayOf("--lemma-only", "-t", "now", loadResource("goe.tree_tagger.zip").path)
- debug(args)
- val out = outContent.toString()
- assertContains(out, "@@")
- assertContains(out, " <p> ")
- }
-
- @Test
- fun excludeZipGlobSkipsFiles() {
- val args = arrayOf("--exclude-zip-glob", "goe.zip", loadResource("wdf19.zip").path, loadResource("goe.zip").path)
- debug(args)
- val out = outContent.toString()
- // Expect French content, but not the German token from GOE
- assertContains(out, "automatique")
- assertFalse(out.contains("Gedanken"))
- }
-
- @Test
- fun sequentialOnlyForNowAndW2V() {
- val args = arrayOf("--sequential", loadResource("wdf19.zip").path)
- // Default format is conllu; this should error
- val rc = debug(args)
- // Non-zero is expected; and error message should be present
- assertTrue(rc != 0)
- assertContains(errContent.toString(), "--sequential is supported only with -t word2vec or -t now")
- }
-
- @Test
- fun dependencyColumnsArePopulatedFromSpacyZip() {
- val args = arrayOf(goeSpacy)
- debug(args)
- val out = outContent.toString()
-
- // Check that output is CoNLL-U format
- assertContains(out, "# foundry = spacy")
- assertContains(out, "# text_id = GOE_AGA.00000")
-
- // Get data lines (non-comment, non-empty)
- val dataLines = out.lines()
- .filter { !it.startsWith("#") && it.isNotBlank() }
-
- assertTrue(dataLines.isNotEmpty(), "Should have data lines in output")
-
- // Parse tokens and check dependency columns (column 7 = HEAD, column 8 = DEPREL)
- var tokensWithHead = 0
- var tokensWithDeprel = 0
- var totalTokens = 0
-
- for (line in dataLines) {
- val columns = line.split(Regex("\\s+"))
- if (columns.size >= 8) {
- totalTokens++
- // Column 7 (index 6) is HEAD, column 8 (index 7) is DEPREL
- val head = columns[6]
- val deprel = columns[7]
-
- if (head != "_") tokensWithHead++
- if (deprel != "_") tokensWithDeprel++
- }
- }
-
- // Assert that we have tokens
- assertTrue(totalTokens > 0, "Should have parsed at least some tokens")
-
- // Print diagnostic information
- System.err.println("=== Dependency Test Diagnostics ===")
- System.err.println("Total tokens: $totalTokens")
- System.err.println("Tokens with HEAD (!= '_'): $tokensWithHead")
- System.err.println("Tokens with DEPREL (!= '_'): $tokensWithDeprel")
- System.err.println("First 5 data lines:")
- dataLines.take(5).forEach { System.err.println(" $it") }
-
- // Assert that HEAD column (col 7) is populated for a significant portion of tokens
- // When processing spacy zip alone, we get ~50% coverage (base tokens don't have deps)
- val headCoverage = (tokensWithHead.toDouble() / totalTokens) * 100
- assertTrue(
- headCoverage > 40.0,
- "HEAD column should be populated for significant portion of tokens. Found: $tokensWithHead/$totalTokens (${headCoverage}%)"
- )
-
- // Assert that DEPREL column (col 8) is populated for a significant portion of tokens
- val deprelCoverage = (tokensWithDeprel.toDouble() / totalTokens) * 100
- assertTrue(
- deprelCoverage > 40.0,
- "DEPREL column should be populated for significant portion of tokens. Found: $tokensWithDeprel/$totalTokens (${deprelCoverage}%)"
- )
-
- // Check for specific dependency relations and head indices in output
- // Look for numeric head indices (not "_")
- assertTrue(
- out.contains(Regex("\\n\\d+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\d+\\t\\S+\\t")),
- "Should find tokens with numeric HEAD values in column 7"
- )
- }
-
- @Test
- fun krillOutputMatchesExpectedStructure() {
- // Test krill format output generation succeeds
- val baseZip = loadResource("wud24_sample.zip").path
- val spacyZip = loadResource("wud24_sample.spacy.zip").path
- val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
- val opennlpZip = loadResource("wud24_sample.opennlp.zip").path
- val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
-
- val generatedTar = ensureKrillTar("wud24_full_foundries") { outputDir ->
- arrayOf(
- "-t", "krill",
- "-l", "info",
- "-D", outputDir.path,
- baseZip,
- spacyZip,
- marmotMaltZip,
- opennlpZip,
- treeTaggerZip
- )
- }
- assertTrue(generatedTar.exists(), "Generated krill tar should exist at ${generatedTar.path}")
- assertTrue(generatedTar.length() > 0, "Generated tar should not be empty")
-
- // Check that log file exists
- val logFile = File(generatedTar.path.replace(Regex("\\.tar$"), ".log"))
- assertTrue(logFile.exists(), "Log file should exist at ${logFile.path}")
- assertTrue(logFile.length() > 0, "Log file should not be empty")
-
- // Check that texts are output in month-aware order in the TAR file
- // Note: We check TAR order instead of log order because parallel processing means
- // log completion order can differ from submission order, but TAR output follows sorted order
- val monthOrder = mapOf(
- "JAN" to 1, "FEB" to 2, "MAR" to 3, "MRZ" to 3, "APR" to 4,
- "MAY" to 5, "MAI" to 5, "JUN" to 6, "JUL" to 7, "AUG" to 8,
- "SEP" to 9, "OCT" to 10, "OKT" to 10, "NOV" to 11, "DEC" to 12, "DEZ" to 12
- )
- data class MonthKey(
- val prefix: String,
- val monthRank: Int,
- val mid: String,
- val num: Long,
- val fallback: String
- ) : Comparable<MonthKey> {
- override fun compareTo(other: MonthKey): Int {
- // First compare by prefix
- val prefixCmp = prefix.compareTo(other.prefix)
- if (prefixCmp != 0) return prefixCmp
-
- // Then compare by month rank
- val rankCmp = monthRank.compareTo(other.monthRank)
- if (rankCmp != 0) return rankCmp
-
- // If both have no month rank (both MAX_VALUE), compare mid alphabetically
- if (monthRank == Int.MAX_VALUE && other.monthRank == Int.MAX_VALUE) {
- val midCmp = mid.compareTo(other.mid)
- if (midCmp != 0) return midCmp
- }
-
- // Then compare by number
- val numCmp = num.compareTo(other.num)
- if (numCmp != 0) return numCmp
-
- // Finally fallback to full ID
- return fallback.compareTo(other.fallback)
- }
- }
-
- fun monthAwareKey(textId: String): MonthKey {
- val tokens = textId.split('_', '.', '-')
- val prefix = tokens.getOrNull(0) ?: textId
- val mid = tokens.getOrNull(1) ?: ""
- val num = tokens.getOrNull(2)?.toLongOrNull() ?: Long.MAX_VALUE
- val monthRank = if (mid.length == 3) monthOrder[mid] else null
- return MonthKey(prefix, monthRank ?: Int.MAX_VALUE, mid, num, textId)
- }
-
- // Extract text IDs from TAR file (these are written in sorted order)
- val tarListProcess = ProcessBuilder("tar", "-tf", generatedTar.path)
- .redirectErrorStream(true)
- .start()
- val tarFiles = tarListProcess.inputStream.bufferedReader().readLines()
- assertTrue(tarListProcess.waitFor() == 0, "tar -tf should succeed")
-
- // Extract text IDs from JSON filenames in TAR
- val textIdsInTar = tarFiles
- .filter { it.endsWith(".json.gz") }
- .map { it.substringAfterLast('/').removeSuffix(".json.gz").replace('-', '_').replace('.', '_') }
-
- if (textIdsInTar.isNotEmpty()) {
- // Check if text IDs in TAR follow month-aware ordering
- val sortedTextIds = textIdsInTar.sortedWith(compareBy { monthAwareKey(it) })
- assertEquals(
- sortedTextIds,
- textIdsInTar,
- "Text IDs in TAR should be in month-aware order. Expected: $sortedTextIds, but got: $textIdsInTar"
- )
- }
-
- // Extract tar to verify it contains JSON files
- val extractDir = File.createTempFile("extract", "").let {
- it.delete()
- it.mkdirs()
- it
- }
-
- try {
- // Extract tar
- val tarProcess = ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path)
- .redirectErrorStream(true)
- .start()
- assertTrue(tarProcess.waitFor() == 0, "Tar extraction should succeed")
-
- // Get list of JSON files
- val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
- assertTrue(jsonFiles.isNotEmpty(), "Tar should contain JSON.gz files")
-
- // Verify each JSON file is valid
- jsonFiles.forEach { jsonFile ->
- val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .readText()
-
- // Check required fields in JSON
- assertTrue(jsonContent.contains("\"@context\""), "JSON should have @context")
- assertTrue(jsonContent.contains("\"@type\":\"koral:corpus\""), "JSON should have correct @type")
- assertTrue(jsonContent.contains("\"data\""), "JSON should have data section")
- assertTrue(jsonContent.contains("\"foundries\""), "JSON should have foundries")
- assertTrue(jsonContent.contains("\"layerInfos\""), "JSON should have layerInfos")
- assertTrue(jsonContent.contains("\"name\":\"tokens\""), "JSON should have name field")
- assertTrue(jsonContent.contains("\"stream\""), "JSON should have stream")
- assertTrue(jsonContent.contains("\"text\""), "JSON should have text")
-
- // Check for multiple foundries
- assertTrue(jsonContent.contains("spacy"), "JSON should contain spacy foundry")
- assertTrue(jsonContent.contains("marmot") || jsonContent.contains("malt"), "JSON should contain marmot or malt foundry")
- assertTrue(jsonContent.contains("treetagger"), "JSON should contain treetagger foundry")
- }
- } finally {
- extractDir.deleteRecursively()
- }
- }
-
- @Test
- fun krillOutputContainsInverseDependencies() {
- // Test that inverse dependency annotations are included
- val baseZip = loadResource("wud24_sample.zip").path
- val spacyZip = loadResource("wud24_sample.spacy.zip").path
-
- val generatedTar = ensureKrillTar("wud24_base_spacy") { outputDir ->
- arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip)
- }
- assertTrue(generatedTar.exists())
-
- // Extract and check for inverse dependencies
- val extractDir = File.createTempFile("extract_inv", "").let {
- it.delete()
- it.mkdirs()
- it
- }
-
- try {
- ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path).start().waitFor()
- val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
- assertTrue(jsonFiles.isNotEmpty())
-
- jsonFiles.forEach { jsonFile ->
- val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .readText()
-
- // Check for inverse dependency annotations (format: <:foundry/d:label$...)
- assertTrue(
- jsonContent.contains("<:") && jsonContent.contains("/d:"),
- "JSON should contain inverse dependency annotations"
- )
- }
- } finally {
- extractDir.deleteRecursively()
- }
- }
-
- @Test
- fun krillOutputContainsBaseStructureSpans() {
- // Test that base structure spans are included
- val baseZip = loadResource("wud24_sample.zip").path
- val spacyZip = loadResource("wud24_sample.spacy.zip").path
-
- val generatedTar = ensureKrillTar("wud24_base_spacy") { outputDir ->
- arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip)
- }
- assertTrue(generatedTar.exists())
-
- val extractDir = File.createTempFile("extract_base", "").let {
- it.delete()
- it.mkdirs()
- it
- }
-
- try {
- ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path).start().waitFor()
- val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
- assertTrue(jsonFiles.isNotEmpty())
-
- jsonFiles.forEach { jsonFile ->
- val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .readText()
-
- // Check for base structure spans
- assertTrue(
- jsonContent.contains("base/s:t"),
- "JSON should contain base text span (base/s:t)"
- )
- assertTrue(
- jsonContent.contains("base/s:s"),
- "JSON should contain base sentence spans (base/s:s)"
- )
- }
- } finally {
- extractDir.deleteRecursively()
- }
- }
-
- @Test
- fun krillOutputIncludesAllFoundries() {
- // Test that all foundries are properly included
- val baseZip = loadResource("wud24_sample.zip").path
- val spacyZip = loadResource("wud24_sample.spacy.zip").path
- val marmotZip = loadResource("wud24_sample.marmot-malt.zip").path
- val opennlpZip = loadResource("wud24_sample.opennlp.zip").path
- val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
-
- val generatedTar = ensureKrillTar("wud24_full_foundries") { outputDir ->
- arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip, marmotZip, opennlpZip, treeTaggerZip)
- }
- assertTrue(generatedTar.exists())
-
- val extractDir = File.createTempFile("extract_foundries", "").let {
- it.delete()
- it.mkdirs()
- it
- }
-
- try {
- ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path).start().waitFor()
- val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
- assertTrue(jsonFiles.isNotEmpty())
-
- jsonFiles.forEach { jsonFile ->
- val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .readText()
-
- // Check foundries field includes all expected foundries
- val foundries = jsonContent.substringAfter("\"foundries\":").substringBefore(",").trim()
- assertTrue(foundries.contains("spacy"), "Foundries should include spacy")
- assertTrue(foundries.contains("marmot") || foundries.contains("malt"), "Foundries should include marmot or malt")
- assertTrue(foundries.contains("opennlp"), "Foundries should include opennlp")
- assertTrue(foundries.contains("treetagger"), "Foundries should include treetagger (not tt)")
- assertTrue(foundries.contains("dereko"), "Foundries should include dereko")
- }
- } finally {
- extractDir.deleteRecursively()
- }
- }
-
- @Test
- fun krillRespectsNonWordTokenOption() {
- val baseZip = loadResource("wud24_sample.zip").path
- val spacyZip = loadResource("wud24_sample.spacy.zip").path
-
- val defaultTar = ensureKrillTar("wud24_default_corenlp") { outputDir ->
- arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
- }
- assertTrue(defaultTar.exists(), "Default krill tar should exist")
-
- val defaultJsons = readKrillJson(defaultTar).values
- assertTrue(defaultJsons.isNotEmpty(), "Default Krill tar should contain JSON files")
- assertTrue(
- defaultJsons.all { !it.contains("\"s:,\"") },
- "Default Krill output should skip comma tokens"
- )
- assertTrue(
- defaultJsons.all { !it.contains("\"s:!\"") },
- "Default Krill output should skip exclamation mark tokens"
- )
-
- val flagTar = ensureKrillTar("wud24_default_corenlp_nwt") { outputDir ->
- arrayOf("-t", "krill", "--non-word-tokens", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
- }
- assertTrue(flagTar.exists(), "Krill tar should exist when --non-word-tokens is set")
-
- val flagJsons = readKrillJson(flagTar).values
- assertTrue(flagJsons.isNotEmpty(), "Krill tar should contain JSON files when --non-word-tokens is set")
- assertTrue(
- flagJsons.any { it.contains("\"s:,\"") },
- "Krill output should include commas when --non-word-tokens is set"
- )
- assertTrue(
- flagJsons.any { it.contains("\"s:!\"") },
- "Krill output should include exclamation marks when --non-word-tokens is set"
- )
- }
-
- @Test
- fun krillDefaultMatchesPerlReference() {
- val baseZip = loadResource("wud24_sample.zip").path
- val spacyZip = loadResource("wud24_sample.spacy.zip").path
- val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
- val opennlpZip = loadResource("wud24_sample.opennlp.zip").path
- val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
- val corenlpZip = wud24Corenlp
- val referenceTar = File(loadResource("wud24_sample.wonwtopt.krill.tar").toURI())
- assertTrue(referenceTar.exists(), "Reference Krill tar is missing: ${referenceTar.path}")
-
- val kotlinTar = ensureKrillTar("wud24_reference_default") { outputDir ->
- arrayOf(
- "-t", "krill",
- "-D", outputDir.path,
- baseZip,
- spacyZip,
- marmotMaltZip,
- treeTaggerZip,
- corenlpZip
- )
- }
- assertTrue(kotlinTar.exists(), "Kotlin-produced Krill tar should exist at ${kotlinTar.path}")
-
- val kotlinJsons = readKrillJson(kotlinTar)
- val referenceJsons = readKrillJson(referenceTar)
-
- assertEquals(referenceJsons.keys, kotlinJsons.keys, "Kotlin and reference JSON sets differ")
-
- val tokensToCheck = listOf("\"s:,\"", "\"s:.\"")
- referenceJsons.forEach { (doc, referenceJson) ->
- val kotlinJson = kotlinJsons.getValue(doc)
- tokensToCheck.forEach { token ->
- val refHas = referenceJson.contains(token)
- val kotlinHas = kotlinJson.contains(token)
- assertEquals(
- refHas,
- kotlinHas,
- "Mismatch for $token in document $doc compared to reference"
- )
- }
- }
- }
-
- @Test
- fun krillNonWordTokensMatchesPerlReference() {
- val baseZip = loadResource("wud24_sample.zip").path
- val spacyZip = loadResource("wud24_sample.spacy.zip").path
- val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
- val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
- val corenlpZipNwt = wud24Corenlp
- val referenceTar = File(loadResource("wud24_sample.nwt.krill.tar").toURI())
- assertTrue(referenceTar.exists(), "Non-word-token reference tar missing: ${referenceTar.path}")
-
- val kotlinTar = ensureKrillTar("wud24_reference_nwt") { outputDir ->
- arrayOf(
- "-t", "krill",
- "--non-word-tokens",
- "-D", outputDir.path,
- baseZip,
- spacyZip,
- marmotMaltZip,
- treeTaggerZip,
- corenlpZipNwt
- )
- }
- assertTrue(kotlinTar.exists(), "Kotlin-produced Krill tar (nwt) should exist at ${kotlinTar.path}")
-
- val kotlinJsons = readKrillJson(kotlinTar)
- val referenceJsons = readKrillJson(referenceTar)
-
- assertEquals(referenceJsons.keys, kotlinJsons.keys, "Kotlin and reference JSON sets differ (nwt)")
-
- val tokensToCheck = listOf(
- "\"s:,\"",
- "\"s:.\"",
- "\"s:!\"",
- "\"marmot/p:\\$,\"",
- "\"spacy/p:\\$,\"",
- "\"tt/p:\\$,\"",
- "\"-:corenlp/sentences\$<i>11\"",
- "corenlp/s=spans",
- "corenlp/c=spans"
- )
- referenceJsons.forEach { (doc, referenceJson) ->
- val kotlinJson = kotlinJsons.getValue(doc)
- tokensToCheck.forEach { token ->
- val refHas = referenceJson.contains(token)
- val kotlinHas = kotlinJson.contains(token)
- assertEquals(
- refHas,
- kotlinHas,
- "Mismatch for $token in document $doc compared to nwt reference"
- )
- }
- }
- }
-
- @Test
- fun corenlpConstituencyParsing() {
- // Check if CoreNLP models are available
- val taggerModel = File("libs/german-fast.tagger")
- val parserModel = File("libs/germanSR.ser.gz")
-
- if (!taggerModel.exists() || !parserModel.exists()) {
- System.err.println("Skipping CoreNLP test: model files not found")
- System.err.println(" Tagger: ${taggerModel.absolutePath} - exists: ${taggerModel.exists()}")
- System.err.println(" Parser: ${parserModel.absolutePath} - exists: ${parserModel.exists()}")
- return
- }
-
- val baseZip = loadResource("wud24_sample.zip").path
- val outputDir = createTempDir("corenlp_test")
-
- try {
- // Run CoreNLP with both tagger and parser
- val args = arrayOf(
- "-t", "zip",
- "-o",
- "-D", outputDir.path,
- "-t", "corenlp:${taggerModel.path}",
- "-P", "corenlp:${parserModel.path}",
- baseZip
- )
-
- val exitCode = debug(args)
- assertEquals(0, exitCode, "CoreNLP processing should succeed")
-
- // Check output ZIP was created
- val outputZip = File(outputDir, "wud24_sample.corenlp.zip")
- assertTrue(outputZip.exists(), "Output ZIP should exist at ${outputZip.path}")
-
- // Check that constituency.xml files were created
- val constituencyFiles = mutableListOf<String>()
- ProcessBuilder("unzip", "-l", outputZip.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .useLines { lines ->
- lines.forEach { line ->
- if (line.contains("constituency.xml")) {
- constituencyFiles.add(line.trim())
- }
- }
- }
-
- assertTrue(constituencyFiles.isNotEmpty(), "Should have constituency.xml files in output")
-
- // Verify we have the expected documents
- val expectedDocs = listOf(
- "WUD24/I0083/95367/corenlp/constituency.xml",
- "WUD24/Z0087/65594/corenlp/constituency.xml",
- "WUD24/K0086/98010/corenlp/constituency.xml"
- )
-
- expectedDocs.forEach { docPath ->
- val found = constituencyFiles.any { it.contains(docPath) }
- assertTrue(found, "Should have constituency.xml for $docPath")
- }
-
- // Check morpho.xml files also exist (tagger output)
- val morphoFiles = mutableListOf<String>()
- ProcessBuilder("unzip", "-l", outputZip.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .useLines { lines ->
- lines.forEach { line ->
- if (line.contains("/corenlp/morpho.xml")) {
- morphoFiles.add(line.trim())
- }
- }
- }
-
- assertTrue(morphoFiles.size >= 3, "Should have morpho.xml files for at least 3 documents")
-
- } finally {
- outputDir.deleteRecursively()
- }
- }
-
- @Test
- fun conlluIncludesConstituencyCommentsWhenAvailable() {
- outContent.reset()
- errContent.reset()
-
- val args = arrayOf(wud24Corenlp)
- val exitCode = debug(args)
- assertEquals(0, exitCode, "CoNLL-U conversion should succeed when constituency annotations are present")
-
- val output = outContent.toString("UTF-8")
- val constituencyLines = output.lineSequence().filter { it.startsWith("# constituency =") }.toList()
-
- assertTrue(constituencyLines.isNotEmpty(), "CoNLL-U output should include constituency comment lines")
- assertTrue(
- constituencyLines.first().contains("("),
- "Constituency comment should contain bracketed structure"
- )
- }
-
- private fun KorapXmlTool.compareTextIds(a: String, b: String): Int {
- val m = KorapXmlTool::class.java.getDeclaredMethod("compareTextIds", String::class.java, String::class.java)
- m.isAccessible = true
- return m.invoke(this, a, b) as Int
- }
-
- @Test
- fun monthAwareComparatorOrdersCalendarMonths() {
- val tool = KorapXmlTool()
- assertTrue(tool.compareTextIds("ZGE24_JAN.00001", "ZGE24_MAR.00001") < 0, "JAN should sort before MAR")
- assertTrue(tool.compareTextIds("ZGE24_MRZ.00001", "ZGE24_APR.00001") < 0, "MRZ should sort before APR")
- assertTrue(tool.compareTextIds("ZGE24_OKT.00001", "ZGE24_SEP.00001") > 0, "OKT should sort after SEP")
- assertTrue(tool.compareTextIds("ZGE24_DEZ.00001", "ZGE24_NOV.00001") > 0, "DEZ should sort after NOV")
- assertTrue(tool.compareTextIds("ZGE24_MAI.00001", "ZGE24_JUL.00001") < 0, "MAI should sort before JUL")
- }
-
- @Test
- fun monthAwareComparatorFallsBackToAlphabeticalWhenNoMonth() {
- val tool = KorapXmlTool()
- val ids = listOf("WUD24_I0083.95367", "WUD24_Z0087.65594", "WUD24_K0086.98010")
- val sorted = ids.sortedWith { a, b -> tool.compareTextIds(a, b) }
- assertEquals(listOf("WUD24_I0083.95367", "WUD24_K0086.98010", "WUD24_Z0087.65594"), sorted, "Non-month IDs should sort alphabetically")
- }
-
- @Test
- fun monthAwareComparatorSortsMixedMonthsInCalendarOrder() {
- val tool = KorapXmlTool()
- val ids = listOf(
- "ZGE24_OKT.00002",
- "ZGE24_JAN.00003",
- "ZGE24_DEZ.00001",
- "ZGE24_SEP.00005",
- "ZGE24_MAR.00001"
- )
- val expected = listOf(
- "ZGE24_JAN.00003",
- "ZGE24_MAR.00001",
- "ZGE24_SEP.00005",
- "ZGE24_OKT.00002",
- "ZGE24_DEZ.00001"
- )
- val sorted = ids.sortedWith { a, b -> tool.compareTextIds(a, b) }
- assertEquals(expected, sorted, "Mixed month IDs should follow calendar order")
- }
-
- private fun readKrillJson(tarFile: File): Map<String, String> {
- val extractDir = File.createTempFile("krill_extract", "").let {
- it.delete()
- it.mkdirs()
- it
- }
-
- return try {
- val tarProcess = ProcessBuilder("tar", "-xf", tarFile.path, "-C", extractDir.path)
- .redirectErrorStream(true)
- .start()
- assertTrue(tarProcess.waitFor() == 0, "Tar extraction should succeed for ${tarFile.path}")
-
- val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") }.orEmpty()
- assertTrue(jsonFiles.isNotEmpty(), "No JSON files found in ${tarFile.path}")
-
- jsonFiles.associate { jsonFile ->
- val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
- .redirectOutput(ProcessBuilder.Redirect.PIPE)
- .start()
- .inputStream
- .bufferedReader()
- .use { it.readText() }
- jsonFile.name.removeSuffix(".gz") to jsonContent
- }
- } finally {
- extractDir.deleteRecursively()
- }
- }
-}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
new file mode 100644
index 0000000..1e124fb
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -0,0 +1,397 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.After
+import org.junit.AfterClass
+import org.junit.Before
+import java.io.ByteArrayOutputStream
+import java.io.File
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * Tests for Krill JSON format output (-t krill)
+ */
+class KrillJsonGeneratorTest {
+ companion object {
+ private data class KrillTarResult(val outputDir: File, val tar: File)
+ private val krillTarCache = mutableMapOf<String, KrillTarResult>()
+
+ private fun ensureKrillTar(
+ key: String,
+ tarName: String = "wud24_sample.krill.tar",
+ argsBuilder: (File) -> Array<String>
+ ): File {
+ return krillTarCache.getOrPut(key) {
+ val outputDir = File.createTempFile(key, "").apply {
+ delete()
+ mkdirs()
+ }
+ val args = argsBuilder(outputDir)
+ val exitCode = debug(args)
+ assertTrue(exitCode == 0, "Krill conversion should succeed for cached fixture '$key'")
+ val tar = File(outputDir, tarName)
+ assertTrue(tar.exists(), "Expected $tarName for cached fixture '$key'")
+ KrillTarResult(outputDir, tar)
+ }.tar
+ }
+
+ @JvmStatic
+ @AfterClass
+ fun cleanupKrillTarCache() {
+ krillTarCache.values.forEach { it.outputDir.deleteRecursively() }
+ krillTarCache.clear()
+ }
+
+ private fun readKrillJson(tarFile: File): Map<String, String> {
+ val extractDir = File.createTempFile("krill_extract", "").let {
+ it.delete()
+ it.mkdirs()
+ it
+ }
+
+ return try {
+ val tarProcess = ProcessBuilder("tar", "-xf", tarFile.path, "-C", extractDir.path)
+ .redirectErrorStream(true)
+ .start()
+ assertTrue(tarProcess.waitFor() == 0, "Tar extraction should succeed for ${tarFile.path}")
+
+ val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") }.orEmpty()
+ assertTrue(jsonFiles.isNotEmpty(), "No JSON files found in ${tarFile.path}")
+
+ jsonFiles.associate { jsonFile ->
+ val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start()
+ .inputStream
+ .bufferedReader()
+ .use { it.readText() }
+ jsonFile.name.removeSuffix(".gz") to jsonContent
+ }
+ } finally {
+ extractDir.deleteRecursively()
+ }
+ }
+ }
+
+ private val outContent = ByteArrayOutputStream(10000000)
+ private val errContent = ByteArrayOutputStream()
+ private val originalOut: PrintStream = System.out
+ private val originalErr: PrintStream = System.err
+
+ @Before
+ fun setUpStreams() {
+ System.setOut(PrintStream(outContent))
+ System.setErr(PrintStream(errContent))
+ }
+
+ @After
+ fun restoreStreams() {
+ System.setOut(originalOut)
+ System.setErr(originalErr)
+ }
+
+ private fun loadResource(path: String): URL {
+ val resource = Thread.currentThread().contextClassLoader.getResource(path)
+ requireNotNull(resource) { "Resource $path not found" }
+ return resource
+ }
+
+ @Test
+ fun krillOutputMatchesExpectedStructure() {
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+ val opennlpZip = loadResource("wud24_sample.opennlp.zip").path
+ val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+
+ val generatedTar = ensureKrillTar("wud24_full_foundries") { outputDir ->
+ arrayOf(
+ "-t", "krill",
+ "-l", "info",
+ "-D", outputDir.path,
+ baseZip, spacyZip, marmotMaltZip, opennlpZip, treeTaggerZip
+ )
+ }
+ assertTrue(generatedTar.exists())
+ assertTrue(generatedTar.length() > 0)
+
+ val logFile = File(generatedTar.path.replace(Regex("\\.tar$"), ".log"))
+ assertTrue(logFile.exists())
+ assertTrue(logFile.length() > 0)
+
+ val monthOrder = mapOf(
+ "JAN" to 1, "FEB" to 2, "MAR" to 3, "MRZ" to 3, "APR" to 4,
+ "MAY" to 5, "MAI" to 5, "JUN" to 6, "JUL" to 7, "AUG" to 8,
+ "SEP" to 9, "OCT" to 10, "OKT" to 10, "NOV" to 11, "DEC" to 12, "DEZ" to 12
+ )
+ data class MonthKey(
+ val prefix: String,
+ val monthRank: Int,
+ val mid: String,
+ val num: Long,
+ val fallback: String
+ ) : Comparable<MonthKey> {
+ override fun compareTo(other: MonthKey): Int {
+ val prefixCmp = prefix.compareTo(other.prefix)
+ if (prefixCmp != 0) return prefixCmp
+ val rankCmp = monthRank.compareTo(other.monthRank)
+ if (rankCmp != 0) return rankCmp
+ if (monthRank == Int.MAX_VALUE && other.monthRank == Int.MAX_VALUE) {
+ val midCmp = mid.compareTo(other.mid)
+ if (midCmp != 0) return midCmp
+ }
+ val numCmp = num.compareTo(other.num)
+ if (numCmp != 0) return numCmp
+ return fallback.compareTo(other.fallback)
+ }
+ }
+
+ fun monthAwareKey(textId: String): MonthKey {
+ val tokens = textId.split('_', '.', '-')
+ val prefix = tokens.getOrNull(0) ?: textId
+ val mid = tokens.getOrNull(1) ?: ""
+ val num = tokens.getOrNull(2)?.toLongOrNull() ?: Long.MAX_VALUE
+ val monthRank = if (mid.length == 3) monthOrder[mid] else null
+ return MonthKey(prefix, monthRank ?: Int.MAX_VALUE, mid, num, textId)
+ }
+
+ val tarListProcess = ProcessBuilder("tar", "-tf", generatedTar.path).redirectErrorStream(true).start()
+ val tarFiles = tarListProcess.inputStream.bufferedReader().readLines()
+ assertTrue(tarListProcess.waitFor() == 0)
+
+ val textIdsInTar = tarFiles
+ .filter { it.endsWith(".json.gz") }
+ .map { it.substringAfterLast('/').removeSuffix(".json.gz").replace('-', '_').replace('.', '_') }
+
+ if (textIdsInTar.isNotEmpty()) {
+ val sortedTextIds = textIdsInTar.sortedWith(compareBy { monthAwareKey(it) })
+ assertEquals(sortedTextIds, textIdsInTar)
+ }
+
+ val extractDir = File.createTempFile("extract", "").let { it.delete(); it.mkdirs(); it }
+ try {
+ val tarProcess = ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path)
+ .redirectErrorStream(true).start()
+ assertTrue(tarProcess.waitFor() == 0)
+
+ val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
+ assertTrue(jsonFiles.isNotEmpty())
+
+ jsonFiles.forEach { jsonFile ->
+ val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start().inputStream.bufferedReader().readText()
+
+ assertTrue(jsonContent.contains("\"@context\""))
+ assertTrue(jsonContent.contains("\"@type\":\"koral:corpus\""))
+ assertTrue(jsonContent.contains("\"data\""))
+ assertTrue(jsonContent.contains("\"foundries\""))
+ assertTrue(jsonContent.contains("\"layerInfos\""))
+ assertTrue(jsonContent.contains("\"name\":\"tokens\""))
+ assertTrue(jsonContent.contains("\"stream\""))
+ assertTrue(jsonContent.contains("\"text\""))
+ assertTrue(jsonContent.contains("spacy"))
+ assertTrue(jsonContent.contains("marmot") || jsonContent.contains("malt"))
+ assertTrue(jsonContent.contains("treetagger"))
+ }
+ } finally {
+ extractDir.deleteRecursively()
+ }
+ }
+
+ @Test
+ fun krillOutputContainsInverseDependencies() {
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+
+ val generatedTar = ensureKrillTar("wud24_base_spacy") { outputDir ->
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip)
+ }
+ assertTrue(generatedTar.exists())
+
+ val extractDir = File.createTempFile("extract_inv", "").let { it.delete(); it.mkdirs(); it }
+ try {
+ assertTrue(ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path).start().waitFor() == 0)
+ val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
+ assertTrue(jsonFiles.isNotEmpty())
+
+ jsonFiles.forEach { jsonFile ->
+ val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start().inputStream.bufferedReader().readText()
+
+ assertTrue(
+ jsonContent.contains("<:") && jsonContent.contains("/d:"),
+ "JSON should contain inverse dependency annotations"
+ )
+ }
+ } finally {
+ extractDir.deleteRecursively()
+ }
+ }
+
+ @Test
+ fun krillOutputContainsBaseStructureSpans() {
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+
+ val generatedTar = ensureKrillTar("wud24_base_spacy") { outputDir ->
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip)
+ }
+ assertTrue(generatedTar.exists())
+
+ val extractDir = File.createTempFile("extract_base", "").let { it.delete(); it.mkdirs(); it }
+ try {
+ assertTrue(ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path).start().waitFor() == 0)
+ val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
+ assertTrue(jsonFiles.isNotEmpty())
+
+ jsonFiles.forEach { jsonFile ->
+ val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start().inputStream.bufferedReader().readText()
+
+ assertTrue(jsonContent.contains("base/s:t"), "JSON should contain base text span (base/s:t)")
+ assertTrue(jsonContent.contains("base/s:s"), "JSON should contain base sentence spans (base/s:s)")
+ }
+ } finally {
+ extractDir.deleteRecursively()
+ }
+ }
+
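+ // The foundries metadata field should mention every merged annotation foundry, including dereko.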
+ @Test
+ fun krillOutputIncludesAllFoundries() {
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val marmotZip = loadResource("wud24_sample.marmot-malt.zip").path
+ val opennlpZip = loadResource("wud24_sample.opennlp.zip").path
+ val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+
+ val generatedTar = ensureKrillTar("wud24_full_foundries") { outputDir ->
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip, marmotZip, opennlpZip, treeTaggerZip)
+ }
+ assertTrue(generatedTar.exists())
+
+ val extractDir = File.createTempFile("extract_foundries", "").let { it.delete(); it.mkdirs(); it }
+ try {
+ assertTrue(ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path).start().waitFor() == 0)
+ val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") } ?: emptyList()
+ assertTrue(jsonFiles.isNotEmpty())
+
+ jsonFiles.forEach { jsonFile ->
+ val jsonContent = ProcessBuilder("gunzip", "-c", jsonFile.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start().inputStream.bufferedReader().readText()
+
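+ // Crude extraction of the foundries field value; sufficient for the substring checks below.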
+ val foundries = jsonContent.substringAfter("\"foundries\":").substringBefore(",").trim()
+ assertTrue(foundries.contains("spacy"))
+ assertTrue(foundries.contains("marmot") || foundries.contains("malt"))
+ assertTrue(foundries.contains("opennlp"))
+ assertTrue(foundries.contains("treetagger"))
+ assertTrue(foundries.contains("dereko"))
+ }
+ } finally {
+ extractDir.deleteRecursively()
+ }
+ }
+
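+ // By default, punctuation-only surface tokens such as "s:," are omitted; --non-word-tokens keeps them.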
+ @Test
+ fun krillRespectsNonWordTokenOption() {
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val wud24Corenlp = loadResource("wud24_sample.corenlp.zip").path
+
+ val defaultTar = ensureKrillTar("wud24_default_corenlp") { outputDir ->
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
+ }
+ assertTrue(defaultTar.exists())
+
+ val defaultJsons = readKrillJson(defaultTar).values
+ assertTrue(defaultJsons.isNotEmpty())
+ assertTrue(defaultJsons.none { it.contains("\"s:,\"") })
+ assertTrue(defaultJsons.none { it.contains("\"s:!\"") })
+
+ val flagTar = ensureKrillTar("wud24_default_corenlp_nwt") { outputDir ->
+ arrayOf("-t", "krill", "--non-word-tokens", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
+ }
+ assertTrue(flagTar.exists())
+
+ val flagJsons = readKrillJson(flagTar).values
+ assertTrue(flagJsons.isNotEmpty())
+ assertTrue(flagJsons.any { it.contains("\"s:,\"") })
+ assertTrue(flagJsons.any { it.contains("\"s:!\"") })
+ }
+
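+ // Compare the default output against a tar produced by the Perl reference implementation.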
+ @Test
+ fun krillDefaultMatchesPerlReference() {
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+ val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+ val corenlpZip = loadResource("wud24_sample.corenlp.zip").path
+ val referenceTar = File(loadResource("wud24_sample.wonwtopt.krill.tar").toURI())
+ assertTrue(referenceTar.exists())
+
+ val kotlinTar = ensureKrillTar("wud24_reference_default") { outputDir ->
+ arrayOf("-t", "krill", "-D", outputDir.path, baseZip, spacyZip, marmotMaltZip, treeTaggerZip, corenlpZip)
+ }
+ assertTrue(kotlinTar.exists())
+
+ val kotlinJsons = readKrillJson(kotlinTar)
+ val referenceJsons = readKrillJson(referenceTar)
+
+ assertEquals(referenceJsons.keys, kotlinJsons.keys)
+
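+ // Check presence/absence parity of punctuation surface tokens with the reference output.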
+ val tokensToCheck = listOf("\"s:,\"", "\"s:.\"")
+ referenceJsons.forEach { (doc, referenceJson) ->
+ val kotlinJson = kotlinJsons.getValue(doc)
+ tokensToCheck.forEach { token ->
+ val refHas = referenceJson.contains(token)
+ val kotlinHas = kotlinJson.contains(token)
+ assertEquals(refHas, kotlinHas, "Mismatch for $token in document $doc compared to reference")
+ }
+ }
+ }
+
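+ // Same comparison, but with --non-word-tokens and the matching reference tar.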
+ @Test
+ fun krillNonWordTokensMatchesPerlReference() {
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+ val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+ val corenlpZipNwt = loadResource("wud24_sample.corenlp.zip").path
+ val referenceTar = File(loadResource("wud24_sample.nwt.krill.tar").toURI())
+ assertTrue(referenceTar.exists())
+
+ val kotlinTar = ensureKrillTar("wud24_reference_nwt") { outputDir ->
+ arrayOf(
+ "-t", "krill", "--non-word-tokens", "-D", outputDir.path,
+ baseZip, spacyZip, marmotMaltZip, treeTaggerZip, corenlpZipNwt
+ )
+ }
+ assertTrue(kotlinTar.exists())
+
+ val kotlinJsons = readKrillJson(kotlinTar)
+ val referenceJsons = readKrillJson(referenceTar)
+
+ assertEquals(referenceJsons.keys, kotlinJsons.keys)
+
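+ // Check parity for punctuation tokens, punctuation POS tags, and CoreNLP sentence/span markers.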
+ val tokensToCheck = listOf(
+ "\"s:,\"", "\"s:.\"", "\"s:!\"",
+ "\"marmot/p:\\$,\"", "\"spacy/p:\\$,\"", "\"tt/p:\\$,\"",
+ "\"-:corenlp/sentences\$<i>11\"",
+ "corenlp/s=spans", "corenlp/c=spans"
+ )
+ referenceJsons.forEach { (doc, referenceJson) ->
+ val kotlinJson = kotlinJsons.getValue(doc)
+ tokensToCheck.forEach { token ->
+ val refHas = referenceJson.contains(token)
+ val kotlinHas = kotlinJson.contains(token)
+ assertEquals(refHas, kotlinHas, "Mismatch for $token in document $doc compared to nwt reference")
+ }
+ }
+ }
+}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/NowFormatterTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/NowFormatterTest.kt
new file mode 100644
index 0000000..f5296ee
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/NowFormatterTest.kt
@@ -0,0 +1,78 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.After
+import org.junit.Before
+import java.io.ByteArrayOutputStream
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertContains
+import kotlin.test.assertTrue
+
+/**
+ * Tests for NOW (News on Web) format output (-t now)
+ */
+class NowFormatterTest {
+ private val outContent = ByteArrayOutputStream(10000000)
+ private val errContent = ByteArrayOutputStream()
+ private val originalOut: PrintStream = System.out
+ private val originalErr: PrintStream = System.err
+
+ @Before
+ fun setUpStreams() {
+ System.setOut(PrintStream(outContent))
+ System.setErr(PrintStream(errContent))
+ }
+
+ @After
+ fun restoreStreams() {
+ System.setOut(originalOut)
+ System.setErr(originalErr)
+ }
+
+ private fun loadResource(path: String): URL {
+ val resource = Thread.currentThread().contextClassLoader.getResource(path)
+ requireNotNull(resource) { "Resource $path not found" }
+ return resource
+ }
+
+ @Test
+ fun nowOptionWorks() {
+ val args = arrayOf("-t", "now", loadResource("wdf19.zip").path)
+ debug(args)
+ val output = outContent.toString()
+ assertContains(output, "@@WDF19_A0000.")
+ assertContains(output, " <p> ")
+ assertContains(output, "Arts visuels Pourquoi toujours vouloir")
+ assertTrue(!output.contains("# foundry"))
+ val lines = output.trim().split('\n')
+ assertTrue(lines.all { it.startsWith("@@") })
+ }
+
+ @Test
+ fun canNowLemma() {
+ val args = arrayOf("--lemma", "-t", "now", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ assertContains(out, "@@")
+ assertContains(out, " <p> ")
+ assertContains(out, " mein Ankunft ")
+ }
+
+ @Test
+ fun lemmaOnlyNowWorks() {
+ val args = arrayOf("--lemma-only", "-t", "now", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ assertContains(out, "@@")
+ assertContains(out, " <p> ")
+ }
+
+ @Test
+ fun sequentialOnlyForNowAndW2V() {
+ val args = arrayOf("--sequential", loadResource("wdf19.zip").path)
+ val rc = debug(args)
+ assertTrue(rc != 0)
+ assertContains(errContent.toString(), "--sequential is supported only with -t word2vec or -t now")
+ }
+}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/Word2VecFormatterTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/Word2VecFormatterTest.kt
new file mode 100644
index 0000000..0e676bc
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/Word2VecFormatterTest.kt
@@ -0,0 +1,119 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.After
+import org.junit.Before
+import java.io.ByteArrayOutputStream
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertContains
+import kotlin.test.assertFalse
+import kotlin.test.assertTrue
+
+/**
+ * Tests for Word2Vec format output (-t w2v)
+ */
+class Word2VecFormatterTest {
+ private val outContent = ByteArrayOutputStream(10000000)
+ private val errContent = ByteArrayOutputStream()
+ private val originalOut: PrintStream = System.out
+ private val originalErr: PrintStream = System.err
+
+ @Before
+ fun setUpStreams() {
+ System.setOut(PrintStream(outContent))
+ System.setErr(PrintStream(errContent))
+ }
+
+ @After
+ fun restoreStreams() {
+ System.setOut(originalOut)
+ System.setErr(originalErr)
+ }
+
+ private fun loadResource(path: String): URL {
+ val resource = Thread.currentThread().contextClassLoader.getResource(path)
+ requireNotNull(resource) { "Resource $path not found" }
+ return resource
+ }
+
+ @Test
+ fun deprecatedW2vOptionNoLongerWorks() {
+ val args = arrayOf("-w", loadResource("wdf19.zip").path)
+ val exitCode = debug(args)
+ assertTrue(exitCode != 0, "Old -w option should no longer work in v3.0")
+ }
+
+ @Test
+ fun w2vOptionWorks() {
+ val args = arrayOf("-t", "w2v", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(outContent.toString(), "\nje ne suis pas du tout d'accord !\n")
+ assertFalse { outContent.toString().contains("WDF19_A0000.13865") }
+ }
+
+ @Test
+ fun canHandleInvalidXmlComments() {
+ val zca20scrambled = loadResource("zca20-scrambled.zip").path
+ val args = arrayOf("-t", "w2v", zca20scrambled)
+ debug(args)
+ assertContains(
+ outContent.toString(),
+ "\nDys est yuch dyr Grund dyfür , dyss ys schon myl myhryry Wochyn dyuyrn kynn .\n"
+ )
+ }
+
+ @Test
+ fun canWord2VecLemma() {
+ val args = arrayOf("--lemma", "-t", "w2v", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ assertContains(out, " mein Ankunft ")
+ }
+
+ @Test
+ fun lemmaOnlyWord2VecWorks() {
+ val args = arrayOf("--lemma-only", "-t", "w2v", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ assertTrue(out.contains(" mein ") || out.contains(" Ankunft "))
+ }
+
+ @Test
+ fun w2vCanExtractMetadata() {
+ val args = arrayOf(
+ "-t", "w2v",
+ "-m", "<textSigle>([^<]+)",
+ "-m", "<creatDate>([^<]+)",
+ loadResource("wdf19.zip").path
+ )
+ debug(args)
+ assertContains(
+ outContent.toString(),
+ "WDF19/A0000.12006\t2011.08.11\tmerci pour l'info je suis curieux !"
+ )
+ }
+
+ @Test
+ fun w2vCanHandleNonBmpText() {
+ val wdd17 = loadResource("wdd17sample.zip").path
+ val args = arrayOf("-t", "w2v", wdd17)
+ debug(args)
+ assertContains(outContent.toString(), "\n-- mach \uD83D\uDE48 \uD83D\uDE49 \uD83D\uDE4A 20 : 45 , 1. Feb .\n")
+ assertContains(outContent.toString(), "\nBereinige wenigstens die allergröbsten Sachen .\n")
+ }
+
+ @Test
+ fun w2vExcludeZipGlobSkipsFiles() {
+ val args = arrayOf(
+ "--exclude-zip-glob", "goe.zip",
+ "-t", "w2v",
+ loadResource("wdf19.zip").path,
+ loadResource("goe.zip").path
+ )
+ debug(args)
+ val out = outContent.toString()
+ assertContains(out, "automatique")
+ assertFalse(out.contains("Gedanken"))
+ }
+}