Add option to write KorAP-XML-ZIP directly

Change-Id: Icdca5c08620971a06813c47d14317e8a9990c033
diff --git a/app/build.gradle b/app/build.gradle
index 2f74157..4209307 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -54,6 +54,8 @@
     implementation 'org.maltparser:maltparser:1.9.2'
     implementation 'org.apache.opennlp:opennlp-tools:2.3.2'
     implementation 'org.slf4j:slf4j-simple:2.1.0-alpha1'
+    implementation 'org.apache.ant:ant:1.10.14'
+
 }
 
 
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 4753f40..44de0c2 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -9,8 +9,11 @@
 import org.xml.sax.SAXParseException
 import picocli.CommandLine
 import picocli.CommandLine.*
+import java.io.ByteArrayOutputStream
 import java.io.File
 import java.io.InputStream
+import java.io.StringWriter
+import java.lang.Integer.parseInt
 import java.util.*
 import java.util.concurrent.Callable
 import java.util.concurrent.ConcurrentHashMap
@@ -23,12 +26,19 @@
 import java.util.regex.Pattern
 import java.util.stream.IntStream
 import java.util.zip.ZipEntry
+
 import java.util.zip.ZipFile
+import java.util.zip.ZipOutputStream
 import javax.xml.parsers.DocumentBuilder
 import javax.xml.parsers.DocumentBuilderFactory
+import javax.xml.transform.OutputKeys
+import javax.xml.transform.TransformerFactory
+import javax.xml.transform.dom.DOMSource
+import javax.xml.transform.stream.StreamResult
 import kotlin.math.min
 import kotlin.system.exitProcess
 
+val ZIP_ENTRY_UNIX_MODE = parseInt("644", 8) // "644" parsed as octal, i.e. permission bits rw-r--r--
 
 @Command(
     name = "KorapXml2Conllu",
@@ -48,6 +58,27 @@
     var zipFileNames: Array<String>? = null
 
     @Option(
+        names = ["-f", "--output-format"],
+        description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}",
+            "conllu: CoNLL-U format",
+            "korapxml, xml, zip: KorAP-XML format zip",
+            "word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines",
+        ],
+        converter = [OutputFormatConverter::class]
+    )
+    var outputFormat: OutputFormat = OutputFormat.CONLLU
+    /** Maps the `-f`/`--output-format` argument, case-insensitively, to an [OutputFormat]. */
+    class OutputFormatConverter : ITypeConverter<OutputFormat> {
+        // Locale.ROOT: with a Turkish default locale "ZIP".lowercase() yields "zıp" and would not match.
+        override fun convert(value: String?): OutputFormat = when (value?.lowercase(Locale.ROOT)) {
+            "conllu", "conll" -> OutputFormat.CONLLU
+            "word2vec", "w2v" -> OutputFormat.WORD2VEC
+            "korapxml", "korap", "xml", "zip" -> OutputFormat.KORAPXML
+            else -> throw IllegalArgumentException("Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(", ") { it.name }}")
+        }
+    }
+
+    @Option(
         names = ["--sigle-pattern", "-p"],
         paramLabel = "PATTERN",
         description = ["Extract only documents with sigle matching the pattern (regex)"]
@@ -83,9 +114,14 @@
 
     @Option(
         names = ["--word2vec", "-w"],
-        description = ["Print text in LM training format: tokens separated by space, sentences separated by newline"]
+        description = ["Print text in LM training format: tokens separated by space, sentences separated by newline",
+            "Deprecated: use -f word2vec"]
     )
-    var lmTrainingData: Boolean = false
+    fun setWord2Vec(word2vec: Boolean) {
+        // Deprecated alias of `-f word2vec`: only a true flag switches the format;
+        // a false value leaves the currently selected output format untouched.
+        if (word2vec) outputFormat = OutputFormat.WORD2VEC
+    }
 
     @Option(
         names = ["--token-separator", "-s"],
@@ -93,7 +129,7 @@
         defaultValue = "\n",
         description = ["Token separator. Default: new-line for CoNLL-U, space for word2vec format."]
     )
-    var tokenSeparator: String = if (lmTrainingData) " " else "\n"
+    var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC) " " else "\n"
 
     @Option(names = ["--offsets"], description = ["Not yet implemented: offsets"])
     var offsets: Boolean = false
@@ -120,6 +156,7 @@
         paramLabel = "THREADS",
         description = ["Maximum number of threads to use. Default: ${"$"}{DEFAULT-VALUE}"]
     )
+    var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2
     fun setThreads(threads: Int) {
         if (threads < 1) {
             throw ParameterException(spec.commandLine(), String.format("Invalid value `%d' for option '--threads': must be at least 1", threads))
@@ -127,7 +164,6 @@
         this.maxThreads = threads
         System.setProperty("java.util.concurrent.ForkJoinPool.common.parallelism", threads.toString())
     }
-    var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2
 
     private var taggerName: String? = null
     private var taggerModel: String? = null
@@ -216,6 +252,11 @@
     var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap()
     var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap()
 
+    var dbFactory: DocumentBuilderFactory? = null
+    var dBuilder: DocumentBuilder? = null
+    var byteArrayOutputStream: ByteArrayOutputStream? = null
+    var morphoZipOutputStream: ZipOutputStream? = null
+
     fun String.hasCorrespondingBaseZip(): Boolean {
         if (!this.matches(Regex(".*\\.([^/.]+)\\.zip$"))) return false
         val baseZip = this.replace(Regex("\\.([^/.]+)\\.zip$"), ".zip")
@@ -229,6 +270,10 @@
     }
 
     fun korapxml2conllu(args: Array<String>) {
+        if (outputFormat == OutputFormat.KORAPXML && annotateWith.isNotEmpty()) {
+            LOGGER.severe("Shell command annotation is not yet supported with output format $outputFormat")
+            exitProcess(1)
+        }
         Executors.newFixedThreadPool(maxThreads)
 
         if (annotateWith.isNotEmpty()) {
@@ -284,6 +329,12 @@
 
     private fun processZipFile(zipFilePath: String, foundry: String = "base") {
         LOGGER.info("Processing ${zipFilePath} in thread ${Thread.currentThread().id}")
+        if (outputFormat == OutputFormat.KORAPXML && dbFactory == null) {
+            dbFactory = DocumentBuilderFactory.newInstance()
+            dBuilder = dbFactory!!.newDocumentBuilder()
+            byteArrayOutputStream = ByteArrayOutputStream()
+            morphoZipOutputStream = ZipOutputStream(byteArrayOutputStream!!)
+        }
         if (zipFilePath.hasCorrespondingBaseZip()) {
             val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)
             Arrays.stream(zips).parallel().forEach { zip ->
@@ -302,6 +353,11 @@
                     }
             }
         }
+        if (outputFormat == OutputFormat.KORAPXML) {
+            morphoZipOutputStream!!.close()
+            val outputMorphoZipFileName = zipFilePath.replace(Regex("\\.zip$"), ".".plus(getMorphoFoundry()).plus(".zip"))
+            File(outputMorphoZipFileName).writeBytes(byteArrayOutputStream!!.toByteArray())
+        }
     }
 
     private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
@@ -438,24 +494,143 @@
         docId: String,
         foundry: String,
     ) {
-        var output =
-        if (lmTrainingData) {
+        var morphoFoundry = getMorphoFoundry()
+        val output =
+        if (outputFormat == OutputFormat.WORD2VEC) {
             lmTrainingOutput(docId)
         } else {
-            conlluOutput(foundry, docId)
+            if (taggerToolBridges[Thread.currentThread().id] != null) {
+                morpho[docId] = taggerToolBridges[Thread.currentThread().id]!!.tagText(
+                    tokens[docId]!!,
+                    sentences[docId],
+                    texts[docId]!!
+                )
+                if (parserToolBridges[Thread.currentThread().id] != null) {
+                    morpho[docId] = parserToolBridges[Thread.currentThread().id]!!.parseText(
+                        tokens[docId]!!,
+                        morpho[docId],
+                        sentences[docId],
+                        texts[docId]!!
+                    )
+                }
+            }
+            if (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null) {
+                korapXmlOutput(getMorphoFoundry(), docId)
+            } else {
+                conlluOutput(foundry, docId)
+            }
         }
 
         if (annotationWorkerPool != null) {
             annotationWorkerPool?.pushToQueue(output.append("\n# eot\n").toString())
-        } else {
+        } else if (outputFormat != OutputFormat.KORAPXML) {
             synchronized(System.out) {
                 println(output.toString())
             }
+        } else {
+            korapXmlOutput(foundry, docId)
         }
 
+
         arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map ->
             map.remove(docId)
         }
+
+        if (outputFormat == OutputFormat.KORAPXML) {
+            val entryPath = docId.replace(Regex("[_.]"), "/").plus("/$morphoFoundry/").plus("morpho.xml")
+            val zipEntry = ZipEntry(entryPath)
+            // val zipEntry = org.apache.tools.zip.ZipEntry(entryPath)
+            // zipEntry.unixMode = 65535
+            synchronized(morphoZipOutputStream!!) {
+                morphoZipOutputStream!!.putNextEntry(zipEntry)
+                morphoZipOutputStream!!.write(output.toString().toByteArray())
+                morphoZipOutputStream!!.closeEntry()
+            }
+            output.clear()
+        }
+    }
+
+    private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().id]?.foundry ?: "base" // foundry of the current thread's tagger bridge; "base" when no bridge (or no foundry) is available
+
+    /**
+     * Builds the KorAP-XML `morpho.xml` layer for [docId] from the entries of `morpho[docId]`.
+     *
+     * Each entry becomes one `<span>`; its `from`/`to` attributes are the two parts of the
+     * entry key split at "-". lemma, upos, pos (xpos) and msd (feats) are written as `<f>`
+     * features unless the value is the placeholder "_"; misc is written as `certainty` only
+     * when it consists solely of digits and dots.
+     *
+     * @param foundry not used by this function
+     * @param docId key of the document in the per-document maps
+     * @return the indented XML text, including the XML declaration
+     */
+    private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {
+        // NOTE(review): dBuilder is a shared field and JAXP DocumentBuilder instances are not
+        // guaranteed to be thread-safe - confirm that concurrent callers cannot race here.
+        val builder = checkNotNull(dBuilder) { "dBuilder must be initialized before writing KorAP-XML" }
+        val doc: Document = builder.newDocument()
+
+        // Root element
+        val layer = doc.createElement("layer")
+        layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
+        layer.setAttribute("version", "KorAP-0.4")
+        layer.setAttribute("docid", docId)
+        doc.appendChild(layer)
+
+        val spanList = doc.createElement("spanList")
+        layer.appendChild(spanList)
+
+        // Compiled once per document instead of once per token.
+        val certaintyPattern = Regex("^[0-9.]+$")
+        var i = 0
+        morpho[docId]?.forEach { (spanString, mfs) ->
+            i++
+            val offsets = spanString.split("-")
+            val spanNode = doc.createElement("span")
+            spanNode.setAttribute("id", "t_$i")
+            spanNode.setAttribute("from", offsets[0])
+            spanNode.setAttribute("to", offsets[1])
+
+            // Wrapper structure: <fs type="lex"><f name="lex"><fs>...features...</fs></f></fs>
+            val fs = doc.createElement("fs")
+            fs.setAttribute("type", "lex")
+            fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
+            spanNode.appendChild(fs)
+            val f = doc.createElement("f")
+            f.setAttribute("name", "lex")
+            fs.appendChild(f)
+            val innerFs = doc.createElement("fs")
+            f.appendChild(innerFs)
+
+            // Appends <f name="...">value</f> to innerFs unless value is the placeholder "_".
+            fun addFeature(name: String, value: String?) {
+                if (value == "_") return
+                val innerF = doc.createElement("f")
+                innerF.setAttribute("name", name)
+                innerF.textContent = value
+                innerFs.appendChild(innerF)
+            }
+
+            addFeature("lemma", mfs.lemma)
+            addFeature("upos", mfs.upos)
+            addFeature("pos", mfs.xpos)
+            addFeature("msd", mfs.feats)
+            // Safe call instead of `!!`: a null misc is skipped instead of throwing an NPE.
+            // "_" never matches the pattern, so no separate placeholder check is needed.
+            mfs.misc?.takeIf { it.matches(certaintyPattern) }?.let { addFeature("certainty", it) }
+
+            spanList.appendChild(spanNode)
+        }
+
+        // Serialize the DOM with indentation and an XML declaration.
+        val transformer = TransformerFactory.newInstance().newTransformer()
+        transformer.setOutputProperty(OutputKeys.INDENT, "yes")
+        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
+        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1")
+        val xmlWriter = StringWriter()
+        transformer.transform(DOMSource(doc), StreamResult(xmlWriter))
+
+        return StringBuilder(xmlWriter.toString())
+    }
 
     private fun conlluOutput(foundry: String, docId: String): StringBuilder {
@@ -473,21 +648,6 @@
             output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
         }
         var previousSpanStart = 0
-        if (taggerToolBridges[Thread.currentThread().id] != null) {
-            morpho[docId] = taggerToolBridges[Thread.currentThread().id]!!.tagText(
-                tokens[docId]!!,
-                sentences[docId],
-                texts[docId]!!
-            )
-            if (parserToolBridges[Thread.currentThread().id] != null) {
-                morpho[docId] = parserToolBridges[Thread.currentThread().id]!!.parseText(
-                    tokens[docId]!!,
-                    morpho[docId],
-                    sentences[docId],
-                    texts[docId]!!
-                )
-            }
-        }
         tokens[docId]?.forEach { span ->
             token_index++
             if (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to) {
@@ -747,3 +907,21 @@
 fun debug(args: Array<String>): Int {
     return (CommandLine(KorapXml2Conllu()).execute(*args))
 }
+
+/** Output formats selectable with `-f` / `--output-format`;
+ *  the option field defaults to [CONLLU]. */
+enum class OutputFormat { CONLLU, WORD2VEC, KORAPXML }
+
+/** Canonical name of the CoNLL-U output format; interpolated into the
+ *  description of the `--output-format` option. */
+object ConlluOutputFormat { const val NAME = "conllu" }
+
+/** Canonical name of the word2vec output format; interpolated into the
+ *  description of the `--output-format` option. */
+object Word2VecOutputFormat { const val NAME = "word2vec" }
+
+/** Canonical name of the KorAP-XML output format; interpolated into the
+ *  description of the `--output-format` option. */
+object KorapXmlOutputFormat { const val NAME = "korapxml" }
+
+
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index cf4941a..9c8fe4d 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -136,7 +136,7 @@
     }
 
     @Test
-    fun w2vOptionWorks() {
+    fun deprecatedW2vOptionWorks() {
         val args = arrayOf("-w", loadResource("wdf19.zip").path)
         debug(args)
         assertContains(
@@ -147,6 +147,17 @@
     }
 
     @Test
+    fun w2vOptionWorks() {
+        // Selects the word2vec format through `-f w2v` (the deprecated `-w` flag has its own test).
+        debug(arrayOf("-f", "w2v", loadResource("wdf19.zip").path))
+        val printed = outContent.toString()
+        // The space-separated sentence must appear on a line of its own ...
+        assertContains(printed, "\nje ne suis pas du tout d'accord !\n")
+        // ... and this id-like string must not show up anywhere in the output.
+        assertFalse { printed.contains("WDF19_A0000.13865") }
+    }
+
+    @Test
     fun canConvertXMLwithInvalidComments() {
         val args = arrayOf("-w", zca20scrambled)
         debug(args)