Add support for CoreNLP SR parser and fast tagger

Change-Id: Ifa8e49b9145934f028a8a322b07dd9f601661fb7
diff --git a/app/build.gradle b/app/build.gradle
index f1c8c15..c50eecb 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -42,6 +42,7 @@
     implementation 'com.github.kupietz:cistern:v1.0.4'
     implementation 'org.maltparser:maltparser:1.9.2'
     implementation 'org.apache.opennlp:opennlp-tools:2.5.6'
+    implementation 'edu.stanford.nlp:stanford-corenlp:3.9.2'
     implementation 'org.slf4j:slf4j-simple:2.0.17'
     implementation 'org.apache.ant:ant:1.10.15'
     implementation 'org.apache.commons:commons-compress:1.28.0'
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index a99a7bc..c7ae61d 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -18,18 +18,40 @@
 
 class AnnotationToolBridgeFactory {
     companion object {
-        const val taggerFoundries = "marmot|opennlp"
-        const val parserFoundries = "malt"
+        const val taggerFoundries = "marmot|opennlp|corenlp"
+        const val parserFoundries = "malt|corenlp"
 
         fun getAnnotationToolBridge(foundry: String, model: String, LOGGER: Logger): AnnotationToolBridge? {
             when (foundry) {
                 "marmot" -> return MarmotBridge(model, LOGGER)
                 "opennlp" -> return OpenNlpBridge(model, LOGGER)
                 "malt" -> return MaltParserBridge(model, LOGGER)
+                "corenlp" -> return CoreNLPBridge(model, LOGGER)
+                else -> LOGGER.severe("Unknown tagger/parser $foundry")
+            }
+            return null
+        }
+
+        // Get a tagger specifically
+        fun getTagger(foundry: String, model: String, LOGGER: Logger): TaggerToolBridge? {
+            when (foundry) {
+                "marmot" -> return MarmotBridge(model, LOGGER)
+                "opennlp" -> return OpenNlpBridge(model, LOGGER)
+                "corenlp" -> return CoreNLPTaggerBridge(model, LOGGER)
                 else -> LOGGER.severe("Unknown tagger $foundry")
             }
             return null
         }
+
+        // Get a parser specifically (dependency or constituency)
+        fun getParser(foundry: String, model: String, LOGGER: Logger, taggerModel: String? = null): AnnotationToolBridge? {
+            when (foundry) {
+                "malt" -> return MaltParserBridge(model, LOGGER)
+                "corenlp" -> return CoreNLPBridge(model, LOGGER, taggerModel)
+                else -> LOGGER.severe("Unknown parser $foundry")
+            }
+            return null
+        }
     }
 }
 
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index f2f6241..fa16e9a 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -433,6 +433,7 @@
     val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
     val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
     val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap()
+    val constituencyTrees: ConcurrentHashMap<String, List<ConstituencyParserBridge.ConstituencyTree>> = ConcurrentHashMap()
     val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
     val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
     val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
@@ -444,6 +445,7 @@
     private var progressBar: ProgressBar? = null
     var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge> = ConcurrentHashMap()
     var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge> = ConcurrentHashMap()
+    var constituencyParserBridges: ConcurrentHashMap<Long, ConstituencyParserBridge> = ConcurrentHashMap()
 
     // Zip progress tracking for logging (zipNumber/zipTotal)
     private val zipOrdinals: ConcurrentHashMap<String, Int> = ConcurrentHashMap()
@@ -484,10 +486,10 @@
             trySetFeature("http://xml.org/sax/features/external-general-entities", false)
             trySetFeature("http://xml.org/sax/features/external-parameter-entities", false)
             try {
-                setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, "")
+                setAttribute("http://javax.xml.XMLConstants/property/accessExternalDTD", "")
             } catch (_: Exception) {}
             try {
-                setAttribute(XMLConstants.ACCESS_EXTERNAL_SCHEMA, "")
+                setAttribute("http://javax.xml.XMLConstants/property/accessExternalSchema", "")
             } catch (_: Exception) {}
         }
     }
@@ -1072,7 +1074,7 @@
             var targetFoundry = "base"
             val labelParts = mutableListOf<String>()
             if (taggerName != null) {
-                val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
+                val tagger = AnnotationToolBridgeFactory.getTagger(taggerName!!, taggerModel!!, LOGGER)
                 if (tagger != null) {
                     labelParts.add(tagger.foundry)
                 }
@@ -1338,19 +1340,31 @@
         var waitForMorpho = passedWaitForMorpho
         LOGGER.finer("Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}")
         if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().threadId())) {
-            val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
+            val tagger = AnnotationToolBridgeFactory.getTagger(taggerName!!, taggerModel!!, LOGGER)
             if (tagger != null) {
                 taggerToolBridges[Thread.currentThread().threadId()] = tagger
                 foundry = tagger.foundry
             }
 
         }
-        if (parserName != null && !parserToolBridges.containsKey(Thread.currentThread().threadId())) {
-            val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?
-            if (parser != null) {
-                parserToolBridges[Thread.currentThread().threadId()] = parser
-                foundry = "$foundry dependency:${parser.foundry}"
-                LOGGER.fine("Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}")
+        if (parserName != null && !parserToolBridges.containsKey(Thread.currentThread().threadId()) && !constituencyParserBridges.containsKey(Thread.currentThread().threadId())) {
+            // If both tagger and parser are CoreNLP, pass tagger model to parser for POS tagging
+            val taggerModelForParser = if (parserName == "corenlp" && taggerName == "corenlp") taggerModel else null
+            val parser = AnnotationToolBridgeFactory.getParser(parserName!!, parserModel!!, LOGGER, taggerModelForParser)
+            when (parser) {
+                is ParserToolBridge -> {
+                    parserToolBridges[Thread.currentThread().threadId()] = parser
+                    foundry = "$foundry dependency:${parser.foundry}"
+                    LOGGER.fine("Initialized dependency parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}")
+                }
+                is ConstituencyParserBridge -> {
+                    constituencyParserBridges[Thread.currentThread().threadId()] = parser
+                    foundry = "${parser.foundry}"
+                    LOGGER.fine("Initialized constituency parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}")
+                }
+                else -> {
+                    LOGGER.warning("Parser ${parserName} returned null or unknown type")
+                }
             }
         }
 
@@ -1361,6 +1375,9 @@
         parserToolBridges[Thread.currentThread().threadId()]?.let { activeParser ->
             foundry = "$foundry dependency:${activeParser.foundry}"
         }
+        constituencyParserBridges[Thread.currentThread().threadId()]?.let { activeParser ->
+            foundry = "${activeParser.foundry}"
+        }
 
         try {
             if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency|sentences|constituency)\\.xml$"))) {
@@ -1675,6 +1692,17 @@
                 )
                 LOGGER.finer("Parsed text: $docId in thread ${Thread.currentThread().threadId()}")
             }
+            if (constituencyParserBridges[Thread.currentThread().threadId()] != null) {
+                LOGGER.finer("Constituency parsing text: $docId in thread ${Thread.currentThread().threadId()}")
+                val trees = constituencyParserBridges[Thread.currentThread().threadId()]!!.parseConstituency(
+                    tokens[docId]!!,
+                    morpho[docId],
+                    sentences[docId],
+                    texts[docId]!!
+                )
+                constituencyTrees[docId] = trees
+                LOGGER.finer("Constituency parsed text: $docId, generated ${trees.size} trees in thread ${Thread.currentThread().threadId()}")
+            }
             if (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null) {
                 korapXmlOutput(getMorphoFoundry(), docId)
             } else {
@@ -1741,6 +1769,20 @@
                  }
                  wroteOne = true
              }
+             // Write constituency.xml if a constituency parser is active
+             if (constituencyParserBridges[Thread.currentThread().threadId()] != null && constituencyTrees[docId] != null) {
+                val constDir = constituencyParserBridges[Thread.currentThread().threadId()]!!.foundry
+                val constXml = korapXmlConstituencyOutput(constDir, docId).toString()
+                val constPath = docId.replace(Regex("[_.]"), "/") + "/$constDir/constituency.xml"
+                 val constEntry = ZipArchiveEntry(constPath)
+                 constEntry.unixMode = ZIP_ENTRY_UNIX_MODE
+                 synchronized(morphoZipOutputStream!!) {
+                     morphoZipOutputStream!!.putArchiveEntry(constEntry)
+                     morphoZipOutputStream!!.write(constXml.toByteArray())
+                     morphoZipOutputStream!!.closeArchiveEntry()
+                 }
+                 wroteOne = true
+             }
              output.clear()
              // Track written docs once per document and update progress like with --annotate-with
              val written = if (wroteOne) docsWrittenToZip.incrementAndGet() else docsWrittenToZip.get()
@@ -1761,7 +1803,7 @@
          }
 
         // Release per-document data to free memory early
-        arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map ->
+        arrayOf(tokens, texts, sentences, morpho, constituencyTrees, fnames, metadata, extraFeatures).forEach { map ->
             if (map === morpho) {
                 morpho[docId]?.clear()
             }
@@ -1880,6 +1922,79 @@
         return StringBuilder(streamResult.writer.toString())
     }
 
+    private fun korapXmlConstituencyOutput(foundry: String, docId: String): StringBuilder {
+        val doc: Document = dBuilder!!.newDocument()
+
+        // Root element
+        val layer = doc.createElement("layer")
+        layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
+        layer.setAttribute("version", "KorAP-0.4")
+        layer.setAttribute("docid", docId)
+        doc.appendChild(layer)
+
+        val spanList = doc.createElement("spanList")
+        layer.appendChild(spanList)
+
+        val trees = constituencyTrees[docId]
+        if (trees == null || trees.isEmpty()) {
+            LOGGER.warning("No constituency trees found for $docId")
+            return StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+        }
+
+        // Process each tree
+        trees.forEach { tree ->
+            tree.nodes.forEach { node ->
+                // Create span element
+                val spanNode = doc.createElement("span")
+                spanNode.setAttribute("id", node.id)
+                spanNode.setAttribute("from", node.from.toString())
+                spanNode.setAttribute("to", node.to.toString())
+
+                // Create fs element for the constituency label
+                val fs = doc.createElement("fs")
+                fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
+                fs.setAttribute("type", "node")
+
+                val f = doc.createElement("f")
+                f.setAttribute("name", "const")
+                f.textContent = node.label
+                fs.appendChild(f)
+
+                spanNode.appendChild(fs)
+
+                // Add rel elements for children
+                node.children.forEach { child ->
+                    val rel = doc.createElement("rel")
+                    rel.setAttribute("label", "dominates")
+
+                    when (child) {
+                        is ConstituencyParserBridge.ConstituencyChild.NodeRef -> {
+                            rel.setAttribute("target", child.targetId)
+                        }
+                        is ConstituencyParserBridge.ConstituencyChild.MorphoRef -> {
+                            rel.setAttribute("uri", "morpho.xml#${child.morphoId}")
+                        }
+                    }
+
+                    spanNode.appendChild(rel)
+                }
+
+                spanList.appendChild(spanNode)
+            }
+        }
+
+        val transformerFactory = TransformerFactory.newInstance()
+        val transformer = transformerFactory.newTransformer()
+        transformer.setOutputProperty(OutputKeys.INDENT, "yes")
+        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
+        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3")
+        val domSource = DOMSource(doc)
+        val streamResult = StreamResult(StringWriter())
+        transformer.transform(domSource, streamResult)
+
+        return StringBuilder(streamResult.writer.toString())
+    }
+
     private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {
         return if (parserName != null) {
             korapXmlDependencyOutput(foundry, docId)