Implement --extract-attributes-regex

Change-Id: I09030be8ec85173b191c71fcb535886531012615
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 8a75ae7..bc12b82 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -49,7 +49,8 @@
     @Option(
         names = ["--extract-attributes-regex", "-e"],
         paramLabel = "REGEX",
-        description = ["Not yet implemented: extract attributes regex"]
+        description = ["Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.",
+            "Example: -e '(posting/id|div/id)'"]
     )
     var extractAttributesRegex: String = ""
 
@@ -136,6 +137,7 @@
     val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap()
     val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
     val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
+    val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
     var waitForMorpho: Boolean = false
     fun korapxml2conllu(args: Array<String>) {
         val executor: ExecutorService = Executors.newFixedThreadPool(threads)
@@ -241,7 +243,10 @@
 
                                 "structure.xml" -> {
                                     val spans: NodeList = doc.getElementsByTagName("span")
+                                    if (extractAttributesRegex.isNotEmpty())
+                                        extraFeatures[docId] = extractMiscSpans(spans)
                                     sentences[docId] = extractSentenceSpans(spans)
+
                                 }
 
                                 "tokens.xml" -> {
@@ -339,6 +344,7 @@
             if (extractMetadataRegex.isNotEmpty()) {
                 output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
             }
+            var previousSpanStart = 0
             tokens[docId]?.forEach { span ->
                 token_index++
                 if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -351,8 +357,18 @@
                         )
                     )
                 }
+                if (extractAttributesRegex.isNotEmpty() && extraFeatures[docId] != null) {
+                    for (i in previousSpanStart until span.from+1) {
+                        if (extraFeatures[docId]?.containsKey("$i") == true) {
+                            output.append(extraFeatures[docId]!!["$i"])
+                            extraFeatures[docId]!!.remove("$i")
+                        }
+                    }
+                    previousSpanStart = span.from+1
+                }
                 if (waitForMorpho && morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
                     val mfs = morpho[docId]!!["${span.from}-${span.to}"]
+
                     output.append(
                         printConlluToken(
                             token_index,
@@ -387,7 +403,7 @@
             }
         }
 
-        arrayOf(tokens, texts, sentences, morpho, fnames, metadata).forEach { map ->
+        arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map ->
             map.remove(docId)
         }
     }
@@ -487,6 +503,57 @@
             }.toArray { size -> arrayOfNulls(size) }
     }
 
+    /*
+     <span id="s15" from="370" to="394" l="5">
+      <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+        <f name="name">posting</f>
+        <f name="attr">
+          <fs type="attr">
+            <f name="id">i.10894_1_3</f>
+            <f name="indentLevel">0</f>
+            <f name="who">WU00000000</f>
+          </fs>
+        </f>
+      </fs>
+    </span>
+
+     */
+    private fun extractMiscSpans(spans: NodeList): MutableMap<String, String> {
+        val miscLocal: MutableMap<String, String> = HashMap()
+
+        IntStream.range(0, spans.length).mapToObj(spans::item)
+            .filter { node ->
+                node is Element
+                        && node.getElementsByTagName("f").length > 1
+                        && (node.getElementsByTagName("f").item(0) as Element).getAttribute("name").equals("name")
+                        && (node.getElementsByTagName("f").item(1) as Element).getAttribute("name").equals("attr")
+            }
+            .forEach { node ->
+                if (node == null) return@forEach
+                val elementName = (node as Element).getElementsByTagName("f").item(0).textContent.trim()
+                val from = node.getAttribute("from")
+                val attributes = (node.getElementsByTagName("f").item(1) as Element).getElementsByTagName("f")
+                val res = StringBuilder()
+                IntStream.range(0, attributes.length).mapToObj(attributes::item).forEach { attr ->
+                    val attrName = "$elementName/${(attr as Element).getAttribute("name")}"
+                    if (attrName.matches(Regex(extractAttributesRegex))) {
+                         res.append("# $attrName = ${attr.textContent}\n")
+                        //LOGGER.info("" + from + ": $attrName = " + attr.textContent)
+                    }
+
+                }
+                if (res.isNotEmpty()) {
+                    if (miscLocal.containsKey(from)) {
+                        // LOGGER.info("ADDING TO $from: ${miscLocal[from]}")
+                        miscLocal[from] += res.toString()
+                    } else {
+                        miscLocal[from] = res.toString()
+                    }
+                }
+            }
+        return miscLocal
+    }
+
 
     class Span(var from: Int, var to: Int)
 
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index 6131fd2..b44e337 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -176,6 +176,20 @@
         )
     }
 
+    @Test
+    fun canExtractExtraFeaturesByRegex() {
+        val args = arrayOf("-e" ,"(posting/id|div/id)",loadResource("wdf19.zip").path)
+        debug(args)
+        assertContains(
+            outContent.toString(),
+            "12\t)\t_\t_\t_\t_\t_\t_\t_\t_\n" +
+                    "# div/id = i.14293_8\n" +
+                    "13\tDifférentiation\t_\t_\t_\t_\t_\t_\t_\t_\n" +
+                    "# posting/id = i.14293_8_1\n" +
+                    "14\tAinsi\t_\t_\t_\t_\t_\t_\t_\t_\n"
+        )
+    }
+
     @Ignore("for some reason not working")
     fun canConvertMorphoFeatureAnnotations() {
         val args = arrayOf(goe, goeMarmot)