Implement --extract-metadata-regex (-m) option

Change-Id: I3fb3a78787d62c88d7e885e268393d2ad4408cfa
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 7534df7..fbd0d09 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -94,7 +94,7 @@
     @Option(
         names = ["--extract-metadata-regex", "-m"],
         paramLabel = "REGEX",
-        description = ["Not yet implemented: extract metadata regex"]
+        description = ["Extract metadata regexes.\nExample: -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)'"]
     )
     var extractMetadataRegex: MutableList<String> = mutableListOf()
 
@@ -130,6 +130,7 @@
 
     private var workerPool : WorkerPool? = null
 
+    val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
     fun korapxml2conllu(args: Array<String>) {
         val executor: ExecutorService = Executors.newFixedThreadPool(threads)
         val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
@@ -274,10 +275,30 @@
                                 }
                             }
 
-                            if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null && (!waitForMorpho || morpho[docId] != null)) {
+                            if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
+                                && (!waitForMorpho || morpho[docId] != null)
+                                && (extractMetadataRegex.isEmpty() || metadata.containsKey(docId))
+                                ) {
                                 processText(tokens, docId, sentences, texts, foundry, fname, waitForMorpho, morpho)
 
                             }
+                        } else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
+                            //LOGGER.info("Processing header file: " + zipEntry.name)
+                            val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }
+                            val docId =
+                                Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1()
+                                    ?.replace(Regex("/"), "_")
+                            LOGGER.info("Processing header file: " + zipEntry.name + " docId: " + docId)
+                            val meta = ArrayList<String>()
+                            extractMetadataRegex.forEach { regex ->
+                                val match = Regex(regex).find(text)
+                                if (match != null) {
+                                    meta.add(match.destructured.component1())
+                                }
+                            }
+                            if (meta.isNotEmpty() && docId != null) {
+                                metadata[docId] = meta.toTypedArray()
+                            }
                         }
                     } catch (e: Exception) {
                         e.printStackTrace()
@@ -305,7 +326,9 @@
         val output: StringBuilder
         if (lmTrainingData) {
             output = StringBuilder()
-
+            if (extractMetadataRegex.isNotEmpty()) {
+                output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
+            }
             tokens[docId]?.forEach { span ->
                 token_index++
                 if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -314,6 +337,9 @@
                     } else {
                         output.append("\n")
                     }
+                    if (extractMetadataRegex.isNotEmpty() && real_token_index < tokens[docId]!!.size - 1) {
+                        output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
+                    }
                     sentence_index++
                 }
                 output.append(texts[docId]!!.substring(span.from, span.to), " ")
@@ -329,6 +355,9 @@
                         sentences, docId, sentence_index, real_token_index, tokens
                     )
                 )
+            if (extractMetadataRegex.isNotEmpty()) {
+                output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
+            }
             tokens[docId]?.forEach { span ->
                 token_index++
                 if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -377,7 +406,7 @@
             }
         }
 
-        arrayOf(tokens, texts, sentences, morpho, fname).forEach { map ->
+        arrayOf(tokens, texts, sentences, morpho, fname, metadata).forEach { map ->
             map.remove(docId)
         }
     }
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index c252fd4..6131fd2 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -166,6 +166,16 @@
             { outContent.toString().count { it == '\n'} >= 61511 })
     }
 
+    @Test
+    fun canExtractMetadata() {
+        // One -m regex per metadata field; the captured values are expected tab-separated before the sentence.
+        val wdfArchive = loadResource("wdf19.zip").path
+        val cliArgs = arrayOf("--word2vec", "-m", "<textSigle>([^<]+)", "-m", "<creatDate>([^<]+)", wdfArchive)
+        val expectedLine = "WDF19/A0000.12006\t2011.08.11\tmerci pour l'info je suis curieux !"
+        debug(cliArgs)
+        assertContains(outContent.toString(), expectedLine)
+    }
+
     @Ignore("for some reason not working")
     fun canConvertMorphoFeatureAnnotations() {
         val args = arrayOf(goe, goeMarmot)