Allow for multiple zips as arguments

Change-Id: Ic314380e43ff852c235621932e965f879d3387d6
diff --git a/app/build.gradle b/app/build.gradle
index d9c81a4..693a7be 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -22,6 +22,12 @@
     maven { url 'https://jitpack.io' }
 }
 
+test {
+    minHeapSize = "1000m" // initial heap size of the JVM that runs the tests
+    maxHeapSize = "8000m" // maximum heap size of the JVM that runs the tests
+    jvmArgs '-XX:MaxMetaspaceSize=8000m' // upper bound for the test JVM's metaspace (class metadata), separate from the heap
+}
+
 dependencies {
     // Align versions of all Kotlin components
     implementation platform('org.jetbrains.kotlin:kotlin-bom')
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 28d0fd2..01ebd58 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -22,6 +22,7 @@
 import java.util.regex.Matcher
 import java.util.regex.Pattern
 import java.util.stream.IntStream
+import java.util.zip.ZipEntry
 import java.util.zip.ZipFile
 import javax.xml.parsers.DocumentBuilder
 import javax.xml.parsers.DocumentBuilderFactory
@@ -212,9 +213,21 @@
     val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
     val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
     val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
-    var waitForMorpho: Boolean = false
     var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap()
     var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap()
+
+    /** True iff [correspondingBaseZip] yields a path, i.e. this is `<name>.<suffix>.zip` and `<name>.zip` exists. */
+    fun String.hasCorrespondingBaseZip(): Boolean {
+        // The suffix pattern and the File.exists() check live in correspondingBaseZip() only.
+        return correspondingBaseZip() != null
+    }
+
+    /** Returns `<name>.zip` for a path shaped `<name>.<suffix>.zip` when that file exists, otherwise null. */
+    fun String.correspondingBaseZip(): String? {
+        if (!Regex(".*\\.([^/.]+)\\.zip$").matches(this)) return null
+        return this.replace(Regex("\\.([^/.]+)\\.zip$"), ".zip").takeIf { File(it).exists() }
+    }
+
     fun korapxml2conllu(args: Array<String>) {
         Executors.newFixedThreadPool(maxThreads)
 
@@ -223,14 +236,6 @@
         }
 
         var zips: Array<String> = args
-        if (args.size == 1 && args[0].matches(Regex(".*\\.([^/.]+)\\.zip$"))) {
-            val baseZip = args[0].replace(Regex("\\.([^/.]+)\\.zip$"), ".zip")
-            if (File(baseZip).exists()) {
-                zips = arrayOf(baseZip, zips[0])
-                LOGGER.info("Processing base zip file: $baseZip")
-            }
-        }
-        waitForMorpho = zips.size > 1
 
         if (maxThreads > 1) {
             LOGGER.info("Processing zip files in parallel with $maxThreads threads")
@@ -278,25 +283,51 @@
     }
 
     private fun processZipFile(zipFilePath: String, foundry: String = "base") {
-        ZipFile(zipFilePath).use { zipFile ->
-            zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
-                .parallel().forEach { zipEntry ->
-                    processZipEntry(zipFile, foundry, zipEntry)
+        if (zipFilePath.hasCorrespondingBaseZip()) { // path is <name>.<suffix>.zip and <name>.zip exists
+            val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!) // the given zip plus its base zip
+            Arrays.stream(zips).parallel().forEach { zip ->
+                ZipFile(zip).use { zipFile ->
+                    zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
+                        .parallel().forEach { zipEntry ->
+                            processZipEntry(zipFile, foundry, zipEntry, true) // true: processText is gated on morpho[docId] being present
+                        }
                 }
-        }
-    }
-    private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
-        ZipFile(zipFilePath).use { zipFile ->
-            zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
-                //.sorted({ o1, o2 -> o1.name.compareTo(o2.name) })
-                .forEachOrdered() { zipEntry ->
-                    processZipEntry(zipFile, foundry, zipEntry)
-                }
+            }
+        } else {
+            ZipFile(zipFilePath).use { zipFile ->
+                zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
+                    .parallel().forEach { zipEntry ->
+                        processZipEntry(zipFile, foundry, zipEntry, false) // false: no paired base zip, morpho is not awaited
+                    }
+            }
         }
     }
 
-    fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: java.util.zip.ZipEntry) {
+    /**
+     * Processes [zipFilePath] strictly in order on the calling thread.
+     *
+     * If a corresponding base zip exists it is read after the given zip and every entry is handled
+     * with passedWaitForMorpho = true; otherwise only the given zip is read, with false.
+     * No parallel streams are used here, so the work never fans out to the common fork-join pool.
+     */
+    private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
+        val baseZip = zipFilePath.correspondingBaseZip()
+        val waitForMorpho = baseZip != null
+        // Same order as in processZipFile: the given zip first, then its base zip (if any).
+        for (zip in listOfNotNull(zipFilePath, baseZip)) {
+            ZipFile(zip).use { zipFile ->
+                zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
+                    //.sorted({ o1, o2 -> o1.name.compareTo(o2.name) })
+                    .forEachOrdered { zipEntry ->
+                        processZipEntry(zipFile, foundry, zipEntry, waitForMorpho)
+                    }
+            }
+        }
+    }
+
+    fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {
         var foundry = _foundry
+        var waitForMorpho = passedWaitForMorpho
         LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
         if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().id)) {
             val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
@@ -371,7 +402,7 @@
                     && (!waitForMorpho || morpho[docId] != null)
                     && (extractMetadataRegex.isEmpty() || metadata[docId] != null)
                 ) {
-                    processText(docId, foundry, waitForMorpho)
+                    processText(docId, foundry)
                 }
             } else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
                 //LOGGER.info("Processing header file: " + zipEntry.name)
@@ -392,7 +423,7 @@
                     if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
                         && (!waitForMorpho || morpho[docId] != null)
                     ) {
-                        processText(docId, foundry, waitForMorpho)
+                        processText(docId, foundry)
                     }
                 }
             }
@@ -404,7 +435,6 @@
     private fun processText(
         docId: String,
         foundry: String,
-        waitForMorpho: Boolean,
     ) {
         var token_index = 0
         var real_token_index = 0
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index 65f4f6b..9bc4209 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -21,6 +21,7 @@
     val goeMarmot = loadResource("goe.marmot.zip").path
     val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
     val zca20scrambled = loadResource("zca20-scrambled.zip").path
+    val wdf19 = loadResource("wdf19.zip").path
 
     @Before
     fun setUpStreams() {
@@ -55,7 +56,7 @@
     }
     @Test
     fun canConvertWithMorphoAnnotations() {
-        val args = arrayOf(loadResource("goe.zip").path, loadResource("goe.tree_tagger.zip").path)
+        val args = arrayOf(loadResource("goe.tree_tagger.zip").path)
         debug(args)
         assertContains(
             outContent.toString(),
@@ -82,7 +83,7 @@
 
     @Test
     fun canConvertWfdWithMorphoAnnotations() {
-        val args = arrayOf(loadResource("wdf19.zip").path, loadResource("wdf19.tree_tagger.zip").path)
+        val args = arrayOf(loadResource("wdf19.tree_tagger.zip").path)
         debug(args)
         assertContains(
             outContent.toString(),
@@ -202,6 +203,20 @@
     }
 
     @Ignore("for some reason not working")
+    @Test
+    fun canConvertMultipleZips() {
+        val args = arrayOf(wdf19, goe)
+        debug(args)
+        assertContains(
+            outContent.toString(),
+            "6\tautomatique\t_\t_\t_\t_\t_\t_\t_\t_\n"
+        )
+        assertContains(
+            outContent.toString(),
+            "36\tGedanken\t_\t_\t_\t_\t_\t_\t_\t_\n"
+        )
+    }
+
     fun canConvertMorphoFeatureAnnotations() {
         val args = arrayOf(goe, goeMarmot)
         debug(args)