Allow for multiple zips as arguments
Change-Id: Ic314380e43ff852c235621932e965f879d3387d6
diff --git a/app/build.gradle b/app/build.gradle
index d9c81a4..693a7be 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -22,6 +22,12 @@
maven { url 'https://jitpack.io' }
}
+test {
+ minHeapSize = '1000m' // heap the test JVM starts with
+ maxHeapSize = '8000m' // upper bound for the test JVM heap
+ jvmArgs('-XX:MaxMetaspaceSize=8000m') // extra memory flag passed to the test JVM
+}
+
dependencies {
// Align versions of all Kotlin components
implementation platform('org.jetbrains.kotlin:kotlin-bom')
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 28d0fd2..01ebd58 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -22,6 +22,7 @@
import java.util.regex.Matcher
import java.util.regex.Pattern
import java.util.stream.IntStream
+import java.util.zip.ZipEntry
import java.util.zip.ZipFile
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory
@@ -212,9 +213,21 @@
val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
- var waitForMorpho: Boolean = false
var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap()
var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap()
+
+ /** True when this path is named like `<base>.<suffix>.zip` and the sibling `<base>.zip` exists on disk. */
+ fun String.hasCorrespondingBaseZip(): Boolean {
+ val stripped = Regex("\\.([^/.]+)\\.zip$").replace(this, ".zip")
+ return this.matches(Regex(".*\\.([^/.]+)\\.zip$")) && File(stripped).exists()
+ }
+
+ /** Path of the `<base>.zip` sibling of a `<base>.<suffix>.zip` path, or null when the name has no such suffix or that file does not exist. */
+ fun String.correspondingBaseZip(): String? {
+ val candidate = Regex("\\.([^/.]+)\\.zip$").replace(this, ".zip")
+ return candidate.takeIf { this.matches(Regex(".*\\.([^/.]+)\\.zip$")) && File(it).exists() }
+ }
+
fun korapxml2conllu(args: Array<String>) {
Executors.newFixedThreadPool(maxThreads)
@@ -223,14 +236,6 @@
}
var zips: Array<String> = args
- if (args.size == 1 && args[0].matches(Regex(".*\\.([^/.]+)\\.zip$"))) {
- val baseZip = args[0].replace(Regex("\\.([^/.]+)\\.zip$"), ".zip")
- if (File(baseZip).exists()) {
- zips = arrayOf(baseZip, zips[0])
- LOGGER.info("Processing base zip file: $baseZip")
- }
- }
- waitForMorpho = zips.size > 1
if (maxThreads > 1) {
LOGGER.info("Processing zip files in parallel with $maxThreads threads")
@@ -278,25 +283,51 @@
}
private fun processZipFile(zipFilePath: String, foundry: String = "base") {
- ZipFile(zipFilePath).use { zipFile ->
- zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
- .parallel().forEach { zipEntry ->
- processZipEntry(zipFile, foundry, zipEntry)
+ if (zipFilePath.hasCorrespondingBaseZip()) {
+ val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)
+ Arrays.stream(zips).parallel().forEach { zip ->
+ ZipFile(zip).use { zipFile ->
+ zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
+ .parallel().forEach { zipEntry ->
+ processZipEntry(zipFile, foundry, zipEntry, true)
+ }
}
- }
- }
- private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
- ZipFile(zipFilePath).use { zipFile ->
- zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
- //.sorted({ o1, o2 -> o1.name.compareTo(o2.name) })
- .forEachOrdered() { zipEntry ->
- processZipEntry(zipFile, foundry, zipEntry)
- }
+ }
+ } else {
+ ZipFile(zipFilePath).use { zipFile ->
+ zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
+ .parallel().forEach { zipEntry ->
+ processZipEntry(zipFile, foundry, zipEntry, false)
+ }
+ }
}
}
- fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: java.util.zip.ZipEntry) {
+ /**
+ * Processes [zipFilePath] and, when one exists, its corresponding base zip on the calling
+ * thread, visiting the entries of each archive in stream order.
+ *
+ * When a base zip is found, entries are handed to [processZipEntry] with
+ * `passedWaitForMorpho = true`; otherwise with `false`. No parallel streams are used here,
+ * unlike in [processZipFile].
+ */
+ private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
+ val baseZip = zipFilePath.correspondingBaseZip()
+ val waitForMorpho = baseZip != null
+ // Same archive order as before: the given zip first, then its base zip.
+ for (zip in listOfNotNull(zipFilePath, baseZip)) {
+ ZipFile(zip).use { zipFile ->
+ zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
+ .forEachOrdered { zipEntry ->
+ processZipEntry(zipFile, foundry, zipEntry, waitForMorpho)
+ }
+ }
+ }
+ }
+
+ fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {
var foundry = _foundry
+ var waitForMorpho = passedWaitForMorpho
LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().id)) {
val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
@@ -371,7 +402,7 @@
&& (!waitForMorpho || morpho[docId] != null)
&& (extractMetadataRegex.isEmpty() || metadata[docId] != null)
) {
- processText(docId, foundry, waitForMorpho)
+ processText(docId, foundry)
}
} else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
//LOGGER.info("Processing header file: " + zipEntry.name)
@@ -392,7 +423,7 @@
if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
&& (!waitForMorpho || morpho[docId] != null)
) {
- processText(docId, foundry, waitForMorpho)
+ processText(docId, foundry)
}
}
}
@@ -404,7 +435,6 @@
private fun processText(
docId: String,
foundry: String,
- waitForMorpho: Boolean,
) {
var token_index = 0
var real_token_index = 0
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index 65f4f6b..9bc4209 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -21,6 +21,7 @@
val goeMarmot = loadResource("goe.marmot.zip").path
val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
val zca20scrambled = loadResource("zca20-scrambled.zip").path
+ val wdf19 = loadResource("wdf19.zip").path
@Before
fun setUpStreams() {
@@ -55,7 +56,7 @@
}
@Test
fun canConvertWithMorphoAnnotations() {
- val args = arrayOf(loadResource("goe.zip").path, loadResource("goe.tree_tagger.zip").path)
+ val args = arrayOf(loadResource("goe.tree_tagger.zip").path)
debug(args)
assertContains(
outContent.toString(),
@@ -82,7 +83,7 @@
@Test
fun canConvertWfdWithMorphoAnnotations() {
- val args = arrayOf(loadResource("wdf19.zip").path, loadResource("wdf19.tree_tagger.zip").path)
+ val args = arrayOf(loadResource("wdf19.tree_tagger.zip").path)
debug(args)
assertContains(
outContent.toString(),
@@ -202,6 +203,20 @@
}
+ @Test // inserted above the @Ignore below so that annotation stays attached to canConvertMorphoFeatureAnnotations
+ fun canConvertMultipleZips() {
+ val args = arrayOf(wdf19, goe)
+ debug(args)
+ assertContains(
+ outContent.toString(),
+ "6\tautomatique\t_\t_\t_\t_\t_\t_\t_\t_\n"
+ )
+ assertContains(
+ outContent.toString(),
+ "36\tGedanken\t_\t_\t_\t_\t_\t_\t_\t_\n"
+ )
+ }
+
@Ignore("for some reason not working")
fun canConvertMorphoFeatureAnnotations() {
val args = arrayOf(goe, goeMarmot)
debug(args)