Process zips in parallel with one tool instance/thread
Change-Id: If135a716bfe28cfd2cea1ca9faeddb214fad2895
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 5886116..33448a7 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -38,7 +38,6 @@
class KorapXml2Conllu : Callable<Int> {
val COMPATIBILITY_MODE = System.getenv("COMPATIBILITY_MODE") != null
- val marmotBridge = null
@Parameters(arity = "1..*", description = ["At least one zip file name"])
var zipFileNames: Array<String>? = null
@@ -152,15 +151,12 @@
val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
var waitForMorpho: Boolean = false
- var annotationToolBridge: AnnotationToolBridge? = null
+ var annotationToolBridges: ConcurrentHashMap<Long, AnnotationToolBridge?> = ConcurrentHashMap()
fun korapxml2conllu(args: Array<String>) {
val executor: ExecutorService = Executors.newFixedThreadPool(threads)
if (annotateWith.isNotEmpty()) {
- if (annotateWith.contains(".jar")) {
- LOGGER.info("Annotating with jar file: $annotateWith")
- annotationToolBridge = AnnotationToolBridgeFactory.getAnnotationToolBridge(annotateWith, LOGGER)
- } else {
+ if (!annotateWith.contains(".jar")) {
annotationWorkerPool = AnnotationWorkerPool(annotateWith, threads, LOGGER)
}
}
@@ -237,12 +233,17 @@
) {
try {
ZipFile(zipFilePath).use { zipFile ->
- zipFile.stream().filter({ !it.name.contains("header.xml") })
+ zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
//.sorted({ o1, o2 -> o1.name.compareTo(o2.name) })
- .forEachOrdered { zipEntry ->
- LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
+ .parallel()
+ .forEach { zipEntry ->
+ LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
+ if (annotateWith.contains(".jar") && !annotationToolBridges.containsKey(Thread.currentThread().id)) {
+ annotationToolBridges[Thread.currentThread().id] =
+ AnnotationToolBridgeFactory.getAnnotationToolBridge(annotateWith, LOGGER)
+ }
- try {
+ try {
if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) {
val inputStream: InputStream = zipFile.getInputStream(zipEntry)
val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
@@ -251,13 +252,13 @@
dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
} catch (e: SAXParseException) {
LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message)
- return@forEachOrdered
+ return@forEach
}
doc.documentElement.normalize()
val docId: String = doc.documentElement.getAttribute("docid")
if (siglePattern != null && !Regex(siglePattern!!).containsMatchIn(docId)) {
- return@forEachOrdered
+ return@forEach
}
// LOGGER.info("Processing file: " + zipEntry.getName())
val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
@@ -373,8 +374,8 @@
output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
}
var previousSpanStart = 0
- if (annotationToolBridge != null) {
- morpho[docId] = annotationToolBridge!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
+ if (annotationToolBridges[Thread.currentThread().id] != null) {
+ morpho[docId] = annotationToolBridges[Thread.currentThread().id]!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
}
tokens[docId]?.forEach { span ->
token_index++