Extract processZipEntry
Change-Id: I73ae3d703536bd8174f17acb80d934ed9985cb5f
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index cdf7c9b..11b2061 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -268,98 +268,98 @@
foundry: String = "base",
) {
- try {
ZipFile(zipFilePath).use { zipFile ->
zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
//.sorted({ o1, o2 -> o1.name.compareTo(o2.name) })
.parallel()
.forEach { zipEntry ->
- LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
- if (taggerName != null && !annotationToolBridges.containsKey(Thread.currentThread().id)) {
- annotationToolBridges[Thread.currentThread().id] =
- AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER)
- }
-
- try {
- if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) {
- val inputStream: InputStream = zipFile.getInputStream(zipEntry)
- val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
- val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
- val doc: Document = try {
- dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
- } catch (e: SAXParseException) {
- LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message)
- return@forEach
- }
-
- doc.documentElement.normalize()
- val docId: String = doc.documentElement.getAttribute("docid")
- if (siglePattern != null && !Regex(siglePattern!!).containsMatchIn(docId)) {
- return@forEach
- }
- // LOGGER.info("Processing file: " + zipEntry.getName())
- val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
- when (fileName) {
- "data.xml" -> {
- val textsList: NodeList = doc.getElementsByTagName("text")
- if (textsList.length > 0) {
- texts[docId] = textsList.item(0).textContent
- }
- }
-
- "structure.xml" -> {
- val spans: NodeList = doc.getElementsByTagName("span")
- if (extractAttributesRegex.isNotEmpty())
- extraFeatures[docId] = extractMiscSpans(spans)
- sentences[docId] = extractSentenceSpans(spans)
-
- }
-
- "tokens.xml" -> {
- if (!fnames.contains(docId)) {
- fnames[docId] = zipEntry.name
- }
- val tokenSpans: NodeList = doc.getElementsByTagName("span")
- tokens[docId] = extractSpans(tokenSpans)
- }
-
- "morpho.xml" -> {
- waitForMorpho = true
- fnames[docId] = zipEntry.name
- val fsSpans: NodeList = doc.getElementsByTagName("span")
- morpho[docId] = extractMorphoSpans(fsSpans)
- tokens[docId] = extractSpans(fsSpans)
- }
- }
-
- if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
- && (!waitForMorpho || morpho[docId] != null)
- && (extractMetadataRegex.isEmpty() || metadata.containsKey(docId))
- ) {
- processText(docId, foundry, waitForMorpho)
-
- }
- } else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
- //LOGGER.info("Processing header file: " + zipEntry.name)
- val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }
- val docId =
- Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1()
- ?.replace(Regex("/"), "_")
- LOGGER.info("Processing header file: " + zipEntry.name + " docId: " + docId)
- val meta = ArrayList<String>()
- extractMetadataRegex.forEach { regex ->
- val match = Regex(regex).find(text)
- if (match != null) {
- meta.add(match.destructured.component1())
- }
- }
- if (meta.isNotEmpty() && docId != null) {
- metadata[docId] = meta.toTypedArray()
- }
- }
- } catch (e: Exception) {
- e.printStackTrace()
+ processZipEntry(zipFile, foundry, zipEntry)
}
+ }
+ }
+
+ fun processZipEntry(zipFile: ZipFile, foundry: String, zipEntry: java.util.zip.ZipEntry) {
+ LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
+ if (taggerName != null && !annotationToolBridges.containsKey(Thread.currentThread().id)) {
+ annotationToolBridges[Thread.currentThread().id] =
+ AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER)
+ }
+
+ try {
+ if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) {
+ val inputStream: InputStream = zipFile.getInputStream(zipEntry)
+ val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
+ val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
+ val doc: Document = try {
+ dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
+ } catch (e: SAXParseException) {
+ LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message)
+ return
+ }
+
+ doc.documentElement.normalize()
+ val docId: String = doc.documentElement.getAttribute("docid")
+ if (siglePattern != null && !Regex(siglePattern!!).containsMatchIn(docId)) {
+ return
+ }
+ // LOGGER.info("Processing file: " + zipEntry.getName())
+ val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
+ when (fileName) {
+ "data.xml" -> {
+ val textsList: NodeList = doc.getElementsByTagName("text")
+ if (textsList.length > 0) {
+ texts[docId] = textsList.item(0).textContent
+ }
+ }
+
+ "structure.xml" -> {
+ val spans: NodeList = doc.getElementsByTagName("span")
+ if (extractAttributesRegex.isNotEmpty())
+ extraFeatures[docId] = extractMiscSpans(spans)
+ sentences[docId] = extractSentenceSpans(spans)
+
+ }
+
+ "tokens.xml" -> {
+ if (!fnames.contains(docId)) {
+ fnames[docId] = zipEntry.name
+ }
+ val tokenSpans: NodeList = doc.getElementsByTagName("span")
+ tokens[docId] = extractSpans(tokenSpans)
+ }
+
+ "morpho.xml" -> {
+ waitForMorpho = true
+ fnames[docId] = zipEntry.name
+ val fsSpans: NodeList = doc.getElementsByTagName("span")
+ morpho[docId] = extractMorphoSpans(fsSpans)
+ tokens[docId] = extractSpans(fsSpans)
+ }
+ }
+
+ if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
+ && (!waitForMorpho || morpho[docId] != null)
+ && (extractMetadataRegex.isEmpty() || metadata.containsKey(docId))
+ ) {
+ processText(docId, foundry, waitForMorpho)
+
+ }
+ } else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
+ //LOGGER.info("Processing header file: " + zipEntry.name)
+ val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }
+ val docId =
+ Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1()
+ ?.replace(Regex("/"), "_")
+ LOGGER.info("Processing header file: " + zipEntry.name + " docId: " + docId)
+ val meta = ArrayList<String>()
+ extractMetadataRegex.forEach { regex ->
+ val match = Regex(regex).find(text)
+ if (match != null) {
+ meta.add(match.destructured.component1())
+ }
+ }
+ if (meta.isNotEmpty() && docId != null) {
+ metadata[docId] = meta.toTypedArray()
}
}
} catch (e: Exception) {