Code cleanup

Change-Id: Ie2de06f1954167012fa6ed5d71c9d6893673c83e

commit: fb9d842e482c5a4aba5633a9f41f731b5baa4ade [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 02 17:45:37 2024 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 02 17:45:37 2024 +0100
tree: fe0a767eee6e5ad0cdde4662a9797a2e2b0791db
parent: 135e02c183bb87a4346907b705000173fd9e655b [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 58f1cb5..cc36537 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt

@@ -1,27 +1,25 @@
 package de.ids_mannheim.korapxmltools
 
-import javax.xml.parsers.DocumentBuilder
-import javax.xml.parsers.DocumentBuilderFactory
-import java.io.InputStream
-import java.util.concurrent.ConcurrentHashMap
-import java.util.concurrent.ExecutorService
-import java.util.concurrent.Executors
-import java.util.stream.IntStream
-import java.util.zip.ZipFile
 import org.w3c.dom.Document
 import org.w3c.dom.Element
 import org.w3c.dom.NodeList
 import org.xml.sax.InputSource
 import picocli.CommandLine
-import picocli.CommandLine.Parameters
-import picocli.CommandLine.Option
-import picocli.CommandLine.Command
+import picocli.CommandLine.*
 import java.io.File
+import java.io.InputStream
 import java.io.InputStreamReader
 import java.util.*
 import java.util.concurrent.Callable
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.ExecutorService
+import java.util.concurrent.Executors
 import java.util.logging.Level
 import java.util.logging.Logger
+import java.util.stream.IntStream
+import java.util.zip.ZipFile
+import javax.xml.parsers.DocumentBuilder
+import javax.xml.parsers.DocumentBuilderFactory
 import kotlin.system.exitProcess
 
 @Command(
@@ -36,32 +34,48 @@
     @Parameters(arity = "1..*", description = ["At least one zip file name"])
     var zipFileNames: Array<String>? = null
 
-    @Option(names = ["--sigle-pattern", "-p"], paramLabel = "PATTERN",
-        description = ["Extract only documents with sigle matching the pattern (regex)"])
+    @Option(
+        names = ["--sigle-pattern", "-p"],
+        paramLabel = "PATTERN",
+        description = ["Extract only documents with sigle matching the pattern (regex)"]
+    )
     var siglePattern: String? = null
 
-    @Option(names = ["--extract-attributes-regex", "-e"], paramLabel = "REGEX",
-        description = ["Not yet implemented: extract attributes regex"])
+    @Option(
+        names = ["--extract-attributes-regex", "-e"],
+        paramLabel = "REGEX",
+        description = ["Not yet implemented: extract attributes regex"]
+    )
     var extractAttributesRegex: String = ""
 
-    @Option(names = ["--s-bounds-from-morpho"],
-        description = ["Not yet implemented: s bounds from morpho"])
+    @Option(
+        names = ["--s-bounds-from-morpho"], description = ["Not yet implemented: s bounds from morpho"]
+    )
     var sBoundsFromMorpho: Boolean = false
 
-    @Option(names = ["--log", "-l"], paramLabel = "LEVEL",
-        description = ["Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${"$"}{DEFAULT-VALUE}])"])
+    @Option(
+        names = ["--log", "-l"],
+        paramLabel = "LEVEL",
+        description = ["Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${"$"}{DEFAULT-VALUE}])"]
+    )
     var logLevel: String = "WARNING"
 
-    @Option(names = ["--columns", "-c"], paramLabel = "NUMBER",
-        description = ["Not yet implemented: columns"])
+    @Option(
+        names = ["--columns", "-c"], paramLabel = "NUMBER", description = ["Not yet implemented: columns"]
+    )
     var columns: Int = 10
 
-    @Option(names = ["--word2vec", "-w"], description = ["Print text in LM training format: tokens " +
-            "separated by space, sentences separated by newline"])
+    @Option(
+        names = ["--word2vec", "-w"],
+        description = ["Print text in LM training format: tokens separated by space, sentences separated by newline"]
+    )
     var lmTrainingData: Boolean = false
 
-    @Option(names = ["--token-separator", "-s"], paramLabel = "SEPARATOR",
-        description = ["Not yet implemented: token separator"])
+    @Option(
+        names = ["--token-separator", "-s"],
+        paramLabel = "SEPARATOR",
+        description = ["Not yet implemented: token separator"]
+    )
     var tokenSeparator: String = "\n"
 
     @Option(names = ["--offsets"], description = ["Not yet implemented: offsets"])
@@ -70,8 +84,11 @@
     @Option(names = ["--comments"], description = ["Not yet implemented: comments"])
     var comments: Boolean = false
 
-    @Option(names = ["--extract-metadata-regex", "-m"], paramLabel = "REGEX",
-        description = ["Not yet implemented: extract metadata regex"])
+    @Option(
+        names = ["--extract-metadata-regex", "-m"],
+        paramLabel = "REGEX",
+        description = ["Not yet implemented: extract metadata regex"]
+    )
     var extractMetadataRegex: MutableList<String> = mutableListOf()
 
     override fun call(): Int {
@@ -87,6 +104,7 @@
         korapxml2conllu(zipFileNames!!)
         return 0
     }
+
     private val LOGGER: Logger = Logger.getLogger(KorapXml2Conllu::class.java.name)
 
     fun korapxml2conllu(args: Array<String>) {
@@ -98,10 +116,10 @@
         val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
 
         if (args.isEmpty()) {
-                LOGGER.severe("Usage: KorapXml2Conllu <zipfile1> [<zipfile2> ...]")
-                return
+            LOGGER.severe("Usage: KorapXml2Conllu <zipfile1> [<zipfile2> ...]")
+            return
         }
-        var zips:Array<String> = args
+        var zips: Array<String> = args
         if (args.size == 1 && args[0].matches(Regex(".*\\.([^/.]+)\\.zip$"))) {
             val baseZip = args[0].replace(Regex("\\.([^/.]+)\\.zip$"), ".zip")
             if (File(baseZip).exists()) {
@@ -175,8 +193,7 @@
                                 return@forEach
                             }
                             // LOGGER.info("Processing file: " + zipEntry.getName())
-                            val fileName =
-                                zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
+                            val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
                             var token_index = 0
                             var real_token_index = 0
                             var sentence_index = 0
@@ -204,10 +221,8 @@
                                     morpho[docId] = extractMorphoSpans(fsSpans)
                                 }
                             }
-                            if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
-                                && (!waitForMorpho || morpho[docId] != null)
-                            ) {
-                                val output : StringBuilder
+                            if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null && (!waitForMorpho || morpho[docId] != null)) {
+                                val output: StringBuilder
                                 if (lmTrainingData) {
                                     output = StringBuilder()
 
@@ -217,21 +232,16 @@
                                             output.append("\n")
                                             sentence_index++
                                         }
-                                        output.append(texts[docId]!!.substring(span.from, span.to) + " ")
+                                        output.append(texts[docId]!!.substring(span.from, span.to), " ")
                                         real_token_index++
                                     }
                                 } else {
                                     output =
-                                        StringBuilder("# foundry = $foundry\n# filename = ${fname[docId]}\n# text_id = $docId\n")
-                                    output.append(
-                                        tokenOffsetsInSentence(
-                                            sentences,
-                                            docId,
-                                            sentence_index,
-                                            real_token_index,
-                                            tokens
-                                        )
-                                    )
+                                        StringBuilder("# foundry = $foundry\n# filename = ${fname[docId]}\n# text_id = $docId\n").append(
+                                                tokenOffsetsInSentence(
+                                                    sentences, docId, sentence_index, real_token_index, tokens
+                                                )
+                                            )
                                     tokens[docId]?.forEach { span ->
                                         token_index++
                                         if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -240,11 +250,7 @@
                                             token_index = 1
                                             output.append(
                                                 tokenOffsetsInSentence(
-                                                    sentences,
-                                                    docId,
-                                                    sentence_index,
-                                                    real_token_index,
-                                                    tokens
+                                                    sentences, docId, sentence_index, real_token_index, tokens
                                                 )
                                             )
                                         }
@@ -272,7 +278,6 @@
                                             )
                                         }
                                         real_token_index++
-
                                     }
                                 }
                                 synchronized(System.out) {
@@ -307,8 +312,8 @@
         deprel: String = "_",
         deps: String = "_",
         misc: String = "_"
-    ) : String {
-        return("$token_index\t$token\t$lemma\t$upos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc\n")
+    ): String {
+        return ("$token_index\t$token\t$lemma\t$upos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc\n")
     }
 
     private fun tokenOffsetsInSentence(
@@ -317,7 +322,7 @@
         sentence_index: Int,
         token_index: Int,
         tokens: ConcurrentHashMap<String, Array<Span>>
-    ) : String{
+    ): String {
         val sentenceEndOffset: Int
         sentenceEndOffset = if (sentences[docId] == null) {
             -1
@@ -327,42 +332,35 @@
         var i = token_index
         val start_offsets_string = StringBuilder()
         val end_offsets_string = StringBuilder()
-        val output = StringBuilder()
-        while (tokens[docId]!=null && i < tokens[docId]!!.size && tokens[docId]!![i].to <= sentenceEndOffset) {
+        while (tokens[docId] != null && i < tokens[docId]!!.size && tokens[docId]!![i].to <= sentenceEndOffset) {
             start_offsets_string.append(" ", tokens[docId]!![i].from)
             end_offsets_string.append(" ", tokens[docId]!![i].to)
             i++
         }
-        output.append("# start_offsets = ", tokens[docId]!![token_index].from, start_offsets_string, "\n")
-        output.append("# end_offsets = ", sentenceEndOffset, end_offsets_string, "\n")
-        return output.toString()
+        return (
+                StringBuilder() .append(
+                    "# start_offsets = ", tokens[docId]!![token_index].from, start_offsets_string, "\n",
+                    "# end_offsets = ", sentenceEndOffset, end_offsets_string, "\n"
+                ).toString())
     }
 
     private fun extractSpans(spans: NodeList): Array<Span> {
-        return IntStream.range(0, spans.length)
-            .mapToObj(spans::item)
-            .filter { node -> node is Element }
-            .map { node ->
+        return IntStream.range(0, spans.length).mapToObj(spans::item).filter { node -> node is Element }.map { node ->
                 Span(
-                    Integer.parseInt((node as Element).getAttribute("from")),
-                    Integer.parseInt(node.getAttribute("to"))
+                    Integer.parseInt((node as Element).getAttribute("from")), Integer.parseInt(node.getAttribute("to"))
                 )
-            }
-            .toArray { size -> arrayOfNulls(size) }
+            }.toArray { size -> arrayOfNulls(size) }
     }
 
     private fun extractMorphoSpans(
         fsSpans: NodeList
     ): MutableMap<String, MorphoSpan> {
         val res: MutableMap<String, MorphoSpan> = HashMap()
-        IntStream.range(0, fsSpans.length)
-            .mapToObj(fsSpans::item)
-            .forEach { node ->
+        IntStream.range(0, fsSpans.length).mapToObj(fsSpans::item).forEach { node ->
                 val features = (node as Element).getElementsByTagName("f")
                 val fs = MorphoSpan()
-                val fromTo = node.getAttribute("from") + "-" + node.getAttribute("to")
-                IntStream.range(0, features.length).mapToObj(features::item)
-                    .forEach { feature ->
+                val fromTo = "${node.getAttribute("from")}-${node.getAttribute("to")}"
+                IntStream.range(0, features.length).mapToObj(features::item).forEach { feature ->
                         val attr = (feature as Element).getAttribute("name")
                         val value = feature.textContent
                         when (attr) {
@@ -379,16 +377,13 @@
     }
 
     private fun extractSentenceSpans(spans: NodeList): Array<Span> {
-        return IntStream.range(0, spans.length)
-            .mapToObj(spans::item)
+        return IntStream.range(0, spans.length).mapToObj(spans::item)
             .filter { node -> node is Element && node.getElementsByTagName("f").item(0).textContent.equals("s") }
             .map { node ->
                 Span(
-                    Integer.parseInt((node as Element).getAttribute("from")),
-                    Integer.parseInt(node.getAttribute("to"))
+                    Integer.parseInt((node as Element).getAttribute("from")), Integer.parseInt(node.getAttribute("to"))
                 )
-            }
-            .toArray { size -> arrayOfNulls(size) }
+            }.toArray { size -> arrayOfNulls(size) }
     }
 
 
@@ -407,8 +402,8 @@
 
 }
 
-fun main(args: Array<String>) : Unit = exitProcess(CommandLine(KorapXml2Conllu()).execute(*args))
+fun main(args: Array<String>): Unit = exitProcess(CommandLine(KorapXml2Conllu()).execute(*args))
 
 fun debug(args: Array<String>): Int {
-    return(CommandLine(KorapXml2Conllu()).execute(*args))
+    return (CommandLine(KorapXml2Conllu()).execute(*args))
 }
commit	fb9d842e482c5a4aba5633a9f41f731b5baa4ade	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 02 17:45:37 2024 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 02 17:45:37 2024 +0100
tree	fe0a767eee6e5ad0cdde4662a9797a2e2b0791db
parent	135e02c183bb87a4346907b705000173fd9e655b [diff]