Initial import

Change-Id: Ifd0a927bc5c9fea0e675ccd730e11d501632eebb
diff --git a/app/.idea/.gitignore b/app/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/app/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/app/.idea/gradle.xml b/app/.idea/gradle.xml
new file mode 100644
index 0000000..038e045
--- /dev/null
+++ b/app/.idea/gradle.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="GradleSettings">
+    <option name="linkedExternalProjectsSettings">
+      <GradleProjectSettings>
+        <option name="externalProjectPath" value="$PROJECT_DIR$" />
+        <option name="gradleJvm" value="17" />
+        <option name="modules">
+          <set>
+            <option value="$PROJECT_DIR$" />
+          </set>
+        </option>
+      </GradleProjectSettings>
+    </option>
+  </component>
+</project>
\ No newline at end of file
diff --git a/app/.idea/misc.xml b/app/.idea/misc.xml
new file mode 100644
index 0000000..6ed36dd
--- /dev/null
+++ b/app/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ExternalStorageConfigurationManager" enabled="true" />
+</project>
\ No newline at end of file
diff --git a/app/.idea/vcs.xml b/app/.idea/vcs.xml
new file mode 100644
index 0000000..b2bdec2
--- /dev/null
+++ b/app/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/app/build.gradle b/app/build.gradle
new file mode 100644
index 0000000..5d9953d
--- /dev/null
+++ b/app/build.gradle
@@ -0,0 +1,72 @@
+import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+
+/*
+ * This file was generated by the Gradle 'init' task.
+ *
+ * This generated file contains a sample Kotlin application project to get you started.
+ * For more details take a look at the 'Building Java & JVM projects' chapter in the Gradle
+ * User Manual available at https://docs.gradle.org/7.4.2/userguide/building_java_projects.html
+ */
+
+
+plugins {
+    // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin.
+    id 'org.jetbrains.kotlin.jvm' version '1.9.22'
+
+    // Apply the application plugin (CLI application support) and the Shadow plugin (shadowJar fat-jar task).
+    id 'application'
+    id 'com.github.johnrengelman.shadow' version '7.1.2'
+}
+
+
+repositories {
+    // Use Maven Central for resolving dependencies.
+    mavenCentral()
+}
+
+dependencies {
+    // Align versions of all Kotlin components
+    implementation platform('org.jetbrains.kotlin:kotlin-bom')
+
+    // Use the Kotlin standard library.
+    implementation 'org.jetbrains.kotlin:kotlin-stdlib'
+
+    // This dependency is used by the application.
+    implementation 'com.google.guava:guava:33.0.0-jre'
+
+    // Use the Kotlin test library. It is declared once and without a version, so that it is aligned
+    // through the kotlin-bom platform above and cannot drift from the other Kotlin components.
+    testImplementation 'org.jetbrains.kotlin:kotlin-test'
+
+    // Use the Kotlin JUnit integration.
+    testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'
+}
+
+
+application {
+    // Define the main class for the application.
+    mainClass = 'de.ids_mannheim.korapxml2conllu.AppKt'
+}
+
+jar {
+    // Make the plain jar startable with `java -jar`. Nothing is bundled here: the manifest only names
+    // the entry point and lists the file names of the dependency jars, which must sit next to this jar.
+    // The list comes from runtimeClasspath, because that is what has to be present when the app runs.
+    // For a single self-contained jar use the Shadow plugin's shadowJar task instead.
+    manifest.attributes(
+            'Class-Path': configurations.runtimeClasspath.collect { it.getName() }.join(' '),
+            'Main-Class': "de.ids_mannheim.korapxml2conllu.AppKt"
+    )
+
+}
+
+
+configurations {
+    runtimeLib.extendsFrom implementation // NOTE(review): nothing visible in this build consumes runtimeLib — confirm
+}
+
+tasks.withType(KotlinCompile).configureEach {
+    kotlinOptions {
+        jvmTarget = '17' // same Java version as gradleJvm in .idea/gradle.xml
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt b/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt
new file mode 100644
index 0000000..e49b01a
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt
@@ -0,0 +1,184 @@
+package de.ids_mannheim.korapxml2conllu
+
+import javax.xml.parsers.DocumentBuilder
+import javax.xml.parsers.DocumentBuilderFactory
+import java.io.InputStream
+import java.util.Arrays
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.ExecutorService
+import java.util.concurrent.Executors
+import java.util.stream.IntStream
+import java.util.zip.ZipFile
+import org.w3c.dom.Document
+import org.w3c.dom.Element
+import org.w3c.dom.NodeList
+import org.xml.sax.InputSource
+import java.io.InputStreamReader
+import java.util.logging.Logger
+
+class App {
+    private val LOGGER: Logger = Logger.getLogger(App::class.java.name)
+
+    /**
+     * Converts every KorAP-XML zip archive named in [args], one task per archive on a fixed thread pool.
+     *
+     * A `null` [args] array is treated as empty and `null` entries are skipped. The call blocks until all
+     * submitted archives have been processed.
+     */
+    fun main(args: Array<String?>?) {
+        val executor: ExecutorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors())
+        val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+        val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
+        val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
+
+        try {
+            args.orEmpty().filterNotNull().forEach { zipFilePath ->
+                executor.submit { processZipFile(zipFilePath, texts, sentences, tokens) }
+            }
+        } finally {
+            executor.shutdown()
+        }
+        // Sleep until the pool has drained; polling isTerminated in an empty loop would burn a whole core.
+        while (!executor.awaitTermination(1, java.util.concurrent.TimeUnit.HOURS)) {
+            // Not finished yet: wait for another interval.
+        }
+    }
+
+    /** Name of the `tokens.xml` entry per document id; removing an entry claims the right to print that document. */
+    private val tokenFileNames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+
+    /**
+     * Parses the `data.xml`, `structure.xml` and `tokens.xml` entries of one zip archive in parallel and prints
+     * each document once its text, sentence spans and token spans are known. Exceptions are printed, not rethrown.
+     *
+     * @param zipFilePath path of the zip archive to read
+     * @param texts text content per document id, filled from `data.xml`
+     * @param sentences sentence spans per document id, filled from `structure.xml`
+     * @param tokens token spans per document id, filled from `tokens.xml`
+     */
+    private fun processZipFile(
+        zipFilePath: String,
+        texts: ConcurrentHashMap<String, String>,
+        sentences: ConcurrentHashMap<String, Array<Span>>,
+        tokens: ConcurrentHashMap<String, Array<Span>>
+    ) {
+        try {
+            ZipFile(zipFilePath).use { zipFile ->
+                zipFile.stream().parallel().forEach { zipEntry ->
+                    try {
+                        val fileName = zipEntry.name.substringAfterLast('/')
+                        if (fileName == "data.xml" || fileName == "structure.xml" || fileName == "tokens.xml") {
+                            val doc: Document = zipFile.getInputStream(zipEntry).use { input ->
+                                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                                    .parse(InputSource(InputStreamReader(input, "UTF-8")))
+                            }
+                            doc.documentElement.normalize()
+                            val docId: String = doc.documentElement.getAttribute("docid")
+                            when (fileName) {
+                                "data.xml" -> doc.getElementsByTagName("text").item(0)?.let { texts[docId] = it.textContent }
+                                "structure.xml" -> sentences[docId] = extractSentenceSpans(doc.getElementsByTagName("span"))
+                                "tokens.xml" -> {
+                                    // Publish the name first: whoever then sees the token spans also sees the name.
+                                    tokenFileNames[docId] = zipEntry.name
+                                    tokens[docId] = extractSpans(doc.getElementsByTagName("span"))
+                                }
+                            }
+                            printDocumentIfComplete(docId, texts, sentences, tokens)
+                        }
+                    } catch (e: Exception) {
+                        e.printStackTrace()
+                    }
+                }
+            }
+        } catch (e: Exception) {
+            e.printStackTrace()
+        }
+    }
+
+    /** Prints [docId] once all its parts are present; tokens starting after the last sentence stay in that sentence. */
+    private fun printDocumentIfComplete(
+        docId: String, texts: ConcurrentHashMap<String, String>,
+        sentences: ConcurrentHashMap<String, Array<Span>>, tokens: ConcurrentHashMap<String, Array<Span>>
+    ) {
+        val text = texts[docId] ?: return
+        val sentenceSpans = sentences[docId] ?: return
+        val tokenSpans = tokens[docId] ?: return
+        val tokensFileName = tokenFileNames.remove(docId) ?: return // another thread printed this document
+        if (sentenceSpans.isEmpty() || tokenSpans.isEmpty()) return System.err.println("No spans to print for $docId")
+        synchronized(System.out) {
+            println("# foundry = base")
+            println("# filename = $tokensFileName")
+            var sentenceIndex = 0
+            var tokenInSentence = 0
+            printTokenOffsetsInSentence(sentences, docId, sentenceIndex, 0, tokens)
+            tokenSpans.forEachIndexed { tokenIndex, span ->
+                tokenInSentence++
+                if (sentenceIndex < sentenceSpans.lastIndex && span.from >= sentenceSpans[sentenceIndex].to) {
+                    println()
+                    sentenceIndex++
+                    tokenInSentence = 1
+                    printTokenOffsetsInSentence(sentences, docId, sentenceIndex, tokenIndex, tokens)
+                }
+                println("$tokenInSentence\t${span.from}\t${span.to}\t${sentenceSpans[sentenceIndex].to}\t" + text.substring(span.from, span.to))
+            }
+        }
+    }
+
+    /**
+     * Prints the `# start_offsets` and `# end_offsets` comment lines for sentence [sentence_index] of [docId]:
+     * the start of token [token_index] resp. the sentence end, followed by the start resp. end offset of each
+     * token from [token_index] on that ends at or before the sentence end. Prints nothing if an index is out of range.
+     */
+    private fun printTokenOffsetsInSentence(
+        sentences: ConcurrentHashMap<String, Array<Span>>,
+        docId: String,
+        sentence_index: Int,
+        token_index: Int,
+        tokens: ConcurrentHashMap<String, Array<Span>>
+    ) {
+        val sentenceEndOffset = sentences[docId]?.getOrNull(sentence_index)?.to ?: return
+        val documentTokens = tokens[docId] ?: return
+        val firstTokenStart = documentTokens.getOrNull(token_index)?.from ?: return
+        val sentenceTokens = documentTokens.asList().subList(token_index, documentTokens.size).takeWhile { it.to <= sentenceEndOffset }
+        println("# start_offsets = $firstTokenStart" + sentenceTokens.joinToString("") { " ${it.from}" })
+        println("# end_offsets = $sentenceEndOffset" + sentenceTokens.joinToString("") { " ${it.to}" })
+    }
+
+    /**
+     * Converts the element nodes of [spans] into [Span] objects, reading each element's `from` and `to`
+     * attributes as integers. Non-element nodes are ignored; the order of the list is kept.
+     */
+    private fun extractSpans(spans: NodeList): Array<Span> {
+        val elements = (0 until spans.length).map { index -> spans.item(index) }.filterIsInstance<Element>()
+        return elements
+            .map { element ->
+                Span(element.getAttribute("from").toInt(), element.getAttribute("to").toInt())
+            }
+            .toTypedArray()
+    }
+
+    /**
+     * Returns the spans of [spans] that are sentences, i.e. elements whose first `f` descendant has the text `s`.
+     * Elements without any `f` descendant are skipped instead of failing with a NullPointerException.
+     */
+    private fun extractSentenceSpans(spans: NodeList): Array<Span> {
+        return (0 until spans.length)
+            .map { index -> spans.item(index) }
+            .filterIsInstance<Element>()
+            .filter { element -> element.getElementsByTagName("f").item(0)?.textContent == "s" }
+            .map { element -> Span(element.getAttribute("from").toInt(), element.getAttribute("to").toInt()) }
+            .toTypedArray()
+    }
+
+    /** Character offsets of one span; [from] and [to] are used as the start and end arguments of `substring`. */
+    internal class Span(var from: Int, var to: Int)
+
+
+}
+
+/** JVM entry point: passes the command-line arguments, each a zip file path, to [App.main]. */
+fun main(args: Array<String?>?) {
+    System.setProperty("file.encoding", "UTF-8") // NOTE(review): set after JVM start-up; confirm it has an effect
+    App().main(args) // returns Unit, so the result must not be printed (that emitted "kotlin.Unit")
+}
+
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxml2conllu/AppTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxml2conllu/AppTest.kt
new file mode 100644
index 0000000..2fe90b3
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxml2conllu/AppTest.kt
@@ -0,0 +1,32 @@
+package de.ids_mannheim.korapxml2conllu
+
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertNotNull
+
+/** Smoke test running [App] on the bundled `goe.zip` sample archive. */
+class AppTest {
+    fun loadResource(path: String): URL {
+        val resource = Thread.currentThread().contextClassLoader.getResource(path)
+        requireNotNull(resource) { "Resource $path not found" }
+        return resource
+    }
+
+    /**
+     * `App.main` returns `Unit`, so asserting on its result can never fail; the test therefore captures
+     * standard output and checks that at least one document header was printed.
+     */
+    @Test fun appHasAGreeting() {
+        val captured = java.io.ByteArrayOutputStream()
+        val originalOut = System.out
+        System.setOut(java.io.PrintStream(captured, true, "UTF-8"))
+        try {
+            // File(URI) decodes percent-escapes that URL.path would leave in place (e.g. spaces in the path).
+            App().main(arrayOf<String?>(java.io.File(loadResource("goe.zip").toURI()).path))
+        } finally {
+            System.setOut(originalOut)
+        }
+        val header = captured.toString("UTF-8").lineSequence().firstOrNull { it == "# foundry = base" }
+        assertNotNull(header, "app should print at least one converted document")
+    }
+}
diff --git a/app/src/test/resources/goe.zip b/app/src/test/resources/goe.zip
new file mode 100644
index 0000000..db44e94
--- /dev/null
+++ b/app/src/test/resources/goe.zip
Binary files differ