Initial import
Change-Id: Ifd0a927bc5c9fea0e675ccd730e11d501632eebb
diff --git a/app/.idea/.gitignore b/app/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/app/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/app/.idea/gradle.xml b/app/.idea/gradle.xml
new file mode 100644
index 0000000..038e045
--- /dev/null
+++ b/app/.idea/gradle.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="GradleSettings">
+ <option name="linkedExternalProjectsSettings">
+ <GradleProjectSettings>
+ <option name="externalProjectPath" value="$PROJECT_DIR$" />
+ <option name="gradleJvm" value="17" />
+ <option name="modules">
+ <set>
+ <option value="$PROJECT_DIR$" />
+ </set>
+ </option>
+ </GradleProjectSettings>
+ </option>
+ </component>
+</project>
\ No newline at end of file
diff --git a/app/.idea/misc.xml b/app/.idea/misc.xml
new file mode 100644
index 0000000..6ed36dd
--- /dev/null
+++ b/app/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="ExternalStorageConfigurationManager" enabled="true" />
+</project>
\ No newline at end of file
diff --git a/app/.idea/vcs.xml b/app/.idea/vcs.xml
new file mode 100644
index 0000000..b2bdec2
--- /dev/null
+++ b/app/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="VcsDirectoryMappings">
+ <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
+ </component>
+</project>
\ No newline at end of file
diff --git a/app/build.gradle b/app/build.gradle
new file mode 100644
index 0000000..5d9953d
--- /dev/null
+++ b/app/build.gradle
@@ -0,0 +1,69 @@
+import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+
+/*
+ * This file was generated by the Gradle 'init' task.
+ *
+ * This generated file contains a sample Kotlin application project to get you started.
+ * For more details take a look at the 'Building Java & JVM projects' chapter in the Gradle
+ * User Manual available at https://docs.gradle.org/7.4.2/userguide/building_java_projects.html
+ */
+
+
+plugins {
+    // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin.
+    id 'org.jetbrains.kotlin.jvm' version '1.9.22'
+
+    // Apply the application plugin to add support for building a CLI application in Java.
+    id 'application'
+    id 'com.github.johnrengelman.shadow' version '7.1.2'
+}
+
+
+repositories {
+    // Use Maven Central for resolving dependencies.
+    mavenCentral()
+}
+
+dependencies {
+    // Align versions of all Kotlin components
+    implementation platform('org.jetbrains.kotlin:kotlin-bom')
+
+    // Use the Kotlin standard library.
+    implementation 'org.jetbrains.kotlin:kotlin-stdlib'
+
+    // This dependency is used by the application.
+    implementation 'com.google.guava:guava:33.0.0-jre'
+
+    // Use the Kotlin test library.
+    testImplementation 'org.jetbrains.kotlin:kotlin-test'
+
+    // Use the Kotlin JUnit integration.
+    testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'
+    testImplementation "org.jetbrains.kotlin:kotlin-test:1.9.22"
+}
+
+
+application {
+    // Define the main class for the application.
+    mainClass = 'de.ids_mannheim.korapxml2conllu.AppKt'
+}
+
+jar {
+    // The dependencies are only referenced through the manifest Class-Path, they are not copied
+    // into this jar. The entries are the file names of the runtime class path.
+    manifest.attributes(
+            'Class-Path': configurations.runtimeClasspath.collect { it.getName() }.join(' '),
+            'Main-Class': "de.ids_mannheim.korapxml2conllu.AppKt"
+    )
+}
+
+
+configurations {
+    runtimeLib.extendsFrom implementation
+}
+
+tasks.withType(KotlinCompile).configureEach {
+    kotlinOptions {
+        jvmTarget = '17'
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt b/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt
new file mode 100644
index 0000000..e49b01a
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt
@@ -0,0 +1,267 @@
+package de.ids_mannheim.korapxml2conllu
+
+import javax.xml.parsers.DocumentBuilder
+import javax.xml.parsers.DocumentBuilderFactory
+import java.io.InputStream
+import java.util.Arrays
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.ExecutorService
+import java.util.concurrent.Executors
+import java.util.concurrent.TimeUnit
+import java.util.stream.IntStream
+import java.util.zip.ZipFile
+import org.w3c.dom.Document
+import org.w3c.dom.Element
+import org.w3c.dom.NodeList
+import org.xml.sax.InputSource
+import java.io.InputStreamReader
+import java.util.logging.Logger
+
+/**
+ * Converts zip archives containing `data.xml`, `structure.xml` and `tokens.xml` files into a
+ * tab-separated token listing on standard output.
+ *
+ * Files are matched to each other through the `docid` attribute of their root element. Once the
+ * text, the sentence spans and the token spans of a document are all known, the document is
+ * printed sentence by sentence.
+ */
+class App {
+    private val LOGGER: Logger = Logger.getLogger(App::class.java.name)
+
+    // Compiled once instead of once per zip entry.
+    private val relevantEntryPattern = Regex(".*(data|tokens|structure)\\.xml$")
+    private val entryDirectoryPattern = Regex(".*?/((data|tokens|structure)\\.xml)$")
+
+    // Archive path of each document's tokens.xml, needed for the "# filename" header.
+    private val tokenFileNames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+
+    // Ids of the documents that have already been printed, so that none is printed twice.
+    private val printedDocIds: MutableSet<String> = ConcurrentHashMap.newKeySet<String>()
+
+    /**
+     * Processes every zip archive named in [args] on a thread pool and returns once all of them
+     * have been handled.
+     *
+     * @param args paths of the zip archives to convert; `null` is treated like an empty array
+     */
+    fun main(args: Array<String?>?) {
+        val executor: ExecutorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors())
+        val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+        val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
+        val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
+
+        Arrays.stream(args ?: emptyArray<String?>()).forEach { zipFilePath ->
+            executor.submit {
+                processZipFile(
+                    zipFilePath ?: "",
+                    texts,
+                    sentences,
+                    tokens
+                )
+            }
+        }
+
+        executor.shutdown()
+        try {
+            // Block until all tasks have finished instead of spinning on isTerminated.
+            executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS)
+        } catch (e: InterruptedException) {
+            // Stop the remaining work and keep the interrupt visible to the caller.
+            executor.shutdownNow()
+            Thread.currentThread().interrupt()
+        }
+
+        // Further processing as needed
+    }
+
+    /**
+     * Reads the `data.xml`, `structure.xml` and `tokens.xml` entries of one zip archive, stores
+     * what they contain in the given maps and prints each document as soon as it is complete.
+     * Errors are reported on standard error and do not stop the processing of other entries.
+     *
+     * @param zipFilePath path of the zip archive to read
+     * @param texts text content per document id, filled from `data.xml`
+     * @param sentences sentence spans per document id, filled from `structure.xml`
+     * @param tokens token spans per document id, filled from `tokens.xml`
+     */
+    private fun processZipFile(
+        zipFilePath: String,
+        texts: ConcurrentHashMap<String, String>,
+        sentences: ConcurrentHashMap<String, Array<Span>>,
+        tokens: ConcurrentHashMap<String, Array<Span>>
+    ) {
+        try {
+            ZipFile(zipFilePath).use { zipFile ->
+                zipFile.stream().parallel().forEach { zipEntry ->
+                    try {
+                        if (zipEntry.name.matches(relevantEntryPattern)) {
+                            // Close the entry stream as soon as the document has been parsed.
+                            val doc: Document = zipFile.getInputStream(zipEntry).use { inputStream: InputStream ->
+                                val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
+                                val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
+                                dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
+                            }
+
+                            doc.documentElement.normalize()
+                            val docId: String = doc.documentElement.getAttribute("docid")
+
+                            when (zipEntry.name.replace(entryDirectoryPattern, "$1")) {
+                                "data.xml" -> {
+                                    val textsList: NodeList = doc.getElementsByTagName("text")
+                                    if (textsList.length > 0) {
+                                        texts[docId] = textsList.item(0).textContent
+                                    }
+                                }
+
+                                "structure.xml" -> {
+                                    sentences[docId] = extractSentenceSpans(doc.getElementsByTagName("span"))
+                                }
+
+                                "tokens.xml" -> {
+                                    // Record the file name first: whoever sees the tokens must see it too.
+                                    tokenFileNames[docId] = zipEntry.name
+                                    tokens[docId] = extractSpans(doc.getElementsByTagName("span"))
+                                }
+                            }
+                            printDocumentIfComplete(docId, texts, sentences, tokens)
+                        }
+                    } catch (e: Exception) {
+                        e.printStackTrace()
+                    }
+                }
+            }
+        } catch (e: Exception) {
+            e.printStackTrace()
+        }
+    }
+
+    /**
+     * Prints the document [docId] if its text, sentence spans and token spans are all available
+     * and it has not been printed before.
+     */
+    private fun printDocumentIfComplete(
+        docId: String,
+        texts: ConcurrentHashMap<String, String>,
+        sentences: ConcurrentHashMap<String, Array<Span>>,
+        tokens: ConcurrentHashMap<String, Array<Span>>
+    ) {
+        val text = texts[docId] ?: return
+        val docSentences = sentences[docId] ?: return
+        val docTokens = tokens[docId] ?: return
+        // Several threads can observe the completed document at once; only the first one prints it.
+        if (!printedDocIds.add(docId)) return
+        if (docSentences.isEmpty() || docTokens.isEmpty()) {
+            LOGGER.warning("Skipping $docId: no sentence or token spans found")
+            return
+        }
+
+        val tokensFileName = tokenFileNames[docId] ?: ""
+        var tokenIndex = 0
+        var sentenceIndex = 0
+        synchronized(System.out) {
+            println("# foundry = base")
+            println("# filename = $tokensFileName")
+            printTokenOffsetsInSentence(sentences, docId, sentenceIndex, 0, tokens)
+            docTokens.forEachIndexed { realTokenIndex, span ->
+                tokenIndex++
+                // Tokens that start after the last sentence stay attached to it instead of
+                // running past the end of the sentence array.
+                if (sentenceIndex < docSentences.lastIndex && span.from >= docSentences[sentenceIndex].to) {
+                    println()
+                    sentenceIndex++
+                    tokenIndex = 1
+                    printTokenOffsetsInSentence(sentences, docId, sentenceIndex, realTokenIndex, tokens)
+                }
+                println("$tokenIndex\t${span.from}\t${span.to}\t${docSentences[sentenceIndex].to}\t" + text.substring(span.from, span.to))
+            }
+        }
+    }
+
+    /**
+     * Prints the `# start_offsets` and `# end_offsets` header lines of one sentence.
+     *
+     * The first value of `# start_offsets` is the start of the token at [token_index] and the
+     * first value of `# end_offsets` is the end of the sentence; both are followed by the
+     * respective offsets of every token of the sentence.
+     *
+     * @param sentences sentence spans per document id
+     * @param docId id of the document the sentence belongs to
+     * @param sentence_index index of the sentence within the document
+     * @param token_index index of the first token of the sentence within the document
+     * @param tokens token spans per document id
+     */
+    private fun printTokenOffsetsInSentence(
+        sentences: ConcurrentHashMap<String, Array<Span>>,
+        docId: String,
+        sentence_index: Int,
+        token_index: Int,
+        tokens: ConcurrentHashMap<String, Array<Span>>
+    ) {
+        val docTokens = checkNotNull(tokens[docId]) { "no tokens for $docId" }
+        val docSentences = checkNotNull(sentences[docId]) { "no sentences for $docId" }
+        val sentenceEndOffset = docSentences[sentence_index].to
+        val startOffsets = StringBuilder()
+        val endOffsets = StringBuilder()
+        var i = token_index
+        // A token belongs to the sentence if it starts before the sentence ends, so a token that
+        // ends exactly at the sentence end is included as well.
+        while (i < docTokens.size && docTokens[i].from < sentenceEndOffset) {
+            startOffsets.append(' ').append(docTokens[i].from)
+            endOffsets.append(' ').append(docTokens[i].to)
+            i++
+        }
+        println("# start_offsets = ${docTokens[token_index].from}$startOffsets")
+        println("# end_offsets = $sentenceEndOffset$endOffsets")
+    }
+
+    /**
+     * Turns every element of [spans] into a [Span] built from its `from` and `to` attributes.
+     */
+    private fun extractSpans(spans: NodeList): Array<Span> {
+        return IntStream.range(0, spans.length)
+            .mapToObj(spans::item)
+            .filter { node -> node is Element }
+            .map { node ->
+                Span(
+                    Integer.parseInt((node as Element).getAttribute("from")),
+                    Integer.parseInt(node.getAttribute("to"))
+                )
+            }
+            .toArray { size -> arrayOfNulls(size) }
+    }
+
+    /**
+     * Like [extractSpans], but keeps only the elements whose first `f` descendant has the text
+     * content `s`. Elements without any `f` descendant are skipped.
+     */
+    private fun extractSentenceSpans(spans: NodeList): Array<Span> {
+        return IntStream.range(0, spans.length)
+            .mapToObj(spans::item)
+            // item(0) is null for spans without an `f` element, hence the safe call.
+            .filter { node -> node is Element && node.getElementsByTagName("f").item(0)?.textContent == "s" }
+            .map { node ->
+                Span(
+                    Integer.parseInt((node as Element).getAttribute("from")),
+                    Integer.parseInt(node.getAttribute("to"))
+                )
+            }
+            .toArray { size -> arrayOfNulls(size) }
+    }
+
+
+    /** A span of text given by its start offset [from] and its end offset [to]. */
+    internal class Span(var from: Int, var to: Int)
+
+
+}
+
+
+/**
+ * Command line entry point: converts the zip archives given as [args].
+ */
+fun main(args: Array<String?>?) {
+    System.setProperty("file.encoding", "UTF-8")
+    // App.main returns Unit, so its result must not be printed into the converted output.
+    App().main(args)
+}
+
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxml2conllu/AppTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxml2conllu/AppTest.kt
new file mode 100644
index 0000000..2fe90b3
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxml2conllu/AppTest.kt
@@ -0,0 +1,38 @@
+package de.ids_mannheim.korapxml2conllu
+
+import java.io.ByteArrayOutputStream
+import java.io.PrintStream
+import java.net.URL
+import kotlin.test.Test
+import kotlin.test.assertNotNull
+
+class AppTest {
+    /** Returns the URL of the test resource at [path], failing if it does not exist. */
+    fun loadResource(path: String): URL {
+        val resource = Thread.currentThread().contextClassLoader.getResource(path)
+        requireNotNull(resource) { "Resource $path not found" }
+        return resource
+    }
+
+    /**
+     * Converts the bundled `goe.zip` and checks that the conversion printed a document header.
+     *
+     * `App.main` returns `Unit`, so asserting on its return value can never fail; the test
+     * inspects what was written to standard output instead.
+     * NOTE(review): assumes goe.zip holds at least one complete document - confirm.
+     */
+    @Test fun appHasAGreeting() {
+        val classUnderTest = App()
+        val args = arrayOf(loadResource("goe.zip").path)
+        val originalOut = System.out
+        val capturedOut = ByteArrayOutputStream()
+        System.setOut(PrintStream(capturedOut, true, "UTF-8"))
+        try {
+            classUnderTest.main(args)
+        } finally {
+            System.setOut(originalOut)
+        }
+        val foundryHeader = capturedOut.toString("UTF-8").lineSequence().firstOrNull { it == "# foundry = base" }
+        assertNotNull(foundryHeader, "converting goe.zip should print a '# foundry = base' header")
+    }
+}
diff --git a/app/src/test/resources/goe.zip b/app/src/test/resources/goe.zip
new file mode 100644
index 0000000..db44e94
--- /dev/null
+++ b/app/src/test/resources/goe.zip
Binary files differ