Filter out comments to avoid parser aborts on illegal -- sequences

Older DeReKo-token.xml files contain illegal comments like the one below. It
seems to be hard to convince the XML parser to ignore them any other way.

<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="span.rng"
            type="application/xml"
            schematypens="http://relaxng.org/ns/structure/1.0"?>
<layer docid="ZCA20_NOV.00005"
       xmlns="http://ids-mannheim.de/ns/KorAP"
       version="KorAP-0.4">
  <spanList>
<!-- /usr/bin/java -jar
/opt/perl/perlbrew/perls/perl-5.24.0/lib/site_perl/5.24.0/auto/share/dist/tei2korapxml/KorAP-Tokenizer-2.0.0-standalone.jar
--no-tokens --positions -->
    <span id="t_0" from="0" to="8" />
    <span id="t_1" from="9" to="11" />
    <span id="t_2" from="12" to="17" />

Change-Id: I6718076983bae43257c86d7a46fdecb2f7d21f11
diff --git a/app/.idea/copilot/chatSessions/xd.lck b/app/.idea/copilot/chatSessions/xd.lck
new file mode 100644
index 0000000..6558e50
--- /dev/null
+++ b/app/.idea/copilot/chatSessions/xd.lck
@@ -0,0 +1,47 @@
+Private property of Exodus: 40047@nbmk
+
+jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:88)
+jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:39)
+jetbrains.exodus.io.FileDataWriter.lock(FileDataWriter.kt:70)
+jetbrains.exodus.log.Log.tryLock(Log.kt:804)
+jetbrains.exodus.log.Log.<init>(Log.kt:117)
+jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:117)
+jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:81)
+jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:77)
+jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46)
+jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46)
+jetbrains.exodus.env.Environments.prepare(Environments.kt:120)
+jetbrains.exodus.env.Environments.newInstance(Environments.kt:46)
+kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:40)
+kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:31)
+kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore$default(EntityStoreHelper.kt:30)
+com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.initStore(XdChatSessionPersistenceService.kt:115)
+com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.<init>(XdChatSessionPersistenceService.kt:22)
+com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.<init>(XdChatSessionPersistenceService.kt:15)
+com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.ChatSessionPersistenceService(ChatSessionPersistenceService.kt:43)
+com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.chatSessionsPersistenceService(ChatSessionPersistenceService.kt:53)
+com.github.copilot.chat.session.ChatSessionManager.<init>(ChatSessionManager.kt:45)
+com.github.copilot.chat.session.ChatSessionManager.<init>(ChatSessionManager.kt:25)
+com.github.copilot.chat.window.CopilotChatToolWindow.onCopilotReady(CopilotChatToolWindow.kt:133)
+com.github.copilot.chat.window.CopilotChatToolWindow.access$onCopilotReady(CopilotChatToolWindow.kt:40)
+com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:118)
+com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:115)
+com.github.copilot.status.CopilotAuthStatusKt.subscribeToCopilotAuthStatus$lambda$0(CopilotAuthStatus.kt:44)
+com.intellij.util.messages.impl.MessageBusImplKt.invokeMethod(MessageBusImpl.kt:700)
+com.intellij.util.messages.impl.MessageBusImplKt.invokeListener(MessageBusImpl.kt:664)
+com.intellij.util.messages.impl.MessageBusImplKt.deliverMessage(MessageBusImpl.kt:423)
+com.intellij.util.messages.impl.MessageBusImplKt.pumpWaiting(MessageBusImpl.kt:402)
+com.intellij.util.messages.impl.MessageBusImplKt.access$pumpWaiting(MessageBusImpl.kt:1)
+com.intellij.util.messages.impl.MessagePublisher.invoke(MessageBusImpl.kt:461)
+jdk.proxy7/jdk.proxy7.$Proxy166.onCopilotStatus(Unknown Source)
+com.github.copilot.status.CopilotStatusService.notifyApplication(CopilotStatusService.java:76)
+com.github.copilot.status.CopilotStatusService.notifyApplication(CopilotStatusService.java:64)
+com.github.copilot.github.GitHubAuthStartupActivity.handleAuthNotifications(GitHubAuthStartupActivity.java:54)
+com.github.copilot.github.GitHubAuthStartupActivity.execute(GitHubAuthStartupActivity.java:35)
+com.intellij.ide.startup.impl.StartupManagerImplKt$launchActivity$1.invokeSuspend(StartupManagerImpl.kt:482)
+kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33)
+kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:108)
+kotlinx.coroutines.scheduling.CoroutineScheduler.runSafely(CoroutineScheduler.kt:584)
+kotlinx.coroutines.scheduling.CoroutineScheduler$Worker.executeTask(CoroutineScheduler.kt:793)
+kotlinx.coroutines.scheduling.CoroutineScheduler$Worker.runWorker(CoroutineScheduler.kt:697)
+kotlinx.coroutines.scheduling.CoroutineScheduler$Worker.run(CoroutineScheduler.kt:684)
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 1a63b32..998d071 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -11,7 +11,6 @@
 import picocli.CommandLine.*
 import java.io.File
 import java.io.InputStream
-import java.io.InputStreamReader
 import java.util.*
 import java.util.concurrent.Callable
 import java.util.concurrent.ConcurrentHashMap
@@ -320,8 +319,9 @@
                 val inputStream: InputStream = zipFile.getInputStream(zipEntry)
                 val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
                 val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
+
                 val doc: Document = try {
-                    dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
+                    dBuilder.parse(InputSource(XMLCommentFilterReader(inputStream, "UTF-8")))
                 } catch (e: SAXParseException) {
                     LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message)
                     return
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/XMLCommentFilterReader.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/XMLCommentFilterReader.kt
new file mode 100644
index 0000000..cb05dc6
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/XMLCommentFilterReader.kt
@@ -0,0 +1,74 @@
+package de.ids_mannheim.korapxmltools
+
+import sun.nio.cs.StreamDecoder
+import java.io.InputStream
+import java.io.InputStreamReader
+
+/** Reader that decodes `in` with [charsetName] and strips XML comments (`<!-- ... -->`) by a plain
+ *  textual scan, so that illegal `--` sequences inside comments never reach the XML parser.
+ *  Matching state is kept between read calls, so comments and delimiters may span read buffers. */
+class XMLCommentFilterReader(`in`: InputStream, private val charsetName: String) : InputStreamReader(`in`, charsetName) {
+    companion object {
+        const val COMMENT_START = "<!--"
+        const val COMMENT_END = "-->"
+    }
+
+    private var commentBuffer: StringBuilder? = null // non-null inside a comment: matched prefix of COMMENT_END
+    private var startMatched = 0 // number of withheld characters matching a prefix of COMMENT_START
+    private val pending = StringBuilder() // filtered characters not yet handed to the caller
+
+    /** Reads up to [len] filtered chars into [cbuf] at [off]; returns the count, or -1 at end of input.
+     *  Keeps reading while chunks are filtered away entirely, so 0 is only returned for len == 0. */
+    override fun read(cbuf: CharArray, off: Int, len: Int): Int {
+        if (len == 0) return 0
+        while (pending.isEmpty()) {
+            val n = super.read(cbuf, off, len) // cbuf doubles as scratch space for the raw characters
+            if (n < 0 && startMatched == 0) return -1
+            if (n < 0) flushStartPrefix() // an unfinished "<!-" at end of input is ordinary text
+            for (i in off until off + n) filter(cbuf[i])
+        }
+        val count = minOf(len, pending.length)
+        pending.getChars(0, count, cbuf, off)
+        pending.delete(0, count)
+        return count
+    }
+
+    // Runs one character through the comment state machine; text to keep is appended to pending.
+    private fun filter(c: Char) {
+        val end = commentBuffer
+        if (end != null) {
+            if (c == COMMENT_END[end.length]) {
+                end.append(c)
+                if (end.length == COMMENT_END.length) commentBuffer = null // End of comment
+            } else if (c != '-') {
+                end.clear() // an extra '-' after "--" (as in "--->") keeps the two dashes matched
+            }
+        } else if (c == COMMENT_START[startMatched]) {
+            if (++startMatched == COMMENT_START.length) {
+                startMatched = 0
+                commentBuffer = StringBuilder() // comment opened; its delimiter is dropped
+            }
+        } else {
+            flushStartPrefix()
+            if (c == COMMENT_START[0]) startMatched = 1 else pending.append(c)
+        }
+    }
+
+    // Emits a withheld partial COMMENT_START that turned out not to open a comment.
+    private fun flushStartPrefix() {
+        pending.append(COMMENT_START, 0, startMatched)
+        startMatched = 0
+    }
+
+    override fun read(): Int {
+        // Not implemented, you should use read(char[], int, int) instead
+        throw UnsupportedOperationException("read() is not supported. Use read(char[], int, int) instead.")
+    }
+
+    override fun close() {
+        super.close()
+        commentBuffer = null // Reset the filter state on close
+        startMatched = 0
+        pending.clear()
+    }
+}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index a4de206..65f4f6b 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -20,6 +20,7 @@
     val goe = loadResource("goe.zip").path
     val goeMarmot = loadResource("goe.marmot.zip").path
     val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
+    val zca20scrambled = loadResource("zca20-scrambled.zip").path
 
     @Before
     fun setUpStreams() {
@@ -145,6 +146,16 @@
     }
 
     @Test
+    fun canConvertXMLwithInvalidComments() {
+        // The fixture is expected to contain token files whose XML comments hold illegal "--" sequences.
+        debug(arrayOf("-w", zca20scrambled))
+        val convertedOutput = outContent.toString()
+        val expectedSentence =
+            "\nDys est yuch dyr Grund dyfür , dyss ys schon myl myhryry Wochyn dyuyrn kynn .\n"
+        assertContains(convertedOutput, expectedSentence)
+    }
+
+    @Test
     fun canSetLogLevel() {
         val args = arrayOf("-l", "info", loadResource("wdf19.zip").path)
         debug(args)
diff --git a/app/src/test/resources/zca20-scrambled.zip b/app/src/test/resources/zca20-scrambled.zip
new file mode 100644
index 0000000..8f23582
--- /dev/null
+++ b/app/src/test/resources/zca20-scrambled.zip
Binary files differ