Filter out comments to avoid parser aborts on illegal -- sequences
Older DeReKo-token.xml files contain illegal comments like the one below. It
seems to be hard to convince the XML parser to ignore these any other way.
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="span.rng"
type="application/xml"
schematypens="http://relaxng.org/ns/structure/1.0"?>
<layer docid="ZCA20_NOV.00005"
xmlns="http://ids-mannheim.de/ns/KorAP"
version="KorAP-0.4">
<spanList>
<!-- /usr/bin/java -jar
/opt/perl/perlbrew/perls/perl-5.24.0/lib/site_perl/5.24.0/auto/share/dist/tei2korapxml/KorAP-Tokenizer-2.0.0-standalone.jar
--no-tokens --positions -->
<span id="t_0" from="0" to="8" />
<span id="t_1" from="9" to="11" />
<span id="t_2" from="12" to="17" />
Change-Id: I6718076983bae43257c86d7a46fdecb2f7d21f11
diff --git a/app/.idea/copilot/chatSessions/xd.lck b/app/.idea/copilot/chatSessions/xd.lck
new file mode 100644
index 0000000..6558e50
--- /dev/null
+++ b/app/.idea/copilot/chatSessions/xd.lck
@@ -0,0 +1,47 @@
+Private property of Exodus: 40047@nbmk
+
+jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:88)
+jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:39)
+jetbrains.exodus.io.FileDataWriter.lock(FileDataWriter.kt:70)
+jetbrains.exodus.log.Log.tryLock(Log.kt:804)
+jetbrains.exodus.log.Log.<init>(Log.kt:117)
+jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:117)
+jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:81)
+jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:77)
+jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46)
+jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46)
+jetbrains.exodus.env.Environments.prepare(Environments.kt:120)
+jetbrains.exodus.env.Environments.newInstance(Environments.kt:46)
+kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:40)
+kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:31)
+kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore$default(EntityStoreHelper.kt:30)
+com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.initStore(XdChatSessionPersistenceService.kt:115)
+com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.<init>(XdChatSessionPersistenceService.kt:22)
+com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.<init>(XdChatSessionPersistenceService.kt:15)
+com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.ChatSessionPersistenceService(ChatSessionPersistenceService.kt:43)
+com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.chatSessionsPersistenceService(ChatSessionPersistenceService.kt:53)
+com.github.copilot.chat.session.ChatSessionManager.<init>(ChatSessionManager.kt:45)
+com.github.copilot.chat.session.ChatSessionManager.<init>(ChatSessionManager.kt:25)
+com.github.copilot.chat.window.CopilotChatToolWindow.onCopilotReady(CopilotChatToolWindow.kt:133)
+com.github.copilot.chat.window.CopilotChatToolWindow.access$onCopilotReady(CopilotChatToolWindow.kt:40)
+com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:118)
+com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:115)
+com.github.copilot.status.CopilotAuthStatusKt.subscribeToCopilotAuthStatus$lambda$0(CopilotAuthStatus.kt:44)
+com.intellij.util.messages.impl.MessageBusImplKt.invokeMethod(MessageBusImpl.kt:700)
+com.intellij.util.messages.impl.MessageBusImplKt.invokeListener(MessageBusImpl.kt:664)
+com.intellij.util.messages.impl.MessageBusImplKt.deliverMessage(MessageBusImpl.kt:423)
+com.intellij.util.messages.impl.MessageBusImplKt.pumpWaiting(MessageBusImpl.kt:402)
+com.intellij.util.messages.impl.MessageBusImplKt.access$pumpWaiting(MessageBusImpl.kt:1)
+com.intellij.util.messages.impl.MessagePublisher.invoke(MessageBusImpl.kt:461)
+jdk.proxy7/jdk.proxy7.$Proxy166.onCopilotStatus(Unknown Source)
+com.github.copilot.status.CopilotStatusService.notifyApplication(CopilotStatusService.java:76)
+com.github.copilot.status.CopilotStatusService.notifyApplication(CopilotStatusService.java:64)
+com.github.copilot.github.GitHubAuthStartupActivity.handleAuthNotifications(GitHubAuthStartupActivity.java:54)
+com.github.copilot.github.GitHubAuthStartupActivity.execute(GitHubAuthStartupActivity.java:35)
+com.intellij.ide.startup.impl.StartupManagerImplKt$launchActivity$1.invokeSuspend(StartupManagerImpl.kt:482)
+kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33)
+kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:108)
+kotlinx.coroutines.scheduling.CoroutineScheduler.runSafely(CoroutineScheduler.kt:584)
+kotlinx.coroutines.scheduling.CoroutineScheduler$Worker.executeTask(CoroutineScheduler.kt:793)
+kotlinx.coroutines.scheduling.CoroutineScheduler$Worker.runWorker(CoroutineScheduler.kt:697)
+kotlinx.coroutines.scheduling.CoroutineScheduler$Worker.run(CoroutineScheduler.kt:684)
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 1a63b32..998d071 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -11,7 +11,6 @@
import picocli.CommandLine.*
import java.io.File
import java.io.InputStream
-import java.io.InputStreamReader
import java.util.*
import java.util.concurrent.Callable
import java.util.concurrent.ConcurrentHashMap
@@ -320,8 +319,9 @@
val inputStream: InputStream = zipFile.getInputStream(zipEntry)
val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
+
val doc: Document = try {
- dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
+ dBuilder.parse(InputSource(XMLCommentFilterReader(inputStream, "UTF-8")))
} catch (e: SAXParseException) {
LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message)
return
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/XMLCommentFilterReader.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/XMLCommentFilterReader.kt
new file mode 100644
index 0000000..cb05dc6
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/XMLCommentFilterReader.kt
@@ -0,0 +1,144 @@
+package de.ids_mannheim.korapxmltools
+
+import sun.nio.cs.StreamDecoder
+import java.io.InputStream
+import java.io.InputStreamReader
+
+/**
+ * Reader that decodes a byte stream with the given charset and removes every XML
+ * comment (`<!-- ... -->`) from the character stream before handing it to the consumer.
+ *
+ * Some older DeReKo token files contain comments with `--` inside them, which
+ * is not well-formed XML and makes the parser abort; dropping the comments up
+ * front avoids that.
+ *
+ * The filter is purely textual: it also removes `<!--` ... `-->` sequences
+ * inside CDATA sections or attribute values. State is kept across calls, so
+ * delimiters and comment bodies may span any number of underlying reads. An
+ * unterminated comment swallows the rest of the input.
+ *
+ * @param in byte stream containing the XML document
+ * @param charsetName name of the charset used to decode the byte stream
+ */
+class XMLCommentFilterReader(`in`: InputStream, charsetName: String) : InputStreamReader(`in`, charsetName) {
+    companion object {
+        const val COMMENT_START = "<!--"
+        const val COMMENT_END = "-->"
+
+        /** Number of chars requested from the decoder per refill. */
+        private const val CHUNK_SIZE = 8192
+    }
+
+    /** Scratch buffer for raw (unfiltered) chars from the decoder. */
+    private val chunk = CharArray(CHUNK_SIZE)
+
+    /** Filtered chars that have not been delivered to the caller yet. */
+    private val pending = StringBuilder()
+
+    /** Index of the next undelivered char in [pending]. */
+    private var pendingPos = 0
+
+    /** True while the scanner is between COMMENT_START and COMMENT_END. */
+    private var inComment = false
+
+    /** Chars of the delimiter currently looked for that are matched so far. */
+    private var matched = 0
+
+    /** True once the decoder reported end of stream. */
+    private var eof = false
+
+    /**
+     * Reads up to [len] comment-free chars into [cbuf] starting at [off].
+     *
+     * @return number of chars stored (at least 1 when [len] > 0), or -1 at end of stream
+     * @throws IndexOutOfBoundsException if [off] and [len] do not describe a range of [cbuf]
+     */
+    override fun read(cbuf: CharArray, off: Int, len: Int): Int {
+        if (off < 0 || len < 0 || len > cbuf.size - off) {
+            throw IndexOutOfBoundsException("off=$off, len=$len, size=${cbuf.size}")
+        }
+        if (len == 0) {
+            return 0
+        }
+        // Keep refilling: a whole chunk may consist of comment text only, and
+        // returning 0 for a non-empty request would confuse callers.
+        while (pendingPos >= pending.length) {
+            if (eof) {
+                return -1
+            }
+            fill()
+        }
+        val count = minOf(len, pending.length - pendingPos)
+        pending.getChars(pendingPos, pendingPos + count, cbuf, off)
+        pendingPos += count
+        return count
+    }
+
+    /** Reads a single comment-free char, or returns -1 at end of stream. */
+    override fun read(): Int {
+        val single = CharArray(1)
+        return if (read(single, 0, 1) < 0) -1 else single[0].code
+    }
+
+    /** Closes the underlying reader and discards all filter state. */
+    override fun close() {
+        super.close()
+        pending.setLength(0)
+        pendingPos = 0
+        inComment = false
+        matched = 0
+        eof = false // let the next read reach the closed decoder and fail there
+    }
+
+    /** Replaces [pending] with the filtered content of the next decoded chunk. */
+    private fun fill() {
+        pending.setLength(0)
+        pendingPos = 0
+        val n = super.read(chunk, 0, chunk.size)
+        if (n < 0) {
+            eof = true
+            if (!inComment) {
+                // A held-back partial "<!--" at the very end was ordinary text.
+                pending.append(COMMENT_START, 0, matched)
+            }
+            matched = 0
+            return
+        }
+        for (i in 0 until n) {
+            accept(chunk[i])
+        }
+    }
+
+    /** Advances the scanner by one char, appending it to [pending] if it is kept. */
+    private fun accept(c: Char) {
+        if (inComment) {
+            matched = when {
+                c == COMMENT_END[matched] -> matched + 1
+                c == '-' -> matched // "---": the last two chars are still "--"
+                else -> 0
+            }
+            if (matched == COMMENT_END.length) {
+                inComment = false
+                matched = 0
+            }
+            return
+        }
+        if (c == COMMENT_START[matched]) {
+            // Possible comment start: hold the char back until we know.
+            matched++
+            if (matched == COMMENT_START.length) {
+                inComment = true
+                matched = 0
+            }
+            return
+        }
+        // Mismatch: what was held back is ordinary text after all.
+        pending.append(COMMENT_START, 0, matched)
+        if (c == COMMENT_START[0]) {
+            matched = 1 // this char may itself open a comment
+        } else {
+            matched = 0
+            pending.append(c)
+        }
+    }
+}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index a4de206..65f4f6b 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -20,6 +20,7 @@
val goe = loadResource("goe.zip").path
val goeMarmot = loadResource("goe.marmot.zip").path
val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
+ val zca20scrambled = loadResource("zca20-scrambled.zip").path
@Before
fun setUpStreams() {
@@ -145,6 +146,16 @@
}
@Test
+    fun canConvertXMLwithInvalidComments() {
+        // Fixture presumably holds XML comments with illegal "--" sequences (see test name); verify.
+        val expectedSentence =
+            "\nDys est yuch dyr Grund dyfür , dyss ys schon myl myhryry Wochyn dyuyrn kynn .\n"
+        val cliArgs = arrayOf("-w", zca20scrambled)
+        debug(cliArgs)
+        assertContains(outContent.toString(), expectedSentence)
+    }
+
+ @Test
fun canSetLogLevel() {
val args = arrayOf("-l", "info", loadResource("wdf19.zip").path)
debug(args)
diff --git a/app/src/test/resources/zca20-scrambled.zip b/app/src/test/resources/zca20-scrambled.zip
new file mode 100644
index 0000000..8f23582
--- /dev/null
+++ b/app/src/test/resources/zca20-scrambled.zip
Binary files differ