Warn on malformed spans and clamp token offsets instead of throwing
Change-Id: I69d29c4e6d18509760dcb9657a964725922d985e
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 4e592cc..df2c5a1 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -853,6 +853,7 @@
return output
}
tokens[docId]?.forEach { span ->
+ if (span == null) return@forEach
token_index++
if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
if (output.isNotEmpty()) {
@@ -865,16 +866,19 @@
}
sentence_index++
}
+ // Clamp the span offsets into the text bounds so substring() cannot throw
+ val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)
+ val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)
if (useLemma && morpho[docId] != null) {
val key = "${span.from}-${span.to}"
val lemmaVal = morpho[docId]!![key]?.lemma
if (lemmaVal != null && lemmaVal != "_") {
output.append(lemmaVal, " ")
} else {
- output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
}
} else {
- output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
}
real_token_index++
}
@@ -898,6 +902,7 @@
}
tokens[docId]?.forEach { span ->
+ if (span == null) return@forEach
token_index++
if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
// Replace sentence end with <p> tag instead of newline
@@ -906,16 +911,19 @@
}
sentence_index++
}
+ // Clamp the span offsets into the text bounds so substring() cannot throw
+ val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)
+ val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)
if (useLemma && morpho[docId] != null) {
val key = "${span.from}-${span.to}"
val lemmaVal = morpho[docId]!![key]?.lemma
if (lemmaVal != null && lemmaVal != "_") {
output.append(lemmaVal, " ")
} else {
- output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
}
} else {
- output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
}
real_token_index++
}
@@ -978,11 +986,26 @@
}
private fun extractSpans(spans: NodeList): Array<Span> {
- return IntStream.range(0, spans.length).mapToObj(spans::item).filter { node -> node is Element }.map { node ->
- Span(
- Integer.parseInt((node as Element).getAttribute("from")), Integer.parseInt(node.getAttribute("to"))
- )
- }.toArray { size -> arrayOfNulls(size) }
+ val list = ArrayList<Span>()
+ IntStream.range(0, spans.length).forEach { idx ->
+ val node = spans.item(idx)
+ if (node is Element) {
+ val fromAttr = node.getAttribute("from")
+ val toAttr = node.getAttribute("to")
+ if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) {
+ LOGGER.warning("Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'")
+ } else {
+ try {
+ val from = Integer.parseInt(fromAttr)
+ val to = Integer.parseInt(toAttr)
+ list.add(Span(from, to))
+ } catch (e: NumberFormatException) {
+ LOGGER.warning("Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}")
+ }
+ }
+ }
+ }
+ return list.toTypedArray()
}
private fun extractMorphoSpans(