Handle morpho.xml annotations in KorAP-XML to CoNLL-U conversion
Change-Id: Ie1177c201d1f1824b171780411f82ce05bb73c2d
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 80f8d16..c6fb696 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -24,6 +24,8 @@
val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
+ val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap()
+ val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
Arrays.stream(args).forEach { zipFilePath ->
executor.submit {
@@ -31,7 +33,10 @@
zipFilePath ?: "",
texts,
sentences,
- tokens
+ tokens,
+ fnames,
+ morpho,
+ args!!.size > 1
)
}
}
@@ -48,28 +53,30 @@
zipFilePath: String,
texts: ConcurrentHashMap<String, String>,
sentences: ConcurrentHashMap<String, Array<Span>>,
- tokens: ConcurrentHashMap<String, Array<Span>>
+ tokens: ConcurrentHashMap<String, Array<Span>>,
+ fname: ConcurrentHashMap<String, String>,
+ morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>>,
+ waitForMorpho: Boolean = false
) {
try {
ZipFile(zipFilePath).use { zipFile ->
zipFile.stream().parallel().forEach { zipEntry ->
try {
- if (zipEntry.name.matches(Regex(".*(data|tokens|structure)\\.xml$"))) {
+ if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) {
val inputStream: InputStream = zipFile.getInputStream(zipEntry)
val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
- val doc: Document = dBuilder.parse( InputSource( InputStreamReader(inputStream, "UTF-8")))
+ val doc: Document = dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
doc.documentElement.normalize()
val docId: String = doc.documentElement.getAttribute("docid")
// LOGGER.info("Processing file: " + zipEntry.getName())
val fileName =
- zipEntry.name.replace(Regex(".*?/((data|tokens|structure)\\.xml)$"), "$1")
+ zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
var token_index = 0
var real_token_index = 0
var sentence_index = 0
- var tokens_fname= ""
when (fileName) {
"data.xml" -> {
val textsList: NodeList = doc.getElementsByTagName("text")
@@ -86,32 +93,69 @@
}
"tokens.xml" -> {
- tokens_fname = zipEntry.name
+ fname[docId] = zipEntry.name
val tokenSpans: NodeList = doc.getElementsByTagName("span")
val tokenSpanObjects =
extractSpans(tokenSpans)
tokens[docId] = tokenSpanObjects
}
+
+ "morpho.xml" -> {
+ val fsSpans: NodeList = doc.getElementsByTagName("span")
+ extractMorphoSpans(fsSpans, docId, morpho)
+ }
}
- if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null) {
+ if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
+ && (!waitForMorpho || morpho[docId] != null)
+ ) {
synchronized(System.out) {
println("# foundry = base")
- println("# filename = $tokens_fname")
+ println("# filename = ${fname[docId]}")
println("# text_id = $docId")
- printTokenOffsetsInSentence(sentences, docId, sentence_index, real_token_index, tokens)
+ printTokenOffsetsInSentence(
+ sentences,
+ docId,
+ sentence_index,
+ real_token_index,
+ tokens
+ )
tokens[docId]?.forEach { span ->
token_index++
if (span.from >= sentences[docId]!![sentence_index].to) {
println()
sentence_index++
token_index = 1
- printTokenOffsetsInSentence(sentences, docId, sentence_index, real_token_index, tokens)
+ printTokenOffsetsInSentence(
+ sentences,
+ docId,
+ sentence_index,
+ real_token_index,
+ tokens
+ )
}
- printConlluToken(token_index, texts[docId]!!.substring(span.from, span.to) )
+ if (waitForMorpho && morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
+ val mfs = morpho[docId]!!["${span.from}-${span.to}"]
+ printConlluToken(
+ token_index,
+ texts[docId]!!.substring(span.from, span.to),
+ mfs!!.lemma!!,
+ mfs.upos!!,
+ mfs.xpos!!,
+ mfs.feats!!,
+ mfs.head!!,
+ mfs.deprel!!,
+ mfs.deps!!,
+ mfs.misc!!
+ )
+ } else {
+ printConlluToken(
+ token_index, texts[docId]!!.substring(span.from, span.to)
+ )
+ }
real_token_index++
}
- arrayOf(tokens, texts, sentences).forEach { map ->
+ arrayOf(tokens, texts, sentences, morpho).forEach { map ->
map.remove(docId)
}
println()
@@ -129,6 +173,7 @@
}
}
+
private fun printConlluToken(
token_index: Int,
token: String,
@@ -143,6 +188,7 @@
) {
println("$token_index\t$token\t$lemma\t$upos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc")
}
+
private fun printTokenOffsetsInSentence(
sentences: ConcurrentHashMap<String, Array<Span>>,
docId: String,
@@ -150,7 +196,12 @@
token_index: Int,
tokens: ConcurrentHashMap<String, Array<Span>>
) {
- val sentenceEndOffset = sentences[docId]!![sentence_index].to
+ val sentenceEndOffset: Int
+ if (sentences[docId] == null) {
+ sentenceEndOffset = -1
+ } else {
+ sentenceEndOffset = sentences[docId]!![sentence_index].to
+ }
var i = token_index
var start_offsets_string = ""
var end_offsets_string = ""
@@ -176,6 +227,36 @@
.toArray { size -> arrayOfNulls(size) }
}
+ private fun extractMorphoSpans(
+ fsSpans: NodeList,
+ docId: String,
+ morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>>
+ ) {
+ IntStream.range(0, fsSpans.length)
+ .mapToObj(fsSpans::item)
+ .forEach { node ->
+ val features = (node as Element).getElementsByTagName("f")
+ var fs = MorphoSpan()
+ val fromTo = node.getAttribute("from") + "-" + node.getAttribute("to")
+ IntStream.range(0, features.length).mapToObj(features::item)
+ .forEach { feature ->
+ val attr = (feature as Element).getAttribute("name")
+ val value = feature.textContent
+ when (attr) {
+ "lemma" -> fs.lemma = value
+ "upos" -> fs.upos = value
+ "xpos" -> fs.xpos = value
+ "certainty" -> fs.misc = value
+ "ctag", "pos" -> fs.xpos = value
+ }
+ }
+ if (morpho[docId] == null) {
+ morpho[docId] = mutableMapOf()
+ }
+ morpho[docId]!![fromTo] = fs
+ }
+ }
+
private fun extractSentenceSpans(spans: NodeList): Array<Span> {
return IntStream.range(0, spans.length)
.mapToObj(spans::item)
@@ -192,6 +273,16 @@
internal class Span(var from: Int, var to: Int)
+ internal class MorphoSpan(
+ var lemma: String? = "_",
+ var upos: String? = "_",
+ var xpos: String? = "_",
+ var feats: String? = "_",
+ var head: String? = "_",
+ var deprel: String? = "_",
+ var deps: String? = "_",
+ var misc: String? = "_"
+ )
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index c11940f..f8ef4cb 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -34,8 +34,8 @@
}
@Test
- fun appWorks() {
- val classUnderTest = de.ids_mannheim.korapxmltools.KorapXml2Conllu()
+ fun canConvertGOE() {
+ val classUnderTest = KorapXml2Conllu()
val args = arrayOf(loadResource("goe.zip").path)
classUnderTest.main(args)
assertContains(
@@ -43,4 +43,25 @@
"# start_offsets = 55 55 59 63 70 75 82 87 94 102 105 111 120 124 130 134 140 144 151 153 163 175 187 191 207 209 213 218 222 239 248 255 259 264 267 271 277 283 297 307"
)
}
+ @Test
+ fun canConvertWithMorphoAnnotations() {
+ val classUnderTest = KorapXml2Conllu()
+ val args = arrayOf(loadResource("goe.zip").path, loadResource("goe.tree_tagger.zip").path)
+ classUnderTest.main(args)
+ assertContains(
+ outContent.toString(),
+ "9\tentzücke\tentzücken\t_\tVVFIN\t_\t_\t_\t_\t1.000000"
+ )
+ }
+
+ @Test
+ fun canConvertWdfWithMorphoAnnotations() {
+ val classUnderTest = KorapXml2Conllu()
+ val args = arrayOf(loadResource("wdf19.zip").path, loadResource("wdf19.tree_tagger.zip").path)
+ classUnderTest.main(args)
+ assertContains(
+ outContent.toString(),
+ "30\tvraie\tvrai\t_\tADJ\t_\t_\t_\t_\t1.000000"
+ )
+ }
}
diff --git a/app/src/test/resources/goe.tree_tagger.zip b/app/src/test/resources/goe.tree_tagger.zip
new file mode 100644
index 0000000..d7bf483
--- /dev/null
+++ b/app/src/test/resources/goe.tree_tagger.zip
Binary files differ
diff --git a/app/src/test/resources/wdf19.tree_tagger.zip b/app/src/test/resources/wdf19.tree_tagger.zip
new file mode 100644
index 0000000..aef5987
--- /dev/null
+++ b/app/src/test/resources/wdf19.tree_tagger.zip
Binary files differ
diff --git a/app/src/test/resources/wdf19.zip b/app/src/test/resources/wdf19.zip
new file mode 100644
index 0000000..61a8bdf
--- /dev/null
+++ b/app/src/test/resources/wdf19.zip
Binary files differ