Implement --extract-metadata-regex option
Change-Id: I3fb3a78787d62c88d7e885e268393d2ad4408cfa
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 7534df7..fbd0d09 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -94,7 +94,7 @@
@Option(
names = ["--extract-metadata-regex", "-m"],
paramLabel = "REGEX",
- description = ["Not yet implemented: extract metadata regex"]
+ description = ["Extract metadata regexes.\nExample: -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)'"]
)
var extractMetadataRegex: MutableList<String> = mutableListOf()
@@ -130,6 +130,7 @@
private var workerPool : WorkerPool? = null
+ val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
fun korapxml2conllu(args: Array<String>) {
val executor: ExecutorService = Executors.newFixedThreadPool(threads)
val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
@@ -274,10 +275,30 @@
}
}
- if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null && (!waitForMorpho || morpho[docId] != null)) {
+ if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
+ && (!waitForMorpho || morpho[docId] != null)
+ && (extractMetadataRegex.isEmpty() || metadata.containsKey(docId))
+ ) {
processText(tokens, docId, sentences, texts, foundry, fname, waitForMorpho, morpho)
}
+ } else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
+ //LOGGER.info("Processing header file: " + zipEntry.name)
+ val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }
+ val docId =
+ Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1()
+ ?.replace(Regex("/"), "_")
+ LOGGER.info("Processing header file: " + zipEntry.name + " docId: " + docId)
+ val meta = ArrayList<String>()
+ extractMetadataRegex.forEach { regex ->
+ val match = Regex(regex).find(text)
+ if (match != null) {
+ meta.add(match.destructured.component1())
+ }
+ }
+ if (meta.isNotEmpty() && docId != null) {
+ metadata[docId] = meta.toTypedArray()
+ }
}
} catch (e: Exception) {
e.printStackTrace()
@@ -305,7 +326,9 @@
val output: StringBuilder
if (lmTrainingData) {
output = StringBuilder()
-
+ if (extractMetadataRegex.isNotEmpty()) {
+ output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
+ }
tokens[docId]?.forEach { span ->
token_index++
if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -314,6 +337,9 @@
} else {
output.append("\n")
}
+ if (extractMetadataRegex.isNotEmpty() && real_token_index < tokens[docId]!!.size - 1) {
+ output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
+ }
sentence_index++
}
output.append(texts[docId]!!.substring(span.from, span.to), " ")
@@ -329,6 +355,9 @@
sentences, docId, sentence_index, real_token_index, tokens
)
)
+ if (extractMetadataRegex.isNotEmpty()) {
+ output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
+ }
tokens[docId]?.forEach { span ->
token_index++
if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -377,7 +406,7 @@
}
}
- arrayOf(tokens, texts, sentences, morpho, fname).forEach { map ->
+ arrayOf(tokens, texts, sentences, morpho, fname, metadata).forEach { map ->
map.remove(docId)
}
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index c252fd4..6131fd2 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -166,6 +166,16 @@
{ outContent.toString().count { it == '\n'} >= 61511 })
}
+ @Test
+ fun canExtractMetadata() {
+ val args = arrayOf("--word2vec", "-m" ,"<textSigle>([^<]+)", "-m", "<creatDate>([^<]+)", loadResource("wdf19.zip").path)
+ debug(args)
+ assertContains(
+ outContent.toString(),
+ "WDF19/A0000.12006\t2011.08.11\tmerci pour l'info je suis curieux !"
+ )
+ }
+
@Ignore("for some reason not working")
fun canConvertMorphoFeatureAnnotations() {
val args = arrayOf(goe, goeMarmot)