Add lemma option to print lemmas in w2v/now exports

Change-Id: If6f99edb6840088b12507010534a75be59d24a20

commit: eb11eac2612e94596787004c8bbc588fb900d016 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Aug 28 20:21:54 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Aug 28 20:26:06 2025 +0200
tree: 26ce719a444175f81089059d692fc68d4a044b3c
parent: b7de1e8879e9ef6d8d7582f3ea252d66cc8212a6 [diff]
diff --git a/Readme.md b/Readme.md
index 2dfcda4..2a3d6d4 100644
--- a/Readme.md
+++ b/Readme.md

@@ -68,26 +68,40 @@
 java -jar korapxmltool.jar -f now /vol/corpora/DeReKo/current/KorAP/zip/*24.zip | pv > dach24.txt
 ```
 
+### Using lemmas instead of surface forms in word2vec / NOW output
+
+If lemma annotations (morpho layer) are present alongside the base tokens, you can output lemmas instead of surface tokens with `--lemma`.
+
+```shell script
+# Word2Vec style output with lemmas where available
+java -jar ./app/build/libs/korapxmltool.jar --lemma -f w2v app/src/test/resources/goe.tree_tagger.zip | head -3
+
+# NOW corpus style output with lemmas
+java -jar ./app/build/libs/korapxmltool.jar --lemma -f now app/src/test/resources/goe.tree_tagger.zip | head -1
+```
+
+If a token has no lemma (no annotation, or the lemma is `_`), the surface form is used as a fallback.
+
 ## Annotation
 
 ### Tagging with integrated MarMoT POS tagger directly to a new KorAP-XML ZIP file
 
-You need to download the pre-trained MarMoT models from the [here](http://cistern.cis.lmu.de/marmot/models/CURRENT/).
+You need to download the pre-trained MarMoT models from the [MarMoT models repository](http://cistern.cis.lmu.de/marmot/models/CURRENT/).
 
 ```shell script
-$ java -jar ./app/build/libs/korapxmltool.jar -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip
+java -jar ./app/build/libs/korapxmltool.jar -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip
 ```
 
 ### Tagging with integrated OpenNLP POS tagger directly to a new KorAP-XML ZIP file
 
-You need to download the pre-trained OpenNLP models from [here](https://opennlp.apache.org/models.html#part_of_speech_tagging) or older models from  [here](http://opennlp.sourceforge.net/models-1.5/).
+You need to download the pre-trained OpenNLP models from the [OpenNLP model download page](https://opennlp.apache.org/models.html#part_of_speech_tagging) or older models from the [legacy OpenNLP models archive](http://opennlp.sourceforge.net/models-1.5/).
 ```shell script
 java -jar ./app/build/libs/korapxmltool.jar -f zip -t opennlp:/usr/local/kl/korap/Ingestion/lib/models/opennlp/de-pos-maxent.bin /tmp/zca24.zip
 ```
 
 ### Tag and lemmatize with TreeTagger
 
-This requires the [TreeTagger Docker Image with CoNLL-U Support](https://gitlab.ids-mannheim.de/KorAP/CoNLL-U-Treetagger). 
+This requires the [TreeTagger Docker Image with CoNLL-U Support](https://gitlab.ids-mannheim.de/KorAP/CoNLL-U-Treetagger).
 Language models are downloaded automatically.
 
 ```shell script
@@ -106,7 +120,7 @@
 
 ### Using the integrated Maltparser
 
-You need to download the pre-trained MaltParser models from the [here](http://www.maltparser.org/mco/mco.html).
+You need to download the pre-trained MaltParser models from the [MaltParser model repository](http://www.maltparser.org/mco/mco.html).
 Note that parsers take POS tagged input.
 
 ```shell script

diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 100d78d..4e592cc 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -175,6 +175,12 @@
     )
     var overwrite: Boolean = false
 
+    @Option(
+        names = ["--lemma"],
+        description = ["In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)"]
+    )
+    var useLemma: Boolean = false // when true, output code emits morpho[docId]["from-to"].lemma; falls back to the surface substring if that lemma is null or "_"
+
     private var taggerName: String? = null
     private var taggerModel: String? = null
     @Option(
@@ -859,7 +865,17 @@
                 }
                 sentence_index++
             }
-            output.append(texts[docId]!!.substring(span.from, span.to), " ")
+            if (useLemma && morpho[docId] != null) {
+                val key = "${span.from}-${span.to}"
+                val lemmaVal = morpho[docId]!![key]?.lemma
+                if (lemmaVal != null && lemmaVal != "_") {
+                    output.append(lemmaVal, " ")
+                } else {
+                    output.append(texts[docId]!!.substring(span.from, span.to), " ")
+                }
+            } else {
+                output.append(texts[docId]!!.substring(span.from, span.to), " ")
+            }
             real_token_index++
         }
         if (output.isNotEmpty()) {
@@ -890,7 +906,17 @@
                 }
                 sentence_index++
             }
-            output.append(texts[docId]!!.substring(span.from, span.to), " ")
+            if (useLemma && morpho[docId] != null) {
+                val key = "${span.from}-${span.to}"
+                val lemmaVal = morpho[docId]!![key]?.lemma
+                if (lemmaVal != null && lemmaVal != "_") {
+                    output.append(lemmaVal, " ")
+                } else {
+                    output.append(texts[docId]!!.substring(span.from, span.to), " ")
+                }
+            } else {
+                output.append(texts[docId]!!.substring(span.from, span.to), " ")
+            }
             real_token_index++
         }
         

diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index c9d41d6..55eaf25 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt

@@ -295,4 +295,23 @@
         assert(File(resultFile).exists())
         assert(File(resultFile).length() > 0)
      }
+
+    @Test
+    fun canWord2VecLemma() {
+        val args = arrayOf("--lemma", "-f", "w2v", loadResource("goe.tree_tagger.zip").path)
+        debug(args) // presumably runs the tool with these arguments — confirm against the helper
+        val out = outContent.toString() // presumably the output captured during the run — verify
+        // With --lemma the w2v output must contain the lemma sequence "mein Ankunft"; presumably the surface text has an inflected form at this position
+        assertContains(out, " mein Ankunft ")
+    }
+
+    @Test
+    fun canNowLemma() { // checks --lemma combined with the "now" output format
+        val args = arrayOf("--lemma", "-f", "now", loadResource("goe.tree_tagger.zip").path)
+        debug(args) // presumably runs the tool with these arguments — confirm against the helper
+        val out = outContent.toString() // presumably the output captured during the run — verify
+        assertContains(out, "@@") // presumably the NOW-format document marker — confirm
+        assertContains(out, " <p> ") // presumably the NOW-format break marker — confirm
+        assertContains(out, " mein Ankunft ") // lemma sequence; same expected string as in canWord2VecLemma
+    }
 }
commit	eb11eac2612e94596787004c8bbc588fb900d016	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Aug 28 20:21:54 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Aug 28 20:26:06 2025 +0200
tree	26ce719a444175f81089059d692fc68d4a044b3c
parent	b7de1e8879e9ef6d8d7582f3ea252d66cc8212a6 [diff]