Mark non-word tokens for index

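With --non-word-tokens, non-word tokens (punctuation, symbols, etc.)
are annotated with base/p:_ and the base/p=tokens layer is declared
in layerInfos. This is necessary for distance operator compatibility
with COSMAS II.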

Change-Id: Ie5b3ffdf9058011859142c917572523b266a0624
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index d858858..e131a9b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -252,6 +252,9 @@
         if (textData.sentences != null) {
             layerInfos.add("dereko/s=spans")
         }
+        if (includeNonWordTokens) {
+            layerInfos.add("base/p=tokens")
+        }
         externalSentenceFoundries.forEach { layerInfos.add("$it/s=spans") }
         externalConstitFoundries.forEach { layerInfos.add("$it/c=spans") }
 
@@ -635,6 +638,11 @@
                 tokenAnnotations.add(jsonString("i:${surfaceForm.lowercase().escapeKrillValue()}"))
             }
 
+            // Mark non-word tokens (punctuation, symbols, etc.) with a base-layer flag
+            if (!shouldKeepTokenForKrill(text, token)) {
+                tokenAnnotations.add(jsonString("base/p:_"))
+            }
+
             // Add inverse dependency annotations (<:) for dependents pointing to this token as head
             inverseDeps[index]?.sortedBy { "${it.foundry}/${it.deprel}" }?.forEach { inv ->
                 tokenAnnotations.add(jsonString("<:${inv.foundry}/d:${inv.deprel.escapeKrillValue()}\$<b>32<i>${inv.dependentIndex}"))
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index cbe9ef5..c859c64 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -313,6 +313,8 @@
         assertTrue(defaultJsons.isNotEmpty())
         assertTrue(defaultJsons.all { !it.contains("\"s:,\"") })
         assertTrue(defaultJsons.all { !it.contains("\"s:!\"") })
+        assertTrue(defaultJsons.all { !it.contains("\"base/p:_\"") }, "Default output should not have base/p:_ marker")
+        assertTrue(defaultJsons.all { !it.contains("base/p=tokens") }, "Default output should not declare base/p=tokens layer")
 
         val flagTar = ensureKrillTar("wud24_default_corenlp_nwt") { outputDir ->
             arrayOf("-t", "krill", "-q", "--non-word-tokens", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
@@ -323,6 +325,8 @@
         assertTrue(flagJsons.isNotEmpty())
         assertTrue(flagJsons.any { it.contains("\"s:,\"") })
         assertTrue(flagJsons.any { it.contains("\"s:!\"") })
+        assertTrue(flagJsons.any { it.contains("\"base/p:_\"") }, "NWT output should mark non-word tokens with base/p:_")
+        assertTrue(flagJsons.all { it.contains("base/p=tokens") }, "NWT output should declare base/p=tokens in layerInfos")
     }
 
     @Test