Fix fetchAnnotations getting all morph values Now using xml2 package to properly parse the query result resolves #30 Change-Id: I827cae94791f14ce0955352fd64fd7e5683a92e6

commit: cd4521884fd23558757338fcbbf6e67702f9a780 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Oct 09 13:28:41 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Oct 09 14:53:00 2025 +0200
tree: db32df9430c4d52b629dbe9368782d8174f8a9b0
parent: 336c85d5c1d3986e1846c2b7e5dee6a4c40192e9 [diff]
diff --git a/DESCRIPTION b/DESCRIPTION
index 24ee9ca..700c473 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION

@@ -43,7 +43,8 @@
     PTXQC,
     purrr,
     stringr,
-    urltools
+    urltools,
+    xml2
 Suggests:
     lifecycle,
     testthat,

diff --git a/NEWS.md b/NEWS.md
index bd4e50b..dbe2e16 100644
--- a/NEWS.md
+++ b/NEWS.md

@@ -1,6 +1,7 @@
 # RKorAPClient 1.2.0.9000
 
 - warnings are now issued, when queries had to be rewritten (for licensing reasons)
+- fixed `fetchAnnotations()` morphology so MarMoT and other foundries keep all features from nested spans ([#30](https://github.com/KorAP/RKorAPClient/issues/30))
 
 # RKorAPClient 1.2.0
 

diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index e43467d..1afb122 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R

@@ -785,6 +785,47 @@
     return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
   }
 
+  extract_morph_via_xml <- function(fragment) {
+    snippet <- paste0("<root>", fragment, "</root>")
+    doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
+    if (is.null(doc)) return(NULL)
+
+    nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
+    if (length(nodes) == 0) return(list(tokens = character(0), morph = character(0)))
+
+    tokens_xml <- character(0)
+    morph_vals <- character(0)
+
+    for (node in nodes) {
+      token_text <- trimws(xml2::xml_text(node))
+      if (identical(token_text, "")) next
+
+      tokens_xml <- c(tokens_xml, token_text)
+
+      ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
+      titles <- xml2::xml_attr(ancestors, "title")
+      titles <- titles[!is.na(titles)]
+
+      feature_tokens <- character(0)
+      if (length(titles) > 0) {
+        bits <- unlist(strsplit(titles, "[[:space:]]+"))
+        bits <- bits[grepl('/m:', bits)]
+        if (length(bits) > 0) {
+          feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
+          feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
+        }
+      }
+
+      if (length(feature_tokens) == 0) {
+        morph_vals <- c(morph_vals, NA_character_)
+      } else {
+        morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
+      }
+    }
+
+    list(tokens = tokens_xml, morph = morph_vals)
+  }
+
   # Extract content within <span class="match">...</span> using a more robust approach
   if (grepl('<span class="match">', xml_snippet)) {
     # Find the start of match span
@@ -837,7 +878,7 @@
 
         lemma <- NA
         pos_tag <- NA
-        morph_tag <- NA
+        morph_features <- character(0)
 
         if (title_matches[[1]][1] != -1) {
           all_titles <- regmatches(part, title_matches)[[1]]
@@ -852,7 +893,7 @@
               } else if (grepl('^[^/]+/p:', annotation)) {
                 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
               } else if (grepl('^[^/]+/m:', annotation)) {
-                morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
+                morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
               }
             }
           }
@@ -860,6 +901,11 @@
 
         lemmas <- c(lemmas, lemma)
         pos_tags <- c(pos_tags, pos_tag)
+        morph_tag <- if (length(morph_features) > 0) {
+          paste(unique(morph_features), collapse = "|")
+        } else {
+          NA
+        }
         morph_tags <- c(morph_tags, morph_tag)
       }
     }
@@ -885,7 +931,7 @@
           # Parse space-separated annotations in title
           lemma <- NA
           pos_tag <- NA
-          morph_tag <- NA
+          morph_features <- character(0)
 
           annotations <- unlist(strsplit(title, "\\s+"))
           for (annotation in annotations) {
@@ -894,18 +940,28 @@
             } else if (grepl('^[^/]+/p:', annotation)) {
               pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
             } else if (grepl('^[^/]+/m:', annotation)) {
-              morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
+              morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
             }
           }
 
           lemmas <- c(lemmas, lemma)
           pos_tags <- c(pos_tags, pos_tag)
+          morph_tag <- if (length(morph_features) > 0) {
+            paste(unique(morph_features), collapse = "|")
+          } else {
+            NA
+          }
           morph_tags <- c(morph_tags, morph_tag)
         }
       }
     }
   }
 
+  xml_morph <- extract_morph_via_xml(xml_snippet)
+  if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
+    morph_tags <- xml_morph$morph
+  }
+
   # Ensure all vectors have the same length
   max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
   if (max_length > 0) {
@@ -944,6 +1000,49 @@
     ))
   }
 
+  extract_morphological_features_via_xml <- function(section_content) {
+    snippet <- paste0("<root>", section_content, "</root>")
+    doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
+    if (is.null(doc)) return(NULL)
+
+    nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
+    if (length(nodes) == 0) {
+      return(list(tokens = character(0), morph = character(0)))
+    }
+
+    tokens_xml <- character(0)
+    morph_vals <- character(0)
+
+    for (node in nodes) {
+      token_text <- trimws(xml2::xml_text(node))
+      if (identical(token_text, "")) next
+
+      tokens_xml <- c(tokens_xml, token_text)
+
+      ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
+      titles <- xml2::xml_attr(ancestors, "title")
+      titles <- titles[!is.na(titles)]
+
+      feature_tokens <- character(0)
+      if (length(titles) > 0) {
+        bits <- unlist(strsplit(titles, "[[:space:]]+"))
+        bits <- bits[grepl('/m:', bits)]
+        if (length(bits) > 0) {
+          feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
+          feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
+        }
+      }
+
+      if (length(feature_tokens) == 0) {
+        morph_vals <- c(morph_vals, NA_character_)
+      } else {
+        morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
+      }
+    }
+
+    list(tokens = tokens_xml, morph = morph_vals)
+  }
+
   # Helper function to extract annotations from a span section
   extract_annotations_from_section <- function(section_content) {
     # Remove any <mark>...</mark> tags that may interrupt token boundaries
@@ -1007,19 +1106,24 @@
         }
 
         # If POS missing, keep NA; morphological features may also appear in outer titles
-        if (length(morph_features) == 0) {
-          ctx_start <- max(1, s - 500)
-          context <- substr(section_norm, ctx_start, s - 1)
-          tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
-          if (tmm[[1]][1] != -1) {
-            ctx_titles <- regmatches(context, tmm)[[1]]
-            for (ti in rev(ctx_titles)) {
-              cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
-              if (grepl('/m:', cont)) {
-                mparts <- unlist(strsplit(cont, "[[:space:]]+"))
-                for (mp in mparts) if (grepl('/m:', mp)) morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', mp, perl = TRUE))
-                break
+        ctx_start <- max(1, s - 500)
+        context <- substr(section_norm, ctx_start, s - 1)
+        tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
+        if (tmm[[1]][1] != -1) {
+          ctx_titles <- regmatches(context, tmm)[[1]]
+          collecting <- FALSE
+          for (ti in rev(ctx_titles)) {
+            cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
+            if (grepl('/m:', cont)) {
+              collecting <- TRUE
+              mparts <- unlist(strsplit(cont, "[[:space:]]+"))
+              features <- sub('.*?/m:(.*)$', '\\1', mparts[grepl('/m:', mparts)], perl = TRUE)
+              if (length(features) > 0) {
+                new_features <- features[!features %in% morph_features]
+                morph_features <- c(morph_features, new_features)
               }
+            } else if (collecting) {
+              break
             }
           }
         }
@@ -1027,10 +1131,16 @@
         tokens <- c(tokens, text_content)
         lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
         pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
-        morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA)
+        morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(unique(morph_features), collapse = "|") else NA)
       }
     }
 
+    # Optionally replace morphological tags using XML-based extraction if it aligns with tokens
+    xml_morph <- extract_morphological_features_via_xml(section_content)
+    if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
+      morph_tags <- xml_morph$morph
+    }
+
     # Ensure all vectors have the same length
     max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
     if (max_length > 0) {

diff --git a/tests/testthat/test-fetchAnnotations.R b/tests/testthat/test-fetchAnnotations.R
index 117f93a..20a73ca 100644
--- a/tests/testthat/test-fetchAnnotations.R
+++ b/tests/testthat/test-fetchAnnotations.R

@@ -338,6 +338,34 @@
   }
 })
 
+test_that("issue #30 regression keeps all morphological features", {
+  skip_if_offline()
+
+  matches <- KorAPConnection(verbose = FALSE, cache = FALSE) %>%
+    corpusQuery("aufgrund einer Ameisenplage", vc = "availability=/CC.*/", metadataOnly = FALSE) %>%
+    fetchNext(maxFetch = 5) %>%
+    fetchAnnotations("marmot", verbose = FALSE) %>%
+    slot("collectedMatches")
+
+  skip_if(is.null(matches) || nrow(matches) == 0, "No matches found for issue #30 query")
+
+  pos_tail <- sapply(matches[["pos"]][["match"]], tail, 1)
+  morph_tail <- sapply(matches[["morph"]][["match"]], tail, 1)
+
+  # Ensure we actually received morphological annotations for this query
+  skip_if(all(is.na(morph_tail)), "No morphological annotations returned for issue #30 query")
+
+  # POS should still be available for the same tokens (original sanity check from issue report)
+  expect_true(all(!is.na(pos_tail)))
+
+  # Regression assertion: MarMoT morph tail must keep all features, not just the last one
+  observed_morph <- morph_tail[!is.na(morph_tail)]
+  expect_true(all(grepl("\\|", observed_morph)))
+  expect_true(all(grepl("case:", observed_morph)))
+  expect_true(all(grepl("gender:", observed_morph)))
+  expect_true(all(grepl("number:", observed_morph)))
+})
+
 test_that("fetchAnnotations adds missing layer without overwriting existing, and can overwrite when requested", {
   # Define a separate dummy connection that serves different snippets by foundry
   if (!isClass("DummyKCO2")) setClass('DummyKCO2', slots = c(apiUrl='character', verbose='logical'))

diff --git a/tests/testthat/test-parse-annotations.R b/tests/testthat/test-parse-annotations.R
index 502f17a..4f65d57 100644
--- a/tests/testthat/test-parse-annotations.R
+++ b/tests/testthat/test-parse-annotations.R

@@ -31,7 +31,7 @@
   parsed <- RKorAPClient:::parse_xml_annotations(xml_snippet)
 
   expect_equal(parsed$token, c("Wir", "können", "alles", "außer", "Plan"))
-  expect_equal(parsed$pos,   c("PPER", "VVFIN", "PIS", "APPR", "NN"))
+  expect_equal(parsed$pos, c("PPER", "VVFIN", "PIS", "APPR", "NN"))
   expect_equal(parsed$lemma, c("Wir", "können", "alles", "außer", "Plan"))
 
   # morph not present in snippet; should be NA-aligned to tokens
@@ -50,9 +50,9 @@
   parsed <- RKorAPClient:::parse_xml_annotations(xml_snippet)
 
   expect_equal(parsed$token, c("Haus", "können", "gehen"))
-  expect_equal(parsed$pos,   c("NN",   "VVFIN", NA))
-  expect_equal(parsed$lemma, c(NA,      "können", "gehen"))
-  expect_equal(parsed$morph, c(NA,      "verbform:fin", NA))
+  expect_equal(parsed$pos, c("NN", "VVFIN", NA))
+  expect_equal(parsed$lemma, c(NA, "können", "gehen"))
+  expect_equal(parsed$morph, c(NA, "verbform:fin", NA))
 
   # Vectors must be equal length
   n <- length(parsed$token)
@@ -61,3 +61,42 @@
   expect_length(parsed$morph, n)
 })
 
+test_that("parsers retain all morphological features from nested spans", {
+  xml_snippet <- '<span class="context-left"></span>
+  <span class="match">
+    <mark>
+      <span title="marmot/m:number:sg">
+        <span title="marmot/m:case:* marmot/m:case:fem">
+          <span title="tt/l:Ameisenplage tt/p:NN">Ameisenplage</span>
+        </span>
+      </span>
+    </mark>
+  </span>
+  <span class="context-right"></span>'
+
+  basic <- RKorAPClient:::parse_xml_annotations(xml_snippet)
+  structured <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)
+
+  expect_equal(basic$token, "Ameisenplage")
+  expect_equal(structured$atokens$match, "Ameisenplage")
+
+  basic_feats <- unlist(strsplit(basic$morph, "\\|"))
+  structured_feats <- unlist(strsplit(structured$morph$match, "\\|"))
+
+  expect_setequal(basic_feats, c("case:*", "case:fem", "number:sg"))
+  expect_setequal(structured_feats, c("case:*", "case:fem", "number:sg"))
+})
+
+test_that("multiple lemma and POS values are preserved", {
+  xml_snippet <- '<span class="match">
+    <mark><span title="tt/l:gehen tt/l:geh tt/p:VVFIN tt/p:VVINF">gehen</span></mark>
+  </span>'
+
+  basic <- RKorAPClient:::parse_xml_annotations(xml_snippet)
+  structured <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)
+
+  expect_equal(basic$lemma, "gehen|geh")
+  expect_equal(basic$pos, "VVFIN|VVINF")
+  expect_equal(structured$lemma$match, "gehen|geh")
+  expect_equal(structured$pos$match, "VVFIN|VVINF")
+})
commit	cd4521884fd23558757338fcbbf6e67702f9a780	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Oct 09 13:28:41 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Oct 09 14:53:00 2025 +0200
tree	db32df9430c4d52b629dbe9368782d8174f8a9b0
parent	336c85d5c1d3986e1846c2b7e5dee6a4c40192e9 [diff]