Fix fetchAnnotations getting all morph values
Now using xml2 package to properly parse the query result
resolves #30
Change-Id: I827cae94791f14ce0955352fd64fd7e5683a92e6
diff --git a/DESCRIPTION b/DESCRIPTION
index 24ee9ca..700c473 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -43,7 +43,8 @@
PTXQC,
purrr,
stringr,
- urltools
+ urltools,
+ xml2
Suggests:
lifecycle,
testthat,
diff --git a/NEWS.md b/NEWS.md
index bd4e50b..dbe2e16 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,7 @@
# RKorAPClient 1.2.0.9000
- warnings are now issued, when queries had to be rewritten (for licensing reasons)
+- fixed `fetchAnnotations()` morphology so MarMoT and other foundries keep all features from nested spans ([#30](https://github.com/KorAP/RKorAPClient/issues/30))
# RKorAPClient 1.2.0
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index e43467d..1afb122 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -785,6 +785,47 @@
return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
}
+ extract_morph_via_xml <- function(fragment) {
+ snippet <- paste0("<root>", fragment, "</root>")
+ doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
+ if (is.null(doc)) return(NULL)
+
+ nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
+ if (length(nodes) == 0) return(list(tokens = character(0), morph = character(0)))
+
+ tokens_xml <- character(0)
+ morph_vals <- character(0)
+
+ for (node in nodes) {
+ token_text <- trimws(xml2::xml_text(node))
+ if (identical(token_text, "")) next
+
+ tokens_xml <- c(tokens_xml, token_text)
+
+ ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
+ titles <- xml2::xml_attr(ancestors, "title")
+ titles <- titles[!is.na(titles)]
+
+ feature_tokens <- character(0)
+ if (length(titles) > 0) {
+ bits <- unlist(strsplit(titles, "[[:space:]]+"))
+ bits <- bits[grepl('/m:', bits)]
+ if (length(bits) > 0) {
+ feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
+ feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
+ }
+ }
+
+ if (length(feature_tokens) == 0) {
+ morph_vals <- c(morph_vals, NA_character_)
+ } else {
+ morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
+ }
+ }
+
+ list(tokens = tokens_xml, morph = morph_vals)
+ }
+
# Extract content within <span class="match">...</span> using a more robust approach
if (grepl('<span class="match">', xml_snippet)) {
# Find the start of match span
@@ -837,7 +878,7 @@
lemma <- NA
pos_tag <- NA
- morph_tag <- NA
+ morph_features <- character(0)
if (title_matches[[1]][1] != -1) {
all_titles <- regmatches(part, title_matches)[[1]]
@@ -852,7 +893,7 @@
} else if (grepl('^[^/]+/p:', annotation)) {
pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
} else if (grepl('^[^/]+/m:', annotation)) {
- morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
+ morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
}
}
}
@@ -860,6 +901,11 @@
lemmas <- c(lemmas, lemma)
pos_tags <- c(pos_tags, pos_tag)
+ morph_tag <- if (length(morph_features) > 0) {
+ paste(unique(morph_features), collapse = "|")
+ } else {
+ NA
+ }
morph_tags <- c(morph_tags, morph_tag)
}
}
@@ -885,7 +931,7 @@
# Parse space-separated annotations in title
lemma <- NA
pos_tag <- NA
- morph_tag <- NA
+ morph_features <- character(0)
annotations <- unlist(strsplit(title, "\\s+"))
for (annotation in annotations) {
@@ -894,18 +940,28 @@
} else if (grepl('^[^/]+/p:', annotation)) {
pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
} else if (grepl('^[^/]+/m:', annotation)) {
- morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
+ morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
}
}
lemmas <- c(lemmas, lemma)
pos_tags <- c(pos_tags, pos_tag)
+ morph_tag <- if (length(morph_features) > 0) {
+ paste(unique(morph_features), collapse = "|")
+ } else {
+ NA
+ }
morph_tags <- c(morph_tags, morph_tag)
}
}
}
}
+ xml_morph <- extract_morph_via_xml(xml_snippet)
+ if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
+ morph_tags <- xml_morph$morph
+ }
+
# Ensure all vectors have the same length
max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
if (max_length > 0) {
@@ -944,6 +1000,49 @@
))
}
+ extract_morphological_features_via_xml <- function(section_content) {
+ snippet <- paste0("<root>", section_content, "</root>")
+ doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
+ if (is.null(doc)) return(NULL)
+
+ nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
+ if (length(nodes) == 0) {
+ return(list(tokens = character(0), morph = character(0)))
+ }
+
+ tokens_xml <- character(0)
+ morph_vals <- character(0)
+
+ for (node in nodes) {
+ token_text <- trimws(xml2::xml_text(node))
+ if (identical(token_text, "")) next
+
+ tokens_xml <- c(tokens_xml, token_text)
+
+ ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
+ titles <- xml2::xml_attr(ancestors, "title")
+ titles <- titles[!is.na(titles)]
+
+ feature_tokens <- character(0)
+ if (length(titles) > 0) {
+ bits <- unlist(strsplit(titles, "[[:space:]]+"))
+ bits <- bits[grepl('/m:', bits)]
+ if (length(bits) > 0) {
+ feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
+ feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
+ }
+ }
+
+ if (length(feature_tokens) == 0) {
+ morph_vals <- c(morph_vals, NA_character_)
+ } else {
+ morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
+ }
+ }
+
+ list(tokens = tokens_xml, morph = morph_vals)
+ }
+
# Helper function to extract annotations from a span section
extract_annotations_from_section <- function(section_content) {
# Remove any <mark>...</mark> tags that may interrupt token boundaries
@@ -1007,19 +1106,24 @@
}
# If POS missing, keep NA; morphological features may also appear in outer titles
- if (length(morph_features) == 0) {
- ctx_start <- max(1, s - 500)
- context <- substr(section_norm, ctx_start, s - 1)
- tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
- if (tmm[[1]][1] != -1) {
- ctx_titles <- regmatches(context, tmm)[[1]]
- for (ti in rev(ctx_titles)) {
- cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
- if (grepl('/m:', cont)) {
- mparts <- unlist(strsplit(cont, "[[:space:]]+"))
- for (mp in mparts) if (grepl('/m:', mp)) morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', mp, perl = TRUE))
- break
+ ctx_start <- max(1, s - 500)
+ context <- substr(section_norm, ctx_start, s - 1)
+ tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
+ if (tmm[[1]][1] != -1) {
+ ctx_titles <- regmatches(context, tmm)[[1]]
+ collecting <- FALSE
+ for (ti in rev(ctx_titles)) {
+ cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
+ if (grepl('/m:', cont)) {
+ collecting <- TRUE
+ mparts <- unlist(strsplit(cont, "[[:space:]]+"))
+ features <- sub('.*?/m:(.*)$', '\\1', mparts[grepl('/m:', mparts)], perl = TRUE)
+ if (length(features) > 0) {
+ new_features <- features[!features %in% morph_features]
+ morph_features <- c(morph_features, new_features)
}
+ } else if (collecting) {
+ break
}
}
}
@@ -1027,10 +1131,16 @@
tokens <- c(tokens, text_content)
lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
- morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA)
+ morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(unique(morph_features), collapse = "|") else NA)
}
}
+ # Optionally replace morphological tags using XML-based extraction if it aligns with tokens
+ xml_morph <- extract_morphological_features_via_xml(section_content)
+ if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
+ morph_tags <- xml_morph$morph
+ }
+
# Ensure all vectors have the same length
max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
if (max_length > 0) {
diff --git a/tests/testthat/test-fetchAnnotations.R b/tests/testthat/test-fetchAnnotations.R
index 117f93a..20a73ca 100644
--- a/tests/testthat/test-fetchAnnotations.R
+++ b/tests/testthat/test-fetchAnnotations.R
@@ -338,6 +338,34 @@
}
})
+test_that("issue #30 regression keeps all morphological features", {
+ skip_if_offline()
+
+ matches <- KorAPConnection(verbose = FALSE, cache = FALSE) %>%
+ corpusQuery("aufgrund einer Ameisenplage", vc = "availability=/CC.*/", metadataOnly = FALSE) %>%
+ fetchNext(maxFetch = 5) %>%
+ fetchAnnotations("marmot", verbose = FALSE) %>%
+ slot("collectedMatches")
+
+ skip_if(is.null(matches) || nrow(matches) == 0, "No matches found for issue #30 query")
+
+ pos_tail <- sapply(matches[["pos"]][["match"]], tail, 1)
+ morph_tail <- sapply(matches[["morph"]][["match"]], tail, 1)
+
+ # Ensure we actually received morphological annotations for this query
+ skip_if(all(is.na(morph_tail)), "No morphological annotations returned for issue #30 query")
+
+ # POS should still be available for the same tokens (original sanity check from issue report)
+ expect_true(all(!is.na(pos_tail)))
+
+ # Regression assertion: MarMoT morph tail must keep all features, not just the last one
+ observed_morph <- morph_tail[!is.na(morph_tail)]
+ expect_true(all(grepl("\\|", observed_morph)))
+ expect_true(all(grepl("case:", observed_morph)))
+ expect_true(all(grepl("gender:", observed_morph)))
+ expect_true(all(grepl("number:", observed_morph)))
+})
+
test_that("fetchAnnotations adds missing layer without overwriting existing, and can overwrite when requested", {
# Define a separate dummy connection that serves different snippets by foundry
if (!isClass("DummyKCO2")) setClass('DummyKCO2', slots = c(apiUrl='character', verbose='logical'))
diff --git a/tests/testthat/test-parse-annotations.R b/tests/testthat/test-parse-annotations.R
index 502f17a..4f65d57 100644
--- a/tests/testthat/test-parse-annotations.R
+++ b/tests/testthat/test-parse-annotations.R
@@ -31,7 +31,7 @@
parsed <- RKorAPClient:::parse_xml_annotations(xml_snippet)
expect_equal(parsed$token, c("Wir", "können", "alles", "außer", "Plan"))
- expect_equal(parsed$pos, c("PPER", "VVFIN", "PIS", "APPR", "NN"))
+ expect_equal(parsed$pos, c("PPER", "VVFIN", "PIS", "APPR", "NN"))
expect_equal(parsed$lemma, c("Wir", "können", "alles", "außer", "Plan"))
# morph not present in snippet; should be NA-aligned to tokens
@@ -50,9 +50,9 @@
parsed <- RKorAPClient:::parse_xml_annotations(xml_snippet)
expect_equal(parsed$token, c("Haus", "können", "gehen"))
- expect_equal(parsed$pos, c("NN", "VVFIN", NA))
- expect_equal(parsed$lemma, c(NA, "können", "gehen"))
- expect_equal(parsed$morph, c(NA, "verbform:fin", NA))
+ expect_equal(parsed$pos, c("NN", "VVFIN", NA))
+ expect_equal(parsed$lemma, c(NA, "können", "gehen"))
+ expect_equal(parsed$morph, c(NA, "verbform:fin", NA))
# Vectors must be equal length
n <- length(parsed$token)
@@ -61,3 +61,42 @@
expect_length(parsed$morph, n)
})
+test_that("parsers retain all morphological features from nested spans", {
+ xml_snippet <- '<span class="context-left"></span>
+ <span class="match">
+ <mark>
+ <span title="marmot/m:number:sg">
+ <span title="marmot/m:case:* marmot/m:case:fem">
+ <span title="tt/l:Ameisenplage tt/p:NN">Ameisenplage</span>
+ </span>
+ </span>
+ </mark>
+ </span>
+ <span class="context-right"></span>'
+
+ basic <- RKorAPClient:::parse_xml_annotations(xml_snippet)
+ structured <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)
+
+ expect_equal(basic$token, "Ameisenplage")
+ expect_equal(structured$atokens$match, "Ameisenplage")
+
+ basic_feats <- unlist(strsplit(basic$morph, "\\|"))
+ structured_feats <- unlist(strsplit(structured$morph$match, "\\|"))
+
+ expect_setequal(basic_feats, c("case:*", "case:fem", "number:sg"))
+ expect_setequal(structured_feats, c("case:*", "case:fem", "number:sg"))
+})
+
+test_that("multiple lemma and POS values are preserved", {
+ xml_snippet <- '<span class="match">
+ <mark><span title="tt/l:gehen tt/l:geh tt/p:VVFIN tt/p:VVINF">gehen</span></mark>
+ </span>'
+
+ basic <- RKorAPClient:::parse_xml_annotations(xml_snippet)
+ structured <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)
+
+ expect_equal(basic$lemma, "gehen|geh")
+ expect_equal(basic$pos, "VVFIN|VVINF")
+ expect_equal(structured$lemma$match, "gehen|geh")
+ expect_equal(structured$pos$match, "VVFIN|VVINF")
+})