Fix annotation parsing in fetchAnnotations
Resolves #27
Change-Id: Id0fc0cc99835926f75e2679ce10c407763bf7dc3
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index d2de0ba..ad867c5 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -913,65 +913,88 @@
# Helper function to extract annotations from a span section
extract_annotations_from_section <- function(section_content) {
- # Handle both spaced tokens and nested single tokens
+ # Remove any <mark>...</mark> tags that may interrupt token boundaries
+ section_no_marks <- gsub('</?mark[^>]*>', '', section_content, perl = TRUE)
+ # Normalize separators between adjacent top-level spans so splitting is robust.
+ # Replace any punctuation/entity/space run between one-or-more closing spans and the next opening span
+ # with a single space, preserving all closing spans.
+ section_norm <- gsub('((?:</span>)+)[[:space:]]*(?:&[^;]+;|[[:punct:]]|[[:space:]])*[[:space:]]*(<span)', '\\1 \\2', section_no_marks, perl = TRUE)
+ # Handle both spaced tokens and nested single tokens by scanning innermost spans with direct text
tokens <- character(0)
lemmas <- character(0)
pos_tags <- character(0)
morph_tags <- character(0)
- # First try to split by spaces between span groups (for multiple tokens)
- # Look for spaces that separate token groups
- if (grepl('</span>\\s+<span', section_content)) {
- # Multiple tokens separated by spaces
- token_groups <- unlist(strsplit(section_content, '(?<=</span>)\\s+(?=<span)', perl = TRUE))
- } else {
- # Single token (or no spaces between tokens)
- token_groups <- c(section_content)
- }
+ pat_token <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
+ mm <- gregexpr(pat_token, section_norm, perl = TRUE)
+ if (mm[[1]][1] != -1) {
+ starts <- mm[[1]]
+ lens <- attr(mm[[1]], 'match.length')
+ for (k in seq_along(starts)) {
+ s <- starts[k]
+ e <- s + lens[k] - 1
+ fragment <- substr(section_norm, s, e)
+ text_content <- sub(pat_token, '\\2', fragment, perl = TRUE)
+ text_content <- trimws(text_content)
+ title_content <- sub(pat_token, '\\1', fragment, perl = TRUE)
- for (group in token_groups) {
- group <- trimws(group)
- if (nchar(group) == 0) next
+ if (nchar(text_content) == 0) next
- # Extract the actual text content (the innermost text)
- text_match <- regexpr('>([^<>]+)</span>', group, perl = TRUE)
- if (text_match > 0) {
- # Find all possible text contents and take the last one (innermost)
- all_texts <- regmatches(group, gregexpr('>([^<>]+)</span>', group, perl = TRUE))[[1]]
- if (length(all_texts) > 0) {
- # Take the last match (innermost text)
- text_content <- sub('.*>([^<>]+)</span>.*', '\\1', all_texts[length(all_texts)], perl = TRUE)
- text_content <- trimws(text_content)
+ lemma <- NA
+ pos_tag <- NA
+ morph_features <- character(0)
- if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
- tokens <- c(tokens, text_content)
-
- # Extract all title attributes from this group
- titles <- regmatches(group, gregexpr('title="([^"]*)"', group, perl = TRUE))[[1]]
-
- morph_features <- character(0)
- lemma <- NA
- pos_tag <- NA
-
- for (title in titles) {
- content <- sub('title="([^"]*)"', '\\1', title, perl = TRUE)
-
- if (grepl('^[^/]+/l:', content)) {
- lemma <- sub('^[^/]+/l:(.*)$', '\\1', content)
- } else if (grepl('^[^/]+/p:', content)) {
- pos_tag <- sub('^[^/]+/p:(.*)$', '\\1', content)
- } else if (grepl('^[^/]+/m:', content)) {
- morph_feature <- sub('^[^/]+/m:(.*)$', '\\1', content)
- morph_features <- c(morph_features, morph_feature)
- }
- }
-
- lemmas <- c(lemmas, lemma)
- pos_tags <- c(pos_tags, pos_tag)
- morph_tag <- if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA
- morph_tags <- c(morph_tags, morph_tag)
+ # parse inner title
+ ann <- unlist(strsplit(title_content, "[[:space:]]+"))
+ for (a in ann) {
+ if (grepl('/l:', a)) {
+ lemma <- sub('.*?/l:(.*)$', '\\1', a, perl = TRUE)
+ } else if (grepl('/p:', a)) {
+ pos_tag <- sub('.*?/p:(.*)$', '\\1', a, perl = TRUE)
+ } else if (grepl('/m:', a)) {
+ morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', a, perl = TRUE))
}
}
+
+ # If lemma missing, look back in nearby context for the nearest title containing l:
+ if (is.na(lemma) || nchar(lemma) == 0) {
+ ctx_start <- max(1, s - 500)
+ context <- substr(section_norm, ctx_start, s - 1)
+ tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
+ if (tmm[[1]][1] != -1) {
+ ctx_titles <- regmatches(context, tmm)[[1]]
+ for (ti in rev(ctx_titles)) {
+ cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
+ if (grepl('/l:', cont)) {
+ lemma <- sub('.*?/l:([^ ]+).*', '\\1', cont, perl = TRUE)
+ break
+ }
+ }
+ }
+ }
+
+ # If POS missing, keep NA; morphological features may also appear in outer titles
+ if (length(morph_features) == 0) {
+ ctx_start <- max(1, s - 500)
+ context <- substr(section_norm, ctx_start, s - 1)
+ tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
+ if (tmm[[1]][1] != -1) {
+ ctx_titles <- regmatches(context, tmm)[[1]]
+ for (ti in rev(ctx_titles)) {
+ cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
+ if (grepl('/m:', cont)) {
+ mparts <- unlist(strsplit(cont, "[[:space:]]+"))
+ for (mp in mparts) if (grepl('/m:', mp)) morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', mp, perl = TRUE))
+ break
+ }
+ }
+ }
+ }
+
+ tokens <- c(tokens, text_content)
+ lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
+ pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
+ morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA)
}
}
@@ -1010,17 +1033,22 @@
# Now find the <mark> and </mark> positions within this content
mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
- mark_end <- regexpr('</mark>', match_span_content, perl = TRUE)
+ # Use the LAST closing </mark> to cover multi-part matches
+ mark_end_gre <- gregexpr('</mark>', match_span_content, perl = TRUE)
+ mark_end_positions <- mark_end_gre[[1]]
+ mark_end <- if (!is.null(mark_end_positions) && length(mark_end_positions) > 0 && mark_end_positions[1] != -1)
+ mark_end_positions[length(mark_end_positions)] else -1
+ mark_end_len <- if (mark_end != -1) attr(mark_end_gre[[1]], "match.length")[length(mark_end_positions)] else 0
if (mark_start > 0 && mark_end > 0) {
- # Left context: everything before <mark>
+ # Left context: everything before first <mark>
left_content <- substr(match_span_content, 1, mark_start - 1)
- # Match content: everything between <mark> and </mark> (including the mark tags for now)
- match_content <- substr(match_span_content, mark_start, mark_end + attr(mark_end, "match.length") - 1)
+ # Match content: everything between first <mark> and last </mark>
+ match_content <- substr(match_span_content, mark_start, mark_end + mark_end_len - 1)
- # Right context: everything after </mark>
- right_content_start <- mark_end + attr(mark_end, "match.length")
+ # Right context: everything after last </mark>
+ right_content_start <- mark_end + mark_end_len
right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
} else {
# No mark tags found, treat entire match span as match content
diff --git a/tests/testthat/test-fetchAnnotations.R b/tests/testthat/test-fetchAnnotations.R
index edc415b..3ff7c2c 100644
--- a/tests/testthat/test-fetchAnnotations.R
+++ b/tests/testthat/test-fetchAnnotations.R
@@ -148,6 +148,117 @@
}
})
+test_that("parser covers full span across multiple <mark> blocks", {
+ # Local, offline test to ensure correct match extraction when multiple
+ # <mark>…</mark> segments occur within the match span.
+ xml_snippet <- '<span class="context-left"></span>
+ <span class="match">
+ <span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span>
+ <mark>
+ <span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span>
+ </mark>
+ <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span>
+ <mark>
+ <span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span>
+ <span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span>
+ </mark>
+ </span>
+ <span class="context-right"></span>'
+
+ parsed <- parse_xml_annotations_structured(xml_snippet)
+
+ # Left context contains the pre-mark token
+ expect_equal(parsed$atokens$left, c("Wir"))
+
+ # Match should include everything from the first <mark> to the last </mark>,
+ # including tokens between them
+ expect_equal(parsed$atokens$match, c("können", "alles", "außer", "Plan"))
+
+ # POS and lemma lengths align with tokens in each section
+ expect_length(parsed$pos$match, length(parsed$atokens$match))
+ expect_length(parsed$lemma$match, length(parsed$atokens$match))
+ expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR", "NN"))
+ expect_equal(parsed$lemma$match, c("können", "alles", "außer", "Plan"))
+
+ # Right context should be empty in this snippet
+ expect_length(parsed$atokens$right, 0)
+})
+
+test_that("parser keeps tokens separated across between spans", {
+ xml_snippet <- '<span class="context-left"></span>
+ <span class="match">
+ <mark><span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span></mark> <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span><mark><span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span></mark>
+ </span>
+ <span class="context-right"></span>'
+
+ parsed <- parse_xml_annotations_structured(xml_snippet)
+ expect_equal(parsed$atokens$match, c("können", "alles", "außer"))
+ expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR"))
+ expect_equal(parsed$lemma$match, c("können", "alles", "außer"))
+})
+
+test_that("parser keeps tokens separated across punctuation between spans", {
+ xml_snippet <- '<span class="context-left"></span>
+ <span class="match">
+ <mark><span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span></mark>, <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span><mark><span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span></mark>
+ </span>
+ <span class="context-right"></span>'
+
+ parsed <- parse_xml_annotations_structured(xml_snippet)
+ expect_equal(parsed$atokens$match, c("können", "alles", "außer"))
+ expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR"))
+ expect_equal(parsed$lemma$match, c("können", "alles", "außer"))
+})
+
+test_that("online: fetchAnnotations aligns pos/lemma with tokens for complex query", {
+ skip_if_offline()
+
+ kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+ q <- kco %>%
+ corpusQuery('[orth="[wW]ir"] können alles [orth="ausser" | orth="außer"] [tt/pos=NN]',
+ metadataOnly = FALSE,
+ fields = c("textSigle", "snippet", "tokens")) %>%
+ fetchNext(maxFetch = 20)
+
+ skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for online test")
+
+ q2 <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
+
+ # For each match, POS and lemma counts must equal token count in the match span
+ for (i in seq_len(nrow(q2@collectedMatches))) {
+ tt <- q2@collectedMatches$tokens$match[[i]]
+ pp <- q2@collectedMatches$pos$match[[i]]
+ ll <- q2@collectedMatches$lemma$match[[i]]
+ expect_equal(length(tt), length(pp))
+ expect_equal(length(tt), length(ll))
+ }
+})
+
+test_that("fetchAnnotations aligns tokens and annotations across multiple <mark> blocks (stubbed API)", {
+ # Define a minimal dummy KorAPConnection-like S4 class for offline testing
+ setClass('DummyKCO', slots = c(apiUrl='character', verbose='logical'))
+ setMethod('apiCall', 'DummyKCO', function(kco, url, json = TRUE, getHeaders = FALSE, cache = FALSE, timeout = 10) {
+ list(snippet = '<span class="context-left"></span> <span class="match"> <span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span> <mark> <span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span> </mark> <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span> <mark> <span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span> <span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span> </mark> </span> <span class="context-right"></span>')
+ })
+
+ # Build a minimal KorAPQuery with a dummy connection and a single match row
+ kco <- new('DummyKCO', apiUrl = 'http://dummy/', verbose = FALSE)
+ df <- data.frame(textSigle = 'A/B/C', matchStart = 1, matchEnd = 5, matchID = 'match-A/B/C-p1-5', stringsAsFactors = FALSE)
+ q <- KorAPQuery(korapConnection = kco, collectedMatches = df)
+
+ q2 <- fetchAnnotations(q, foundry = 'tt', verbose = FALSE)
+
+ # Expect full match span to be covered
+ expect_equal(q2@collectedMatches$atokens$left[[1]], c('Wir'))
+ expect_equal(q2@collectedMatches$atokens$match[[1]], c('können','alles','außer','Plan'))
+ expect_equal(q2@collectedMatches$pos$match[[1]], c('VVFIN','PIS','APPR','NN'))
+ expect_equal(q2@collectedMatches$lemma$match[[1]], c('können','alles','außer','Plan'))
+
+ # Alignment checks
+ expect_length(q2@collectedMatches$pos$match[[1]], length(q2@collectedMatches$atokens$match[[1]]))
+ expect_length(q2@collectedMatches$lemma$match[[1]], length(q2@collectedMatches$atokens$match[[1]]))
+})
+
test_that("matchID is preserved in collectedMatches", {
skip_if_offline()