Switch all annotation parsing to xml2
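
Replace the regex-based snippet parsing in R/KorAPQuery.R with xml2 DOM
traversal. A shared helper, collect_token_annotations(), walks the leaf
<span> elements of a snippet, reads the foundry/layer annotations from their
title attributes, and returns aligned token, lemma, POS and morphology
vectors. Both annotation parsers now delegate to this helper, and the
left/match/right sections are derived from <mark> ancestry instead of
string splitting.
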
Change-Id: I047b6ce1debb88ff20912c60743221fe33f70b7d
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 1afb122..fa78884 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -771,6 +771,79 @@
return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
})
+# Helper to collapse multiple annotation values while preserving order
+collapse_features <- function(values) {
+ if (length(values) == 0) {
+ return(NA_character_)
+ }
+ unique_values <- values[!duplicated(values)]
+ paste(unique_values, collapse = "|")
+}
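+# e.g. collapse_features(c("nom", "nom", "sg")) is expected to yield "nom|sg",
+# while collapse_features(character(0)) yields NA_character_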
+
+# Extract token-level annotations from a DOM node
+collect_token_annotations <- function(parent_node) {
+ if (inherits(parent_node, "xml_missing")) {
+ return(list(
+ node = list(),
+ token = character(0),
+ lemma = character(0),
+ pos = character(0),
+ morph = character(0)
+ ))
+ }
+
+ leaf_nodes <- xml2::xml_find_all(parent_node, ".//span[not(.//span)]")
+
+ if (length(leaf_nodes) == 0) {
+ return(list(
+ node = list(),
+ token = character(0),
+ lemma = character(0),
+ pos = character(0),
+ morph = character(0)
+ ))
+ }
+
+ tokens <- character(0)
+ lemmas <- character(0)
+ pos_tags <- character(0)
+ morph_tags <- character(0)
+ kept_nodes <- list()
+
+ for (idx in seq_along(leaf_nodes)) {
+ leaf <- leaf_nodes[[idx]]
+ token_text <- trimws(xml2::xml_text(leaf))
+ if (identical(token_text, "")) {
+ next
+ }
+
+ kept_nodes[[length(kept_nodes) + 1]] <- leaf
+ tokens <- c(tokens, token_text)
+
+ ancestors <- xml2::xml_find_all(leaf, "ancestor-or-self::span")
+ titles <- xml2::xml_attr(ancestors, "title")
+ titles <- titles[!is.na(titles)]
+
+ feature_pieces <- if (length(titles) > 0) unlist(strsplit(titles, "[[:space:]]+")) else character(0)
+
+ lemma_values <- sub('.*?/l:(.*)$', '\\1', feature_pieces[grepl('/l:', feature_pieces)], perl = TRUE)
+ pos_values <- sub('.*?/p:(.*)$', '\\1', feature_pieces[grepl('/p:', feature_pieces)], perl = TRUE)
+ morph_values <- sub('.*?/m:(.*)$', '\\1', feature_pieces[grepl('/m:', feature_pieces)], perl = TRUE)
+
+ lemmas <- c(lemmas, collapse_features(lemma_values))
+ pos_tags <- c(pos_tags, collapse_features(pos_values))
+ morph_tags <- c(morph_tags, collapse_features(morph_values))
+ }
+
+ list(
+ node = kept_nodes,
+ token = tokens,
+ lemma = lemmas,
+ pos = pos_tags,
+ morph = morph_tags
+ )
+}
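+# Example (annotation format "foundry/l:...", "foundry/p:...", "foundry/m:..."
+# as assumed above; the foundry prefix is illustrative): a leaf node such as
+#   <span title="tt/l:gehen tt/p:VVFIN">geht</span>
+# should yield token "geht", lemma "gehen", pos "VVFIN" and morph NA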
+
#'
#' Parse XML annotations into linguistic layers
#'
@@ -785,198 +858,27 @@
return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
}
- extract_morph_via_xml <- function(fragment) {
- snippet <- paste0("<root>", fragment, "</root>")
- doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
- if (is.null(doc)) return(NULL)
-
- nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
- if (length(nodes) == 0) return(list(tokens = character(0), morph = character(0)))
-
- tokens_xml <- character(0)
- morph_vals <- character(0)
-
- for (node in nodes) {
- token_text <- trimws(xml2::xml_text(node))
- if (identical(token_text, "")) next
-
- tokens_xml <- c(tokens_xml, token_text)
-
- ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
- titles <- xml2::xml_attr(ancestors, "title")
- titles <- titles[!is.na(titles)]
-
- feature_tokens <- character(0)
- if (length(titles) > 0) {
- bits <- unlist(strsplit(titles, "[[:space:]]+"))
- bits <- bits[grepl('/m:', bits)]
- if (length(bits) > 0) {
- feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
- feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
- }
- }
-
- if (length(feature_tokens) == 0) {
- morph_vals <- c(morph_vals, NA_character_)
- } else {
- morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
- }
- }
-
- list(tokens = tokens_xml, morph = morph_vals)
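+ # Parse the snippet leniently as HTML; fall back to empty layers if parsing fails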
+ doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
+ if (is.null(doc)) {
+ return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
}
- # Extract content within <span class="match">...</span> using a more robust approach
- if (grepl('<span class="match">', xml_snippet)) {
- # Find the start of match span
- start_pos <- regexpr('<span class="match">', xml_snippet)
- if (start_pos > 0) {
- # Find the end by counting nested spans
- content_start <- start_pos + attr(start_pos, "match.length")
- remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
-
- # Simple approach: extract everything until we hit context-right or end
- if (grepl('<span class="context-right">', remaining)) {
- content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining)
- } else {
- # Find the closing </span> that matches our opening span
- # For now, use a simpler approach - take everything until the last </span> sequence
- content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining)
- }
- } else {
- content_to_parse <- xml_snippet
- }
- } else {
- content_to_parse <- xml_snippet
- }
-
- # Initialize result vectors
- tokens <- character(0)
- lemmas <- character(0)
- pos_tags <- character(0)
- morph_tags <- character(0)
-
- # Split the content by </span> and process each meaningful part
- parts <- unlist(strsplit(content_to_parse, '</span>'))
-
- for (part in parts) {
- part <- trimws(part)
- if (nchar(part) == 0) next
-
- # Look for parts that have title attributes and end with text
- if (grepl('<span[^>]*title=', part)) {
- # Extract the text content (everything after the last >)
- text_content <- gsub('.*>([^<]*)$', '\\1', part)
- text_content <- trimws(text_content)
-
- if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
- tokens <- c(tokens, text_content)
-
- # Extract all title attributes from this part
- title_pattern <- 'title="([^"]*)"'
- title_matches <- gregexpr(title_pattern, part)
-
- lemma <- NA
- pos_tag <- NA
- morph_features <- character(0)
-
- if (title_matches[[1]][1] != -1) {
- all_titles <- regmatches(part, title_matches)[[1]]
- for (title_match in all_titles) {
- title_content <- gsub(title_pattern, '\\1', title_match)
-
- # Split by spaces and process each annotation
- annotations <- unlist(strsplit(title_content, "\\s+"))
- for (annotation in annotations) {
- if (grepl('^[^/]+/l:', annotation)) {
- lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
- } else if (grepl('^[^/]+/p:', annotation)) {
- pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
- } else if (grepl('^[^/]+/m:', annotation)) {
- morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
- }
- }
- }
- }
-
- lemmas <- c(lemmas, lemma)
- pos_tags <- c(pos_tags, pos_tag)
- morph_tag <- if (length(morph_features) > 0) {
- paste(unique(morph_features), collapse = "|")
- } else {
- NA
- }
- morph_tags <- c(morph_tags, morph_tag)
- }
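+ # Prefer the <span class="match"> wrapper; fall back to the first span, else give up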
+ match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
+ if (inherits(match_node, "xml_missing")) {
+ match_node <- xml2::xml_find_first(doc, ".//span")
+ if (inherits(match_node, "xml_missing")) {
+ return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
}
}
- # If no tokens found with the splitting approach, try a different method
- if (length(tokens) == 0) {
- # Look for the innermost spans that contain actual text
- innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
- innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
+ token_info <- collect_token_annotations(match_node)
- if (innermost_matches[[1]][1] != -1) {
- matches <- regmatches(content_to_parse, innermost_matches)[[1]]
-
- for (match in matches) {
- title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
- text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
- text <- trimws(text)
-
- if (nchar(text) > 0) {
- tokens <- c(tokens, text)
-
- # Parse space-separated annotations in title
- lemma <- NA
- pos_tag <- NA
- morph_features <- character(0)
-
- annotations <- unlist(strsplit(title, "\\s+"))
- for (annotation in annotations) {
- if (grepl('^[^/]+/l:', annotation)) {
- lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
- } else if (grepl('^[^/]+/p:', annotation)) {
- pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
- } else if (grepl('^[^/]+/m:', annotation)) {
- morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
- }
- }
-
- lemmas <- c(lemmas, lemma)
- pos_tags <- c(pos_tags, pos_tag)
- morph_tag <- if (length(morph_features) > 0) {
- paste(unique(morph_features), collapse = "|")
- } else {
- NA
- }
- morph_tags <- c(morph_tags, morph_tag)
- }
- }
- }
- }
-
- xml_morph <- extract_morph_via_xml(xml_snippet)
- if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
- morph_tags <- xml_morph$morph
- }
-
- # Ensure all vectors have the same length
- max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
- if (max_length > 0) {
- tokens <- c(tokens, rep(NA, max_length - length(tokens)))
- lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
- pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
- morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
- }
-
- return(list(
- token = tokens,
- lemma = lemmas,
- pos = pos_tags,
- morph = morph_tags
- ))
+ list(
+ token = token_info$token,
+ lemma = token_info$lemma,
+ pos = token_info$pos,
+ morph = token_info$morph
+ )
}
#'
@@ -1000,239 +902,103 @@
))
}
- extract_morphological_features_via_xml <- function(section_content) {
- snippet <- paste0("<root>", section_content, "</root>")
- doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
- if (is.null(doc)) return(NULL)
-
- nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
- if (length(nodes) == 0) {
- return(list(tokens = character(0), morph = character(0)))
- }
-
- tokens_xml <- character(0)
- morph_vals <- character(0)
-
- for (node in nodes) {
- token_text <- trimws(xml2::xml_text(node))
- if (identical(token_text, "")) next
-
- tokens_xml <- c(tokens_xml, token_text)
-
- ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
- titles <- xml2::xml_attr(ancestors, "title")
- titles <- titles[!is.na(titles)]
-
- feature_tokens <- character(0)
- if (length(titles) > 0) {
- bits <- unlist(strsplit(titles, "[[:space:]]+"))
- bits <- bits[grepl('/m:', bits)]
- if (length(bits) > 0) {
- feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
- feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
- }
- }
-
- if (length(feature_tokens) == 0) {
- morph_vals <- c(morph_vals, NA_character_)
- } else {
- morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
- }
- }
-
- list(tokens = tokens_xml, morph = morph_vals)
- }
-
- # Helper function to extract annotations from a span section
- extract_annotations_from_section <- function(section_content) {
- # Remove any <mark>...</mark> tags that may interrupt token boundaries
- section_no_marks <- gsub('</?mark[^>]*>', '', section_content, perl = TRUE)
- # Normalize separators between adjacent top-level spans so splitting is robust.
- # Replace any punctuation/entity/space run between one-or-more closing spans and the next opening span
- # with a single space, preserving all closing spans.
- section_norm <- gsub('((?:</span>)+)[[:space:]]*(?:&[^;]+;|[[:punct:]]|[[:space:]])*[[:space:]]*(<span)', '\\1 \\2', section_no_marks, perl = TRUE)
- # Handle both spaced tokens and nested single tokens by scanning innermost spans with direct text
- tokens <- character(0)
- lemmas <- character(0)
- pos_tags <- character(0)
- morph_tags <- character(0)
-
- pat_token <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
- mm <- gregexpr(pat_token, section_norm, perl = TRUE)
- if (mm[[1]][1] != -1) {
- starts <- mm[[1]]
- lens <- attr(mm[[1]], 'match.length')
- for (k in seq_along(starts)) {
- s <- starts[k]
- e <- s + lens[k] - 1
- fragment <- substr(section_norm, s, e)
- text_content <- sub(pat_token, '\\2', fragment, perl = TRUE)
- text_content <- trimws(text_content)
- title_content <- sub(pat_token, '\\1', fragment, perl = TRUE)
-
- if (nchar(text_content) == 0) next
-
- lemma <- NA
- pos_tag <- NA
- morph_features <- character(0)
-
- # parse inner title
- ann <- unlist(strsplit(title_content, "[[:space:]]+"))
- for (a in ann) {
- if (grepl('/l:', a)) {
- lemma <- sub('.*?/l:(.*)$', '\\1', a, perl = TRUE)
- } else if (grepl('/p:', a)) {
- pos_tag <- sub('.*?/p:(.*)$', '\\1', a, perl = TRUE)
- } else if (grepl('/m:', a)) {
- morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', a, perl = TRUE))
- }
- }
-
- # If lemma missing, look back in nearby context for the nearest title containing l:
- if (is.na(lemma) || nchar(lemma) == 0) {
- ctx_start <- max(1, s - 500)
- context <- substr(section_norm, ctx_start, s - 1)
- tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
- if (tmm[[1]][1] != -1) {
- ctx_titles <- regmatches(context, tmm)[[1]]
- for (ti in rev(ctx_titles)) {
- cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
- if (grepl('/l:', cont)) {
- lemma <- sub('.*?/l:([^ ]+).*', '\\1', cont, perl = TRUE)
- break
- }
- }
- }
- }
-
- # If POS missing, keep NA; morphological features may also appear in outer titles
- ctx_start <- max(1, s - 500)
- context <- substr(section_norm, ctx_start, s - 1)
- tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
- if (tmm[[1]][1] != -1) {
- ctx_titles <- regmatches(context, tmm)[[1]]
- collecting <- FALSE
- for (ti in rev(ctx_titles)) {
- cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
- if (grepl('/m:', cont)) {
- collecting <- TRUE
- mparts <- unlist(strsplit(cont, "[[:space:]]+"))
- features <- sub('.*?/m:(.*)$', '\\1', mparts[grepl('/m:', mparts)], perl = TRUE)
- if (length(features) > 0) {
- new_features <- features[!features %in% morph_features]
- morph_features <- c(morph_features, new_features)
- }
- } else if (collecting) {
- break
- }
- }
- }
-
- tokens <- c(tokens, text_content)
- lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
- pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
- morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(unique(morph_features), collapse = "|") else NA)
- }
- }
-
- # Optionally replace morphological tags using XML-based extraction if it aligns with tokens
- xml_morph <- extract_morphological_features_via_xml(section_content)
- if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
- morph_tags <- xml_morph$morph
- }
-
- # Ensure all vectors have the same length
- max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
- if (max_length > 0) {
- tokens <- c(tokens, rep(NA, max_length - length(tokens)))
- lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
- pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
- morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
- }
-
+ doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
+ if (is.null(doc)) {
+ empty_result <- list(left = character(0), match = character(0), right = character(0))
return(list(
- tokens = tokens,
- lemmas = lemmas,
- pos_tags = pos_tags,
- morph_tags = morph_tags
+ atokens = empty_result,
+ lemma = empty_result,
+ pos = empty_result,
+ morph = empty_result
))
}
- # Split the XML into three parts: left context, match content, and right context
- # The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
-
- # First extract the content within the match span using DOTALL modifier
- match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">'
- match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
-
- if (match_span_match == -1) {
- # Try alternative pattern if no context-right
- match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$'
- match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
+ match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
+ if (inherits(match_node, "xml_missing")) {
+ empty_result <- list(left = character(0), match = character(0), right = character(0))
+ return(list(
+ atokens = empty_result,
+ lemma = empty_result,
+ pos = empty_result,
+ morph = empty_result
+ ))
}
- if (match_span_match > 0) {
- match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
+ token_info <- collect_token_annotations(match_node)
+ tokens <- token_info$token
+ lemmas <- token_info$lemma
+ pos_tags <- token_info$pos
+ morph_tags <- token_info$morph
+ nodes <- token_info$node
- # Now find the <mark> and </mark> positions within this content
- mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
- # Use the LAST closing </mark> to cover multi-part matches
- mark_end_gre <- gregexpr('</mark>', match_span_content, perl = TRUE)
- mark_end_positions <- mark_end_gre[[1]]
- mark_end <- if (!is.null(mark_end_positions) && length(mark_end_positions) > 0 && mark_end_positions[1] != -1)
- mark_end_positions[length(mark_end_positions)] else -1
- mark_end_len <- if (mark_end != -1) attr(mark_end_gre[[1]], "match.length")[length(mark_end_positions)] else 0
+ if (length(tokens) == 0) {
+ empty_result <- list(left = character(0), match = character(0), right = character(0))
+ return(list(
+ atokens = empty_result,
+ lemma = empty_result,
+ pos = empty_result,
+ morph = empty_result
+ ))
+ }
- if (mark_start > 0 && mark_end > 0) {
- # Left context: everything before first <mark>
- left_content <- substr(match_span_content, 1, mark_start - 1)
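+ # Flag leaf tokens that sit inside a <mark> element, i.e. belong to the hit
+ # itself rather than to the surrounding context within the match span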
+ mark_flags <- vapply(nodes, function(n) {
+ !inherits(xml2::xml_find_first(n, "ancestor::mark"), "xml_missing")
+ }, logical(1))
- # Match content: everything between first <mark> and last </mark>
- match_content <- substr(match_span_content, mark_start, mark_end + mark_end_len - 1)
-
- # Right context: everything after last </mark>
- right_content_start <- mark_end + mark_end_len
- right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
- } else {
- # No mark tags found, treat entire match span as match content
- left_content <- ""
- match_content <- match_span_content
- right_content <- ""
- }
+ if (any(mark_flags)) {
+ marked_idx <- which(mark_flags)
+ first_idx <- marked_idx[1]
+ last_idx <- marked_idx[length(marked_idx)]
} else {
- # No match span found, treat entire content as match
- left_content <- ""
- match_content <- xml_snippet
- right_content <- ""
+ first_idx <- 1
+ last_idx <- length(tokens)
}
- # Process each section
- left_annotations <- extract_annotations_from_section(left_content)
- match_annotations <- extract_annotations_from_section(match_content)
- right_annotations <- extract_annotations_from_section(right_content)
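+ # Tokens before the first marked token count as left context, tokens after
+ # the last marked token as right context; without marks everything is "match"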
+ sections <- rep("match", length(tokens))
+ if (first_idx > 1) {
+ sections[seq_len(first_idx - 1)] <- "left"
+ }
+ if (last_idx < length(tokens)) {
+ sections[seq(from = last_idx + 1, to = length(tokens))] <- "right"
+ }
- return(list(
- atokens = list(
- left = left_annotations$tokens,
- match = match_annotations$tokens,
- right = right_annotations$tokens
- ),
- lemma = list(
- left = left_annotations$lemmas,
- match = match_annotations$lemmas,
- right = right_annotations$lemmas
- ),
- pos = list(
- left = left_annotations$pos_tags,
- match = match_annotations$pos_tags,
- right = right_annotations$pos_tags
- ),
- morph = list(
- left = left_annotations$morph_tags,
- match = match_annotations$morph_tags,
- right = right_annotations$morph_tags
- )
- ))
+ subset_by_section <- function(values, section) {
+ idx <- sections == section
+ if (!any(idx)) {
+ return(character(0))
+ }
+ values[idx]
+ }
+
+ atokens <- list(
+ left = subset_by_section(tokens, "left"),
+ match = subset_by_section(tokens, "match"),
+ right = subset_by_section(tokens, "right")
+ )
+
+ lemma <- list(
+ left = subset_by_section(lemmas, "left"),
+ match = subset_by_section(lemmas, "match"),
+ right = subset_by_section(lemmas, "right")
+ )
+
+ pos <- list(
+ left = subset_by_section(pos_tags, "left"),
+ match = subset_by_section(pos_tags, "match"),
+ right = subset_by_section(pos_tags, "right")
+ )
+
+ morph <- list(
+ left = subset_by_section(morph_tags, "left"),
+ match = subset_by_section(morph_tags, "match"),
+ right = subset_by_section(morph_tags, "right")
+ )
+
+ list(
+ atokens = atokens,
+ lemma = lemma,
+ pos = pos,
+ morph = morph
+ )
}
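+# Worked example (snippet structure assumed: a match span containing
+# <mark>-wrapped hit tokens; annotations are illustrative): for
+#   <span class="match"><span title="tt/l:der tt/p:ART">Der</span>
+#   <mark><span title="tt/l:Baum tt/p:NN">Baum</span></mark>
+#   <span title="tt/l:stehen tt/p:VVFIN">steht</span></span>
+# the parser is expected to return atokens$left == "Der",
+# atokens$match == "Baum", atokens$right == "steht", with lemma, pos and
+# morph vectors split the same way ("der"/"ART", "Baum"/"NN",
+# "stehen"/"VVFIN", morph all NA)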
#' Fetch annotations for all collected matches