Parse annotations into lists of vectors Change-Id: I9ee3a5c10ff997380b9661dd603f8ec83c6a91a8

commit: a29f3d4e72815d0e896ee9a65b3482fd9c6d08a7 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jul 18 10:14:43 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Mon Jul 21 11:39:30 2025 +0200
tree: 146be7e57fd831dc030b209f92277b70d7c8faea
parent: ff712a997522ed17f9a5d4d1dec94873a497ce07 [diff]
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index e00e19e..ef58c3d 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R

@@ -21,8 +21,7 @@
   "webUIRequestUrl",
   "apiResponse",
   "collectedMatches",
-  "hasMoreMatches",
-  "annotations"
+  "hasMoreMatches"
 ))
 
 #' Initialize KorAPQuery object
@@ -39,7 +38,6 @@
 #' @param apiResponse data-frame representation of the JSON response of the API request
 #' @param hasMoreMatches logical that signals if more query results can be fetched
 #' @param collectedMatches matches already fetched from the KorAP-API-server
-#' @param annotations list of annotation data for collected matches
 #'
 #' @importFrom tibble tibble
 #' @export
@@ -49,7 +47,7 @@
              "corpusSigle", "textSigle", "pubDate", "pubPlace",
              "availability", "textClass", "snippet", "tokens"
            ),
-           requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL, annotations = NULL) {
+           requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
     .Object <- callNextMethod()
     .Object@korapConnection <- korapConnection
     .Object@request <- request
@@ -62,7 +60,6 @@
     .Object@apiResponse <- apiResponse
     .Object@hasMoreMatches <- hasMoreMatches
     .Object@collectedMatches <- collectedMatches
-    .Object@annotations <- annotations
     .Object
   }
 )
@@ -741,76 +738,508 @@
   return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
 })
 
+#'
+#' Parse XML annotations into linguistic layers
+#'
+#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
+#' from XML annotation snippets returned by the KorAP API.
+#'
+#' @param xml_snippet XML string containing annotation data
+#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
+#' @keywords internal
+parse_xml_annotations <- function(xml_snippet) {
+  if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
+    return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
+  }
+
+  # Extract content within <span class="match">...</span> using a more robust approach
+  if (grepl('<span class="match">', xml_snippet)) {
+    # Find the start of match span
+    start_pos <- regexpr('<span class="match">', xml_snippet)
+    if (start_pos > 0) {
+      # Find the end by counting nested spans
+      content_start <- start_pos + attr(start_pos, "match.length")
+      remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
+
+      # Simple approach: extract everything until we hit context-right or end
+      if (grepl('<span class="context-right">', remaining)) {
+        content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining)
+      } else {
+        # Find the closing </span> that matches our opening span
+        # For now, use a simpler approach - take everything until the last </span> sequence
+        content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining)
+      }
+    } else {
+      content_to_parse <- xml_snippet
+    }
+  } else {
+    content_to_parse <- xml_snippet
+  }
+
+  # Initialize result vectors
+  tokens <- character(0)
+  lemmas <- character(0)
+  pos_tags <- character(0)
+  morph_tags <- character(0)
+
+  # Split the content by </span> and process each meaningful part
+  parts <- unlist(strsplit(content_to_parse, '</span>'))
+
+  for (part in parts) {
+    part <- trimws(part)
+    if (nchar(part) == 0) next
+
+    # Look for parts that have title attributes and end with text
+    if (grepl('<span[^>]*title=', part)) {
+      # Extract the text content (everything after the last >)
+      text_content <- gsub('.*>([^<]*)$', '\\1', part)
+      text_content <- trimws(text_content)
+
+      if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
+        tokens <- c(tokens, text_content)
+
+        # Extract all title attributes from this part
+        title_pattern <- 'title="([^"]*)"'
+        title_matches <- gregexpr(title_pattern, part)
+
+        lemma <- NA
+        pos_tag <- NA
+        morph_tag <- NA
+
+        if (title_matches[[1]][1] != -1) {
+          all_titles <- regmatches(part, title_matches)[[1]]
+          for (title_match in all_titles) {
+            title_content <- gsub(title_pattern, '\\1', title_match)
+            
+            # Split by spaces and process each annotation
+            annotations <- unlist(strsplit(title_content, "\\s+"))
+            for (annotation in annotations) {
+              if (grepl('^tt/l:', annotation)) {
+                lemma <- gsub('^tt/l:(.*)$', '\\1', annotation)
+              } else if (grepl('^tt/p:', annotation)) {
+                pos_tag <- gsub('^tt/p:(.*)$', '\\1', annotation)
+              } else if (grepl('^tt/m:', annotation)) {
+                morph_tag <- gsub('^tt/m:(.*)$', '\\1', annotation)
+              }
+            }
+          }
+        }
+
+        lemmas <- c(lemmas, lemma)
+        pos_tags <- c(pos_tags, pos_tag)
+        morph_tags <- c(morph_tags, morph_tag)
+      }
+    }
+  }
+
+  # If no tokens found with the splitting approach, try a different method
+  if (length(tokens) == 0) {
+    # Look for the innermost spans that contain actual text
+    innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
+    innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
+
+    if (innermost_matches[[1]][1] != -1) {
+      matches <- regmatches(content_to_parse, innermost_matches)[[1]]
+
+      for (match in matches) {
+        title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
+        text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
+        text <- trimws(text)
+
+        if (nchar(text) > 0) {
+          tokens <- c(tokens, text)
+
+          # Parse space-separated annotations in title
+          lemma <- NA
+          pos_tag <- NA
+          morph_tag <- NA
+          
+          annotations <- unlist(strsplit(title, "\\s+"))
+          for (annotation in annotations) {
+            if (grepl('^tt/l:', annotation)) {
+              lemma <- gsub('^tt/l:(.*)$', '\\1', annotation)
+            } else if (grepl('^tt/p:', annotation)) {
+              pos_tag <- gsub('^tt/p:(.*)$', '\\1', annotation)
+            } else if (grepl('^tt/m:', annotation)) {
+              morph_tag <- gsub('^tt/m:(.*)$', '\\1', annotation)
+            }
+          }
+          
+          lemmas <- c(lemmas, lemma)
+          pos_tags <- c(pos_tags, pos_tag)
+          morph_tags <- c(morph_tags, morph_tag)
+        }
+      }
+    }
+  }
+
+  # Ensure all vectors have the same length
+  max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
+  if (max_length > 0) {
+    tokens <- c(tokens, rep(NA, max_length - length(tokens)))
+    lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
+    pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
+    morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
+  }
+
+  return(list(
+    token = tokens,
+    lemma = lemmas,
+    pos = pos_tags,
+    morph = morph_tags
+  ))
+}
+
+#'
+#' Parse XML annotations into linguistic layers with left/match/right structure
+#'
+#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
+#' from XML annotation snippets returned by the KorAP API, split into left context,
+#' match, and right context sections like the tokens field.
+#'
+#' @param xml_snippet XML string containing annotation data
+#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
+#' @keywords internal
+parse_xml_annotations_structured <- function(xml_snippet) {
+  if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
+    empty_result <- list(left = character(0), match = character(0), right = character(0))
+    return(list(
+      atokens = empty_result,
+      lemma = empty_result,
+      pos = empty_result,
+      morph = empty_result
+    ))
+  }
+
+  # Helper function to extract annotations from a span section
+  extract_annotations_from_section <- function(section_content) {
+    tokens <- character(0)
+    lemmas <- character(0)
+    pos_tags <- character(0)
+    morph_tags <- character(0)
+
+    # Split the content by </span> and process each meaningful part
+    parts <- unlist(strsplit(section_content, '</span>'))
+
+    for (part in parts) {
+      part <- trimws(part)
+      if (nchar(part) == 0) next
+
+      # Look for parts that have title attributes and end with text
+      if (grepl('<span[^>]*title=', part)) {
+        # Extract the text content (everything after the last >)
+        text_content <- gsub('.*>([^<]*)$', '\\1', part)
+        text_content <- trimws(text_content)
+
+        if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
+          tokens <- c(tokens, text_content)
+
+          # Extract all title attributes from this part
+          title_pattern <- 'title="([^"]*)"'
+          title_matches <- gregexpr(title_pattern, part)
+
+          lemma <- NA
+          pos_tag <- NA
+          morph_tag <- NA
+
+          if (title_matches[[1]][1] != -1) {
+            all_titles <- regmatches(part, title_matches)[[1]]
+            for (title_match in all_titles) {
+              title_content <- gsub(title_pattern, '\\1', title_match)
+              
+              # Split by spaces and process each annotation
+              annotations <- unlist(strsplit(title_content, "\\s+"))
+              for (annotation in annotations) {
+                if (grepl('^tt/l:', annotation)) {
+                  lemma <- gsub('^tt/l:(.*)$', '\\1', annotation)
+                } else if (grepl('^tt/p:', annotation)) {
+                  pos_tag <- gsub('^tt/p:(.*)$', '\\1', annotation)
+                } else if (grepl('^tt/m:', annotation)) {
+                  morph_tag <- gsub('^tt/m:(.*)$', '\\1', annotation)
+                }
+              }
+            }
+          }
+
+          lemmas <- c(lemmas, lemma)
+          pos_tags <- c(pos_tags, pos_tag)
+          morph_tags <- c(morph_tags, morph_tag)
+        }
+      }
+    }
+
+    # If no tokens found with the splitting approach, try a different method
+    if (length(tokens) == 0) {
+      # Look for the innermost spans that contain actual text
+      innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
+      innermost_matches <- gregexpr(innermost_pattern, section_content, perl = TRUE)
+
+      if (innermost_matches[[1]][1] != -1) {
+        matches <- regmatches(section_content, innermost_matches)[[1]]
+
+        for (match in matches) {
+          title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
+          text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
+          text <- trimws(text)
+
+          if (nchar(text) > 0) {
+            tokens <- c(tokens, text)
+
+            # Parse space-separated annotations in title
+            lemma <- NA
+            pos_tag <- NA
+            morph_tag <- NA
+            
+            annotations <- unlist(strsplit(title, "\\s+"))
+            for (annotation in annotations) {
+              if (grepl('^tt/l:', annotation)) {
+                lemma <- gsub('^tt/l:(.*)$', '\\1', annotation)
+              } else if (grepl('^tt/p:', annotation)) {
+                pos_tag <- gsub('^tt/p:(.*)$', '\\1', annotation)
+              } else if (grepl('^tt/m:', annotation)) {
+                morph_tag <- gsub('^tt/m:(.*)$', '\\1', annotation)
+              }
+            }
+            
+            lemmas <- c(lemmas, lemma)
+            pos_tags <- c(pos_tags, pos_tag)
+            morph_tags <- c(morph_tags, morph_tag)
+          }
+        }
+      }
+    }
+
+    # Ensure all vectors have the same length
+    max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
+    if (max_length > 0) {
+      tokens <- c(tokens, rep(NA, max_length - length(tokens)))
+      lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
+      pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
+      morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
+    }
+
+    return(list(
+      tokens = tokens,
+      lemmas = lemmas,
+      pos_tags = pos_tags,
+      morph_tags = morph_tags
+    ))
+  }
+
+  # Split the XML into three parts: left context, match content, and right context
+  # The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
+  
+  # First extract the content within the match span using DOTALL modifier
+  match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">'
+  match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
+  
+  if (match_span_match == -1) {
+    # Try alternative pattern if no context-right
+    match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$'
+    match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
+  }
+  
+  if (match_span_match > 0) {
+    match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
+    
+    # Now find the <mark> and </mark> positions within this content
+    mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
+    mark_end <- regexpr('</mark>', match_span_content, perl = TRUE)
+    
+    if (mark_start > 0 && mark_end > 0) {
+      # Left context: everything before <mark>
+      left_content <- substr(match_span_content, 1, mark_start - 1)
+      
+      # Match content: everything between <mark> and </mark> (including the mark tags for now)
+      match_content <- substr(match_span_content, mark_start, mark_end + attr(mark_end, "match.length") - 1)
+      
+      # Right context: everything after </mark>
+      right_content_start <- mark_end + attr(mark_end, "match.length")
+      right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
+    } else {
+      # No mark tags found, treat entire match span as match content
+      left_content <- ""
+      match_content <- match_span_content
+      right_content <- ""
+    }
+  } else {
+    # No match span found, treat entire content as match
+    left_content <- ""
+    match_content <- xml_snippet
+    right_content <- ""
+  }
+
+  # Process each section
+  left_annotations <- extract_annotations_from_section(left_content)
+  match_annotations <- extract_annotations_from_section(match_content)
+  right_annotations <- extract_annotations_from_section(right_content)
+
+  return(list(
+    atokens = list(
+      left = left_annotations$tokens,
+      match = match_annotations$tokens,
+      right = right_annotations$tokens
+    ),
+    lemma = list(
+      left = left_annotations$lemmas,
+      match = match_annotations$lemmas,
+      right = right_annotations$lemmas
+    ),
+    pos = list(
+      left = left_annotations$pos_tags,
+      match = match_annotations$pos_tags,
+      right = right_annotations$pos_tags
+    ),
+    morph = list(
+      left = left_annotations$morph_tags,
+      match = match_annotations$morph_tags,
+      right = right_annotations$morph_tags
+    )
+  ))
+}
+
 #' Fetch annotations for all collected matches
 #'
 #' **`fetchAnnotations`** fetches annotations for all matches in the `@collectedMatches` slot
-#' of a KorAPQuery object and stores them in the `@annotations` slot. The method automatically
-#' uses the `matchID` from collected matches when available for safer and more reliable 
-#' annotation retrieval, falling back to constructing URLs from `matchStart` and `matchEnd`
-#' if necessary.
+#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches` 
+#' data frame. The method automatically uses the `matchID` from collected matches when 
+#' available for safer and more reliable annotation retrieval, falling back to constructing 
+#' URLs from `matchStart` and `matchEnd` if necessary.
+#'
+#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
+#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
+#' available for annotation parsing.
+#'
+#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
+#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
+#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
+#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
+#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
+#' - `annotation_snippet`: original XML snippet from the annotation API
 #'
 #' @family corpus search functions
 #' @aliases fetchAnnotations
 #'
-#' @param kqo object obtained from [corpusQuery()] with collected matches
+#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
 #' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
 #' @param verbose print progress information if true
-#' @return The updated `kqo` object with annotations in `@annotations` slot
+#' @return The updated `kqo` object with annotation columns added to `@collectedMatches`
 #'
 #' @examples
 #' \dontrun{
 #'
 #' # Fetch annotations for matches using Tree-Tagger foundry
+#' # Note: Authorization required for copyright-restricted corpora
 #' q <- KorAPConnection() |>
-#'   corpusQuery("Ameisenplage") |>
+#'   auth() |>
+#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
 #'   fetchNext(maxFetch = 10) |>
 #'   fetchAnnotations()
-#' q@annotations
 #'
-#' # Use a different foundry
+#' # Access linguistic annotations for match i:
+#' pos_tags <- q@collectedMatches$pos         # Data frame with left/match/right columns for POS tags
+#' lemmas <- q@collectedMatches$lemma         # Data frame with left/match/right columns for lemmas  
+#' morphology <- q@collectedMatches$morph     # Data frame with left/match/right columns for morphological tags
+#' atokens <- q@collectedMatches$atokens      # Data frame with left/match/right columns for annotation token text
+#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i
+#' 
+#' # Access specific components:
+#' match_pos <- q@collectedMatches$pos$match[[i]]     # POS tags for the matched tokens in match i
+#' left_lemmas <- q@collectedMatches$lemma$left[[i]]  # Lemmas for the left context in match i
+#' right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i
+#'
+#' # Use a different foundry (e.g., mate-parser)
 #' q <- KorAPConnection() |>
-#'   corpusQuery("Ameisenplage") |>
+#'   auth() |>
+#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
 #'   fetchNext(maxFetch = 10) |>
 #'   fetchAnnotations(foundry = "mate")
-#' q@annotations
+#' q@collectedMatches
 #' }
-#'
 #' @export
 setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) {
   if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
     warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
     return(kqo)
   }
-  
+
   df <- kqo@collectedMatches
   kco <- kqo@korapConnection
-  annotations_list <- list()
-  
+
   if (verbose) {
     cat("Fetching annotations for", nrow(df), "matches using foundry:", foundry, "\n")
   }
+
+  # Initialize annotation columns as data frames (like tokens field)
+  # Create the structure more explicitly to avoid assignment issues
+  nrows <- nrow(df)
   
+  df$pos <- data.frame(
+    left = I(replicate(nrows, character(0), simplify = FALSE)),
+    match = I(replicate(nrows, character(0), simplify = FALSE)),
+    right = I(replicate(nrows, character(0), simplify = FALSE)),
+    stringsAsFactors = FALSE
+  )
+  
+  df$lemma <- data.frame(
+    left = I(replicate(nrows, character(0), simplify = FALSE)),
+    match = I(replicate(nrows, character(0), simplify = FALSE)),
+    right = I(replicate(nrows, character(0), simplify = FALSE)),
+    stringsAsFactors = FALSE
+  )
+  
+  df$morph <- data.frame(
+    left = I(replicate(nrows, character(0), simplify = FALSE)),
+    match = I(replicate(nrows, character(0), simplify = FALSE)),
+    right = I(replicate(nrows, character(0), simplify = FALSE)),
+    stringsAsFactors = FALSE
+  )
+  
+  df$atokens <- data.frame(
+    left = I(replicate(nrows, character(0), simplify = FALSE)),
+    match = I(replicate(nrows, character(0), simplify = FALSE)),
+    right = I(replicate(nrows, character(0), simplify = FALSE)),
+    stringsAsFactors = FALSE
+  )
+  
+  df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)
+
   for (i in seq_len(nrow(df))) {
     if (verbose && i %% 10 == 0) {
       cat("Processing match", i, "of", nrow(df), "\n")
     }
-    
+
     # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
     if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
-      # matchID format: "match-match-A00/JUN/39609-p202-203"
-      # Extract document path and position: A00/JUN/39609-p202-203
-      # Then convert to URL format: A00/JUN/39609/p202-203
+      # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
+      # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
+      # Extract document path and position, handling both regular and encrypted formats
       
-      # First extract the document path with position (everything after the last "match-")
-      doc_path_with_pos <- gsub(".*match-([^-]+(?:/[^-]+)*-p\\d+-\\d+).*", "\\1", df$matchID[i])
-      # Then convert the dash before position to slash
-      match_path <- gsub("-p(\\d+-\\d+)", "/p\\1", doc_path_with_pos)
-      req <- paste0(kco@apiUrl, "corpus/", match_path, "?foundry=", foundry)
-      if (verbose) {
-        cat("Using matchID approach for match", i, ": matchID =", df$matchID[i], "\n")
-        cat("Extracted doc path:", doc_path_with_pos, "\n")
-        cat("Final match path:", match_path, "\n")
-        cat("Constructed URL:", req, "\n")
+      # More flexible regex to extract the document path with position and encryption
+      # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
+      # We need to capture the entire path including the encrypted suffix
+      match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
+      
+      if (match_result > 0) {
+        # Extract the complete path including encryption (everything after "match-")
+        doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
+        # Convert the dash before position to slash, but keep everything after the position
+        match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
+        req <- paste0(kco@apiUrl, "corpus/", match_path, "?foundry=", foundry)
+        
+        if (verbose) {
+          cat("Using matchID approach for match", i, ": matchID =", df$matchID[i], "\n")
+          cat("Extracted doc path with encryption:", doc_path_with_pos_and_encryption, "\n")
+          cat("Final match path:", match_path, "\n")
+          cat("Constructed URL:", req, "\n")
+        }
+      } else {
+        # If regex fails, fall back to the old method
+        if (verbose) {
+          cat("Failed to parse matchID format:", df$matchID[i], "\n")
+          cat("Falling back to textSigle + position method\n")
+        }
+        req <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", df$matchStart[i], "-", df$matchEnd[i], "?foundry=", foundry)
       }
     } else {
       # Fallback to the old method
@@ -820,31 +1249,197 @@
         cat("Constructed URL:", req, "\n")
       }
     }
-    
+
     tryCatch({
       res <- apiCall(kco, req)
+      if (verbose) {
+        cat("API call result for match", i, ":\n")
+        if (is.null(res)) {
+          cat("  Result is NULL\n")
+        } else {
+          cat("  Result class:", class(res), "\n")
+          if (is.list(res)) {
+            cat("  Result names:", paste(names(res), collapse = ", "), "\n")
+            if ("snippet" %in% names(res)) {
+              snippet_length <- if (is.character(res$snippet)) nchar(res$snippet) else "not character"
+              cat("  Snippet length:", snippet_length, "\n")
+              if (is.character(res$snippet) && nchar(res$snippet) > 0) {
+                cat("  Snippet preview:", substr(res$snippet, 1, 100), "...\n")
+              }
+            }
+          }
+        }
+      }
+      
       if (!is.null(res)) {
-        annotations_list[[i]] <- res
+        # Store the raw annotation snippet
+        df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
+
+        # Parse XML annotations if snippet is available
+        if (is.list(res) && "snippet" %in% names(res)) {
+          parsed_annotations <- parse_xml_annotations_structured(res$snippet)
+
+          # Store the parsed linguistic data in data frame format (like tokens)
+          # Use individual assignment to avoid data frame mismatch errors
+          tryCatch({
+            # Assign POS annotations
+            df$pos$left[i] <- list(parsed_annotations$pos$left)
+            df$pos$match[i] <- list(parsed_annotations$pos$match)
+            df$pos$right[i] <- list(parsed_annotations$pos$right)
+            
+            # Assign lemma annotations
+            df$lemma$left[i] <- list(parsed_annotations$lemma$left)
+            df$lemma$match[i] <- list(parsed_annotations$lemma$match)
+            df$lemma$right[i] <- list(parsed_annotations$lemma$right)
+            
+            # Assign morphology annotations
+            df$morph$left[i] <- list(parsed_annotations$morph$left)
+            df$morph$match[i] <- list(parsed_annotations$morph$match)
+            df$morph$right[i] <- list(parsed_annotations$morph$right)
+            
+            # Assign token annotations
+            df$atokens$left[i] <- list(parsed_annotations$atokens$left)
+            df$atokens$match[i] <- list(parsed_annotations$atokens$match)
+            df$atokens$right[i] <- list(parsed_annotations$atokens$right)
+          }, error = function(assign_error) {
+            if (verbose) {
+              cat("Error assigning annotations for match", i, ":", assign_error$message, "\n")
+              cat("Setting empty values instead\n")
+            }
+            # Set empty character vectors on assignment error using list assignment
+            df$pos$left[i] <<- list(character(0))
+            df$pos$match[i] <<- list(character(0))
+            df$pos$right[i] <<- list(character(0))
+            
+            df$lemma$left[i] <<- list(character(0))
+            df$lemma$match[i] <<- list(character(0))
+            df$lemma$right[i] <<- list(character(0))
+            
+            df$morph$left[i] <<- list(character(0))
+            df$morph$match[i] <<- list(character(0))
+            df$morph$right[i] <<- list(character(0))
+            
+            df$atokens$left[i] <<- list(character(0))
+            df$atokens$match[i] <<- list(character(0))
+            df$atokens$right[i] <<- list(character(0))
+          })
+
+          if (verbose && i <= 3) {  # Show details for first few matches
+            cat("Match", i, "parsed annotations:\n")
+            cat("  Left tokens:", length(parsed_annotations$atokens$left), "\n")
+            cat("  Match tokens:", length(parsed_annotations$atokens$match), "\n")  
+            cat("  Right tokens:", length(parsed_annotations$atokens$right), "\n")
+            if (length(parsed_annotations$pos$match) > 0 && any(!is.na(parsed_annotations$pos$match))) {
+              cat("  Match POS tags:", paste(parsed_annotations$pos$match, collapse = ", "), "\n")
+            }
+            if (length(parsed_annotations$lemma$match) > 0 && any(!is.na(parsed_annotations$lemma$match))) {
+              cat("  Match lemmas:", paste(parsed_annotations$lemma$match, collapse = ", "), "\n")
+            }
+            if (length(parsed_annotations$morph$match) > 0 && any(!is.na(parsed_annotations$morph$match))) {
+              cat("  Match morph tags:", paste(parsed_annotations$morph$match, collapse = ", "), "\n")
+            }
+          }
+        } else {
+          # No snippet available, store empty vectors
+          df$pos$left[i] <- list(character(0))
+          df$pos$match[i] <- list(character(0))
+          df$pos$right[i] <- list(character(0))
+          
+          df$lemma$left[i] <- list(character(0))
+          df$lemma$match[i] <- list(character(0))
+          df$lemma$right[i] <- list(character(0))
+          
+          df$morph$left[i] <- list(character(0))
+          df$morph$match[i] <- list(character(0))
+          df$morph$right[i] <- list(character(0))
+          
+          df$atokens$left[i] <- list(character(0))
+          df$atokens$match[i] <- list(character(0))
+          df$atokens$right[i] <- list(character(0))
+        }
       } else {
         if (verbose) {
           cat("Warning: No annotations returned for match", i, "\n")
         }
-        annotations_list[[i]] <- NULL
+        # Store NAs for failed requests
+        df$pos$left[i] <- list(NA)
+        df$pos$match[i] <- list(NA)
+        df$pos$right[i] <- list(NA)
+        
+        df$lemma$left[i] <- list(NA)
+        df$lemma$match[i] <- list(NA)
+        df$lemma$right[i] <- list(NA)
+        
+        df$morph$left[i] <- list(NA)
+        df$morph$match[i] <- list(NA)
+        df$morph$right[i] <- list(NA)
+        
+        df$atokens$left[i] <- list(NA)
+        df$atokens$match[i] <- list(NA)
+        df$atokens$right[i] <- list(NA)
+        df$annotation_snippet[[i]] <- NA
       }
     }, error = function(e) {
       if (verbose) {
         cat("Error fetching annotations for match", i, ":", e$message, "\n")
       }
-      annotations_list[[i]] <- NULL
+      # Store NAs for failed requests
+      df$pos$left[i] <- list(NA)
+      df$pos$match[i] <- list(NA)
+      df$pos$right[i] <- list(NA)
+      
+      df$lemma$left[i] <- list(NA)
+      df$lemma$match[i] <- list(NA)
+      df$lemma$right[i] <- list(NA)
+      
+      df$morph$left[i] <- list(NA)
+      df$morph$match[i] <- list(NA)
+      df$morph$right[i] <- list(NA)
+      
+      df$atokens$left[i] <- list(NA)
+      df$atokens$match[i] <- list(NA)
+      df$atokens$right[i] <- list(NA)
+      df$annotation_snippet[[i]] <- NA
     })
   }
-  
+
   if (verbose) {
-    successful_annotations <- sum(!sapply(annotations_list, is.null))
+    successful_annotations <- sum(!is.na(df$annotation_snippet))
     cat("Successfully fetched annotations for", successful_annotations, "of", nrow(df), "matches\n")
+    cat("Linguistic data stored as columns in collectedMatches\n")
+    cat("Data frame dimensions before assignment:", nrow(df), "x", ncol(df), "\n")
   }
-  
-  kqo@annotations <- annotations_list
+
+  # Validate data frame structure before assignment
+  if (nrow(df) != nrow(kqo@collectedMatches)) {
+    if (verbose) {
+      cat("Warning: Row count mismatch. Original:", nrow(kqo@collectedMatches), "Modified:", nrow(df), "\n")
+    }
+  }
+
+  # Update the collectedMatches with annotation data
+  tryCatch({
+    kqo@collectedMatches <- df
+  }, error = function(assign_error) {
+    if (verbose) {
+      cat("Error updating collectedMatches:", assign_error$message, "\n")
+      cat("Attempting to preserve original data and add annotations separately\n")
+    }
+    # Try a safer approach: add columns individually
+    tryCatch({
+      kqo@collectedMatches$pos <- df$pos
+      kqo@collectedMatches$lemma <- df$lemma  
+      kqo@collectedMatches$morph <- df$morph
+      kqo@collectedMatches$atokens <- df$atokens
+      kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
+    }, error = function(col_error) {
+      if (verbose) {
+        cat("Error adding annotation columns:", col_error$message, "\n")
+      }
+      warning("Failed to add annotation data to collectedMatches")
+    })
+  })
+
   return(kqo)
 })
 
@@ -963,9 +1558,14 @@
   }
   cat("   Total results: ", q@totalResults, "\n")
   cat(" Fetched results: ", q@nextStartIndex, "\n")
-  if (!is.null(q@annotations)) {
-    successful_annotations <- sum(!sapply(q@annotations, is.null))
-    cat("     Annotations: ", successful_annotations, " of ", length(q@annotations), " matches\n")
+  if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
+    successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
+    parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
+    cat("     Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
+    if (parsed_annotations > 0) {
+      cat(" (", parsed_annotations, " with parsed linguistic data)")
+    }
+    cat("\n")
   }
 }
 

diff --git a/man/fetchAnnotations-KorAPQuery-method.Rd b/man/fetchAnnotations-KorAPQuery-method.Rd
new file mode 100644
index 0000000..b2dc3bf
--- /dev/null
+++ b/man/fetchAnnotations-KorAPQuery-method.Rd

@@ -0,0 +1,79 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/KorAPQuery.R
+\name{fetchAnnotations,KorAPQuery-method}
+\alias{fetchAnnotations,KorAPQuery-method}
+\alias{fetchAnnotations}
+\title{Fetch annotations for all collected matches}
+\usage{
+\S4method{fetchAnnotations}{KorAPQuery}(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose)
+}
+\arguments{
+\item{kqo}{object obtained from \code{\link[=corpusQuery]{corpusQuery()}} with collected matches. Note: the original corpus query should have \code{metadataOnly = FALSE} for annotation parsing to work.}
+
+\item{foundry}{string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)}
+
+\item{verbose}{print progress information if true}
+}
+\value{
+The updated \code{kqo} object with annotation columns added to \verb{@collectedMatches}
+}
+\description{
+\strong{\code{fetchAnnotations}} fetches annotations for all matches in the \verb{@collectedMatches} slot
+of a KorAPQuery object and adds annotation columns directly to the \verb{@collectedMatches}
+data frame. The method automatically uses the \code{matchID} from collected matches when
+available for safer and more reliable annotation retrieval, falling back to constructing
+URLs from \code{matchStart} and \code{matchEnd} if necessary.
+}
+\details{
+\strong{Important}: For copyright-restricted corpora, users must be authorized via \code{\link[=auth]{auth()}}
+and the initial corpus query must have \code{metadataOnly = FALSE} to ensure snippets are
+available for annotation parsing.
+
+The method parses XML snippet annotations and adds linguistic columns to the data frame:
+\itemize{
+\item \code{pos}: data frame with \code{left}, \code{match}, \code{right} columns, each containing list vectors of part-of-speech tags
+\item \code{lemma}: data frame with \code{left}, \code{match}, \code{right} columns, each containing list vectors of lemmas
+\item \code{morph}: data frame with \code{left}, \code{match}, \code{right} columns, each containing list vectors of morphological tags
+\item \code{atokens}: data frame with \code{left}, \code{match}, \code{right} columns, each containing list vectors of token text (from annotations)
+\item \code{annotation_snippet}: original XML snippet from the annotation API
+}
+}
+\examples{
+\dontrun{
+
+# Fetch annotations for matches using Tree-Tagger foundry
+# Note: Authorization required for copyright-restricted corpora
+q <- KorAPConnection() |>
+  auth() |>
+  corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
+  fetchNext(maxFetch = 10) |>
+  fetchAnnotations()
+
+# Access linguistic annotations for match i:
+pos_tags <- q@collectedMatches$pos         # Data frame with left/match/right columns for POS tags
+lemmas <- q@collectedMatches$lemma         # Data frame with left/match/right columns for lemmas  
+morphology <- q@collectedMatches$morph     # Data frame with left/match/right columns for morphological tags
+atokens <- q@collectedMatches$atokens      # Data frame with left/match/right columns for annotation token text
+raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i
+
+# Access specific components:
+match_pos <- q@collectedMatches$pos$match[[i]]     # POS tags for the matched tokens in match i
+left_lemmas <- q@collectedMatches$lemma$left[[i]]  # Lemmas for the left context in match i
+right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i
+
+# Use a different foundry (e.g., mate-parser)
+q <- KorAPConnection() |>
+  auth() |>
+  corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
+  fetchNext(maxFetch = 10) |>
+  fetchAnnotations(foundry = "mate")
+q@collectedMatches
+}
+}
+\seealso{
+Other corpus search functions: 
+\code{\link{corpusQuery,KorAPConnection-method}},
+\code{\link{fetchAll,KorAPQuery-method}},
+\code{\link{fetchNext,KorAPQuery-method}}
+}
+\concept{corpus search functions}

diff --git a/man/initialize-KorAPQuery-method.Rd b/man/initialize-KorAPQuery-method.Rd
index 0815bea..beb44ad 100644
--- a/man/initialize-KorAPQuery-method.Rd
+++ b/man/initialize-KorAPQuery-method.Rd

@@ -17,8 +17,7 @@
   webUIRequestUrl = "",
   apiResponse = NULL,
   hasMoreMatches = FALSE,
-  collectedMatches = NULL,
-  annotations = NULL
+  collectedMatches = NULL
 )
 }
 \arguments{
@@ -45,8 +44,6 @@
 \item{hasMoreMatches}{logical that signals if more query results can be fetched}
 
 \item{collectedMatches}{matches already fetched from the KorAP-API-server}
-
-\item{annotations}{list of annotation data for collected matches}
 }
 \description{
 Initialize KorAPQuery object

diff --git a/man/parse_xml_annotations.Rd b/man/parse_xml_annotations.Rd
new file mode 100644
index 0000000..b819129
--- /dev/null
+++ b/man/parse_xml_annotations.Rd

@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/KorAPQuery.R
+\name{parse_xml_annotations}
+\alias{parse_xml_annotations}
+\title{Parse XML annotations into linguistic layers}
+\usage{
+parse_xml_annotations(xml_snippet)
+}
+\arguments{
+\item{xml_snippet}{XML string containing annotation data}
+}
+\value{
+Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
+}
+\description{
+Internal helper function to extract linguistic annotations (lemma, POS, morphology)
+from XML annotation snippets returned by the KorAP API.
+}
+\keyword{internal}

diff --git a/man/parse_xml_annotations_structured.Rd b/man/parse_xml_annotations_structured.Rd
new file mode 100644
index 0000000..07f7c8c
--- /dev/null
+++ b/man/parse_xml_annotations_structured.Rd

@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/KorAPQuery.R
+\name{parse_xml_annotations_structured}
+\alias{parse_xml_annotations_structured}
+\title{Parse XML annotations into linguistic layers with left/match/right structure}
+\usage{
+parse_xml_annotations_structured(xml_snippet)
+}
+\arguments{
+\item{xml_snippet}{XML string containing annotation data}
+}
+\value{
+Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
+}
+\description{
+Internal helper function to extract linguistic annotations (lemma, POS, morphology)
+from XML annotation snippets returned by the KorAP API, split into left context,
+match, and right context sections like the tokens field.
+}
+\keyword{internal}

diff --git a/tests/testthat/test-fetchAnnotations.R b/tests/testthat/test-fetchAnnotations.R
new file mode 100644
index 0000000..46f604f
--- /dev/null
+++ b/tests/testthat/test-fetchAnnotations.R

@@ -0,0 +1,179 @@
+test_that("fetchAnnotations works with valid matches", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 2)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  # Test that structured annotation columns are initially absent
+  expect_false("atokens" %in% colnames(q@collectedMatches))
+  expect_false("pos" %in% colnames(q@collectedMatches))
+
+  # Test that matchID is preserved in collectedMatches
+  expect_true("matchID" %in% colnames(q@collectedMatches))
+  expect_true(all(!is.na(q@collectedMatches$matchID)))
+
+  # Test fetchAnnotations with default foundry
+  q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
+
+  # Check that structured annotation columns are now populated
+  expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
+
+  # Check that the structured columns have left/match/right components
+  expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$atokens)))
+  expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$pos)))
+
+  # Test fetchAnnotations with specific foundry
+  q_with_tt <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
+  expect_true("atokens" %in% colnames(q_with_tt@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_tt@collectedMatches))
+
+  # Test that annotations contain actual content (regression test for URL construction)
+  if (nrow(q_with_tt@collectedMatches) > 0) {
+    # Check that the first match has populated annotation data
+    expect_true(length(q_with_tt@collectedMatches$atokens$left[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$atokens$match[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$atokens$right[[1]]) > 0)
+    expect_true(length(q_with_tt@collectedMatches$pos$left[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$pos$match[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$pos$right[[1]]) > 0)
+  }
+})
+
+test_that("fetchAnnotations handles empty matches gracefully", {
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+
+  # Create a query object with no collected matches
+  q <- KorAPQuery(
+    korapConnection = kco,
+    collectedMatches = NULL
+  )
+
+  # Should warn and return original object
+  expect_warning(
+    result <- fetchAnnotations(q, verbose = FALSE),
+    "No collected matches found"
+  )
+  expect_identical(result, q)
+})
+
+test_that("fetchAnnotations preserves original object structure", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 1)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  q_original <- q
+  q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
+
+  # Check that all original slots are preserved
+  expect_identical(q_with_annotations@korapConnection, q_original@korapConnection)
+  expect_identical(q_with_annotations@request, q_original@request)
+  expect_identical(q_with_annotations@vc, q_original@vc)
+  expect_identical(q_with_annotations@totalResults, q_original@totalResults)
+
+  # collectedMatches should have additional annotation columns
+  expect_true(nrow(q_with_annotations@collectedMatches) == nrow(q_original@collectedMatches))
+  expect_true(ncol(q_with_annotations@collectedMatches) > ncol(q_original@collectedMatches))
+
+  # Original columns should be preserved
+  original_cols <- colnames(q_original@collectedMatches)
+  expect_true(all(original_cols %in% colnames(q_with_annotations@collectedMatches)))
+
+  # New annotation columns should be present
+  expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
+})
+
+test_that("fetchAnnotations returns structured left/match/right format", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 1)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  q_with_annotations <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
+
+  # Test that structured annotation columns exist
+  expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
+
+  # Test the structure of annotation columns
+  atokens <- q_with_annotations@collectedMatches$atokens
+  pos <- q_with_annotations@collectedMatches$pos
+
+  expect_true(is.data.frame(atokens))
+  expect_true(is.data.frame(pos))
+
+  expect_true(all(c("left", "match", "right") %in% names(atokens)))
+  expect_true(all(c("left", "match", "right") %in% names(pos)))
+
+  # Test that each component is a list column
+  expect_true(is.list(atokens$left))
+  expect_true(is.list(atokens$match))
+  expect_true(is.list(atokens$right))
+  expect_true(is.list(pos$left))
+  expect_true(is.list(pos$match))
+  expect_true(is.list(pos$right))
+
+  # Test that the first match has actual data
+  if (nrow(q_with_annotations@collectedMatches) > 0) {
+    # At least one of left/match/right should have content
+    total_tokens <- length(atokens$left[[1]]) + length(atokens$match[[1]]) + length(atokens$right[[1]])
+    expect_true(total_tokens > 0)
+
+    total_pos <- length(pos$left[[1]]) + length(pos$match[[1]]) + length(pos$right[[1]])
+    expect_true(total_pos > 0)
+
+    # Token count should match POS count
+    expect_equal(total_tokens, total_pos)
+
+    # Match tokens should not be empty (since we found a match)
+    expect_true(length(atokens$match[[1]]) > 0)
+    expect_true(length(pos$match[[1]]) > 0)
+  }
+})
+
+test_that("matchID is preserved in collectedMatches", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 1)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  # Check that matchID is present and valid
+  expect_true("matchID" %in% colnames(q@collectedMatches))
+  expect_true(all(!is.na(q@collectedMatches$matchID)))
+
+  # Verify matchID format (should contain position information)
+  expect_true(all(grepl("-p\\d+-\\d+", q@collectedMatches$matchID)))
+
+  # Verify that matchStart and matchEnd are correctly extracted from matchID
+  for (i in seq_len(nrow(q@collectedMatches))) {
+    match_id <- q@collectedMatches$matchID[i]
+    positions <- gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", match_id)
+    expected_start <- as.integer(stringr::word(positions, 1))
+    expected_end <- as.integer(stringr::word(positions, 2)) - 1
+
+    expect_equal(q@collectedMatches$matchStart[i], expected_start)
+    expect_equal(q@collectedMatches$matchEnd[i], expected_end)
+  }
+})
commit	a29f3d4e72815d0e896ee9a65b3482fd9c6d08a7	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jul 18 10:14:43 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Jul 21 11:39:30 2025 +0200
tree	146be7e57fd831dc030b209f92277b70d7c8faea
parent	ff712a997522ed17f9a5d4d1dec94873a497ce07 [diff]