| #' KorAPQuery class (internal) |
| #' |
| #' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead. |
| #' |
| #' @keywords internal |
| #' @include KorAPConnection.R |
| #' @include logging.R |
| #' @import httr2 |
| #' |
| #' @include RKorAPClient-package.R |
| #' |
| #' @export |
| KorAPQuery <- setClass("KorAPQuery", slots = c( |
| "korapConnection", |
| "request", |
| "vc", |
| "totalResults", |
| "nextStartIndex", |
| "fields", |
| "requestUrl", |
| "webUIRequestUrl", |
| "apiResponse", |
| "collectedMatches", |
| "hasMoreMatches" |
| )) |
| |
| #' Initialize KorAPQuery object |
| #' @keywords internal |
| #' @param .Object … |
| #' @param korapConnection KorAPConnection object |
| #' @param request query part of the request URL |
| #' @param vc definition of a virtual corpus |
| #' @param totalResults number of hits the query has yielded |
| #' @param nextStartIndex at what index to start the next fetch of query results |
| #' @param fields what data / metadata fields should be collected |
| #' @param requestUrl complete URL of the API request |
| #' @param webUIRequestUrl URL of a web frontend request corresponding to the API request |
| #' @param apiResponse data-frame representation of the JSON response of the API request |
| #' @param hasMoreMatches logical that signals if more query results can be fetched |
| #' @param collectedMatches matches already fetched from the KorAP-API-server |
| #' |
| #' @importFrom tibble tibble |
| #' @export |
| setMethod( |
| "initialize", "KorAPQuery", |
| function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c( |
| "corpusSigle", "textSigle", "pubDate", "pubPlace", |
| "availability", "textClass", "snippet", "tokens" |
| ), |
| requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) { |
| .Object <- callNextMethod() |
| .Object@korapConnection <- korapConnection |
| .Object@request <- request |
| .Object@vc <- vc |
| .Object@totalResults <- totalResults |
| .Object@nextStartIndex <- nextStartIndex |
| .Object@fields <- fields |
| .Object@requestUrl <- requestUrl |
| .Object@webUIRequestUrl <- webUIRequestUrl |
| .Object@apiResponse <- apiResponse |
| .Object@hasMoreMatches <- hasMoreMatches |
| .Object@collectedMatches <- collectedMatches |
| .Object |
| } |
| ) |
| |
| setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery")) |
| setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll")) |
| setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext")) |
| setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest")) |
| setGeneric("fetchAnnotations", function(kqo, ...) standardGeneric("fetchAnnotations")) |
| setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery")) |
| |
| maxResultsPerPage <- 50 |
| |
| ## quiets concerns of R CMD check re: the .'s that appear in pipelines |
| utils::globalVariables(c(".")) |
| |
| #' Search corpus for query terms |
| #' |
| #' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server |
| #' |
| #' @family corpus search functions |
| #' @aliases corpusQuery |
| #' |
| #' @importFrom urltools url_encode |
| #' @importFrom purrr pmap |
| #' @importFrom dplyr bind_rows group_by |
| #' @importFrom tidyr expand_grid |
| #' |
| #' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`) |
| #' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`. |
| #' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible. |
| #' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query. |
| #' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE. |
| #' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize |
| #' your RKorAPClient application as explained in the |
| #' [authorization section](https://github.com/KorAP/RKorAPClient#authorization) |
| #' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to |
| #' `FALSE`. |
| #' @param ql string to choose the query language (see the [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt Wiki for possible values). |
| #' @param fields character vector specifying which metadata fields to retrieve for each match. |
| #' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include: |
| #' \describe{ |
| #' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers} |
| #' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles} |
| #' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created} |
| #' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published} |
| #' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column} |
| #'   \item{**Administrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations} |
| #' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text} |
| #' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info} |
| #' } |
| #' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields. |
| #' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) is only retrieved if `metadataOnly` is set to `FALSE`. |
| #' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented). |
| #' @param verbose print some info |
| #' @param as.df return result as data frame instead of as S4 object? |
| #' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE` if `query` and `vc` have different lengths, and to `FALSE` otherwise. |
| #' @param context string that specifies the size of the left and the right context returned in `snippet` |
| #' (provided that `metadataOnly` is set to `FALSE` and that the necessary access rights are met; see the examples below). |
| #' The format of the context size specification (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET). |
| #' If the parameter is not set, the default context size specification of the KorAP server instance will be used. |
| #' Note that you cannot overrule the maximum context size set in the KorAP server instance, |
| #' as this is typically legally motivated. |
| #' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]). |
| #' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`. |
| #' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed. |
| #' |
| #' @examples |
| #' \dontrun{ |
| #' |
| #' # Fetch basic metadata for "Ameisenplage" |
| #' KorAPConnection() |> |
| #' corpusQuery("Ameisenplage") |> |
| #' fetchAll() |
| #' |
| #' # Fetch specific metadata fields for bibliographic analysis |
| #' query <- KorAPConnection() |> |
| #' corpusQuery("Ameisenplage", |
| #' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType")) |
| #' results <- fetchAll(query) |
| #' results@collectedMatches |
| #' } |
| #' |
| #' \dontrun{ |
| #' |
| #' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus |
| #' # and show the number of query hits (but don't fetch them). |
| #' |
| #' KorAPConnection(verbose = TRUE) |> |
| #' corpusQuery( |
| #' KorAPUrl = |
| #' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp" |
| #' ) |
| #' } |
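| #' |
| #' \dontrun{ |
| #' |
| #' # Sketch: narrow the KWIC context to 3 tokens on each side. |
| #' # This assumes an authorized connection, since KWIC retrieval |
| #' # (metadataOnly = FALSE) requires sufficient access rights. |
| #' KorAPConnection() |> |
| #'   auth() |> |
| #'   corpusQuery("Ameisenplage", metadataOnly = FALSE, context = "3-token,3-token") |> |
| #'   fetchNext(maxFetch = 10) |
| #' } |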
| #' |
| #' \dontrun{ |
| #' |
| #' # Plot the time/frequency curve of "Ameisenplage" |
| #' kco <- KorAPConnection(verbose = TRUE) |
| #' kco |> |
| #' corpusQuery("Ameisenplage") |> |
| #' fetchAll() |> |
| #' slot("collectedMatches") |> |
| #' mutate(year = lubridate::year(pubDate)) |> |
| #' dplyr::select(year) |> |
| #' group_by(year) |> |
| #' summarise(Count = dplyr::n()) |> |
| #' mutate(Freq = mapply(function(f, y) { |
| #' f / corpusStats(kco, paste("pubDate in", y))@tokens |
| #' }, Count, year)) |> |
| #' dplyr::select(-Count) |> |
| #'   tidyr::complete(year = min(year):max(year), fill = list(Freq = 0)) |> |
| #' plot(type = "l") |
| #' } |
| #' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()] |
| #' |
| #' @references |
| #' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026> |
| #' |
| #' @export |
| setMethod( |
| "corpusQuery", "KorAPConnection", |
| function(kco, |
| query = if (missing(KorAPUrl)) { |
| stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE) |
| } else { |
| httr2::url_parse(KorAPUrl)$query$q |
| }, |
| vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq, |
| KorAPUrl, |
| metadataOnly = TRUE, |
| ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql, |
| fields = c( |
| "corpusSigle", |
| "textSigle", |
| "pubDate", |
| "pubPlace", |
| "availability", |
| "textClass", |
| "snippet", |
| "tokens" |
| ), |
| accessRewriteFatal = TRUE, |
| verbose = kco@verbose, |
| expand = length(vc) != length(query), |
| as.df = FALSE, |
| context = NULL) { |
| if (length(query) > 1 || length(vc) > 1) { |
| grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc) |
| |
| # Initialize timing variables for ETA calculation |
| total_queries <- nrow(grid) |
| current_query <- 0 |
| start_time <- Sys.time() |
| |
| results <- purrr::pmap(grid, function(query, vc, ...) { |
| current_query <<- current_query + 1 |
| |
| # Execute the single query directly (avoiding recursive call) |
| contentFields <- c("snippet", "tokens") |
| query_fields <- fields |
| if (metadataOnly) { |
| query_fields <- query_fields[!query_fields %in% contentFields] |
| } |
| if (!"textSigle" %in% query_fields) { |
| query_fields <- c(query_fields, "textSigle") |
| } |
| request <- |
| paste0( |
| "?q=", |
| url_encode(enc2utf8(query)), |
| ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""), |
| ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""), |
| ifelse(!metadataOnly, "&show-tokens=true", ""), |
| "&ql=", ql |
| ) |
| webUIRequestUrl <- paste0(kco@KorAPUrl, request) |
| requestUrl <- paste0( |
| kco@apiUrl, |
| "search", |
| request, |
| "&fields=", |
| paste(query_fields, collapse = ","), |
| if (metadataOnly) "&access-rewrite-disabled=true" else "" |
| ) |
| |
| # Show individual query progress |
| log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "") |
| res <- apiCall(kco, paste0(requestUrl, "&count=0")) |
| if (is.null(res)) { |
| log_info(verbose, ": API call failed\n") |
| totalResults <- 0 |
| } else { |
| totalResults <- as.integer(res$meta$totalResults) |
| log_info(verbose, ": ", totalResults, " hits") |
| if (!is.null(res$meta$cached)) { |
| log_info(verbose, " [cached]") |
| } else if (!is.null(res$meta$benchmark)) { |
| if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) { |
| time_value <- as.numeric(sub("s$", "", res$meta$benchmark)) |
| formatted_time <- paste0(round(time_value, 2), "s") |
| log_info(verbose, ", took ", formatted_time) |
| } else { |
| log_info(verbose, ", took ", res$meta$benchmark) |
| } |
| } |
| |
| # Calculate and display ETA information on the same line if verbose and we have more than one query |
| if (verbose && total_queries > 1) { |
| eta_info <- calculate_eta(current_query, total_queries, start_time) |
| if (eta_info != "") { |
| elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs")) |
| avg_time_per_query <- elapsed_time / current_query |
| |
| # Add ETA info to the same line - remove the leading ". " for cleaner formatting |
| clean_eta_info <- sub("^\\. ", ". ", eta_info) |
| log_info(verbose, clean_eta_info) |
| } |
| } |
| |
| log_info(verbose, "\n") |
| } |
| |
| result <- data.frame( |
| query = query, |
| totalResults = totalResults, |
| vc = vc, |
| webUIRequestUrl = webUIRequestUrl, |
| stringsAsFactors = FALSE |
| ) |
| |
| return(result) |
| }) |
| |
| results %>% bind_rows() |
| } else { |
| contentFields <- c("snippet", "tokens") |
| if (metadataOnly) { |
| fields <- fields[!fields %in% contentFields] |
| } |
| if (!"textSigle" %in% fields) { |
| fields <- c(fields, "textSigle") |
| } |
| request <- |
| paste0( |
| "?q=", |
| url_encode(enc2utf8(query)), |
| ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""), |
| ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""), |
| ifelse(!metadataOnly, "&show-tokens=true", ""), |
| "&ql=", ql |
| ) |
| webUIRequestUrl <- paste0(kco@KorAPUrl, request) |
| requestUrl <- paste0( |
| kco@apiUrl, |
| "search", |
| request, |
| "&fields=", |
| paste(fields, collapse = ","), |
| if (metadataOnly) "&access-rewrite-disabled=true" else "" |
| ) |
| log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "") |
| res <- apiCall(kco, paste0(requestUrl, "&count=0")) |
| if (is.null(res)) { |
| message("API call failed.") |
| totalResults <- 0 |
| } else { |
| totalResults <- as.integer(res$meta$totalResults) |
| log_info(verbose, ": ", totalResults, " hits") |
| if (!is.null(res$meta$cached)) { |
| log_info(verbose, " [cached]\n") |
| } else if (!is.null(res$meta$benchmark)) { |
| # Round the benchmark time to 2 decimal places for better readability |
| # If it's a string ending with 's', extract the number, round it, and re-add 's' |
| if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) { |
| time_value <- as.numeric(sub("s$", "", res$meta$benchmark)) |
| formatted_time <- paste0(round(time_value, 2), "s") |
| log_info(verbose, ", took ", formatted_time, "\n", sep = "") |
| } else { |
| # Fallback if the format is different than expected |
| log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "") |
| } |
| } else { |
| log_info(verbose, "\n") |
| } |
| } |
| if (as.df) { |
| data.frame( |
| query = query, |
| totalResults = totalResults, |
| vc = vc, |
| webUIRequestUrl = webUIRequestUrl, |
| stringsAsFactors = FALSE |
| ) |
| } else { |
| KorAPQuery( |
| korapConnection = kco, |
| nextStartIndex = 0, |
| fields = fields, |
| requestUrl = requestUrl, |
| request = request, |
| totalResults = totalResults, |
| vc = vc, |
| apiResponse = res, |
| webUIRequestUrl = webUIRequestUrl, |
| hasMoreMatches = (totalResults > 0) |
| ) |
| } |
| } |
| } |
| ) |
| |
| #' Repair inconsistent metadata value structures |
| #' |
| #' Internal helper that collapses multi-valued list entries into |
| #' space-separated strings and replaces `NA`s with empty strings. |
| #' |
| #' @keywords internal |
| #' @importFrom purrr map |
| repair_data_structure <- function(x) { |
| if (is.list(x)) { |
| as.character(purrr::map(x, ~ if (length(.x) > 1) { |
| paste(.x, collapse = " ") |
| } else { |
| .x |
| })) |
| } else { |
| ifelse(is.na(x), "", x) |
| } |
| } |
| |
| #' Fetch the next bunch of results of a KorAP query. |
| #' |
| #' **`fetchNext`** fetches the next bunch of results of a KorAP query. |
| #' |
| #' @family corpus search functions |
| #' |
| #' @param kqo object obtained from [corpusQuery()] |
| #' @param offset start offset for query results to fetch |
| #' @param maxFetch maximum number of query results to fetch (defaults to one page of at most 50 results) |
| #' @param verbose print progress information if true |
| #' @param randomizePageOrder fetch result pages in pseudo-random order if true. Use [set.seed()] to set the seed for reproducible results (see the examples below). |
| #' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches` |
| #' |
| #' @examples |
| #' \dontrun{ |
| #' |
| #' q <- KorAPConnection() |> |
| #' corpusQuery("Ameisenplage") |> |
| #' fetchNext() |
| #' q@collectedMatches |
| #' } |
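| #' |
| #' \dontrun{ |
| #' |
| #' # Sketch: fetch up to 100 results in pseudo-random page order, |
| #' # using set.seed to make the page sampling reproducible |
| #' set.seed(42) |
| #' q <- KorAPConnection() |> |
| #'   corpusQuery("Ameisenplage") |> |
| #'   fetchNext(maxFetch = 100, randomizePageOrder = TRUE) |
| #' q@collectedMatches |
| #' } |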
| #' |
| #' @references |
| #' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026> |
| #' |
| #' @aliases fetchNext |
| #' @importFrom dplyr rowwise mutate bind_rows select summarise n select |
| #' @importFrom tibble enframe add_column |
| #' @importFrom stringr word |
| #' @importFrom tidyr unnest unchop pivot_wider |
| #' @importFrom purrr map |
| #' @export |
| setMethod("fetchNext", "KorAPQuery", function(kqo, |
| offset = kqo@nextStartIndex, |
| maxFetch = maxResultsPerPage, |
| verbose = kqo@korapConnection@verbose, |
| randomizePageOrder = FALSE) { |
| # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check |
| results <- key <- name <- tmp_positions <- 0 |
| |
| if (kqo@totalResults == 0 || offset >= kqo@totalResults) { |
| return(kqo) |
| } |
| use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA) |
| collectedMatches <- kqo@collectedMatches |
| |
| # Track start time for ETA calculation |
| start_time <- Sys.time() |
| |
| # For randomized page order, generate a list of randomized page indices |
| if (randomizePageOrder) { |
| # Calculate how many pages we need to fetch based on maxFetch |
| total_pages_to_fetch <- if (!is.na(maxFetch)) { |
| # Either limited by maxFetch or total results, whichever is smaller |
| min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage)) |
| } else { |
| # All pages |
| ceiling(kqo@totalResults / maxResultsPerPage) |
| } |
| |
| # Generate randomized page indices (0-based for API) |
| pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1 |
| page_index <- 1 # Index to track which page in the randomized list we're on |
| } |
| |
| if (is.null(collectedMatches)) { |
| collectedMatches <- data.frame() |
| } |
| |
| # Initialize the page counter properly based on nextStartIndex and any previously fetched results |
| # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1 |
| # For first call, this will be 1, for subsequent calls, it will reflect our actual position |
| current_page_number <- ceiling(offset / maxResultsPerPage) + 1 |
| |
| # For sequential fetches, keep track of which global page we're on |
| # This is important for correctly showing page numbers in subsequent fetchNext calls |
| page_count_start <- current_page_number |
| |
| repeat { |
| # Determine which page to fetch next |
| if (randomizePageOrder) { |
| # In randomized mode, get the page from our randomized list using the page_index |
| # Make sure we don't exceed the array bounds |
| if (page_index > length(pages)) { |
| break # No more pages to fetch in randomized mode |
| } |
| current_offset_page <- pages[page_index] |
| # For display purposes in randomized mode, show which page out of the total we're fetching |
| display_page_number <- page_index |
| } else { |
| # In sequential mode, use the current_page_number to calculate the offset |
| current_offset_page <- (current_page_number - 1) |
| display_page_number <- current_page_number |
| } |
| |
| # Calculate the actual offset in tokens |
| currentOffset <- current_offset_page * maxResultsPerPage |
| |
| # Build the query with the appropriate count and offset using httr2 |
| count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) |
| |
| # Parse existing URL to preserve all query parameters |
| parsed_url <- httr2::url_parse(kqo@requestUrl) |
| existing_query <- parsed_url$query |
| |
| # Add/update count and offset parameters |
| existing_query$count <- count_param |
| existing_query$offset <- currentOffset |
| |
| # Rebuild the URL with all parameters |
| query <- httr2::url_modify(kqo@requestUrl, query = existing_query) |
| res <- apiCall(kqo@korapConnection, query) |
| if (length(res$matches) == 0) { |
| break |
| } |
| |
| if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) { |
| log_info(verbose, "Using fields API: ") |
| currentMatches <- res$matches$fields %>% |
| purrr::map(~ mutate(.x, value = repair_data_structure(value))) %>% |
| tibble::enframe() %>% |
| tidyr::unnest(cols = value) %>% |
| tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>% |
| dplyr::select(-name) |
| if ("snippet" %in% colnames(res$matches)) { |
| currentMatches$snippet <- res$matches$snippet |
| } |
| if ("tokens" %in% colnames(res$matches)) { |
| currentMatches$tokens <- res$matches$tokens |
| } |
| } else { |
| currentMatches <- res$matches |
| } |
| |
| for (field in kqo@fields) { |
| if (!field %in% colnames(currentMatches)) { |
| currentMatches[, field] <- NA |
| } |
| } |
| currentMatches <- currentMatches %>% |
| select(dplyr::all_of(kqo@fields)) %>% |
| mutate( |
| matchID = res$matches$matchID, |
| tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID), |
| matchStart = as.integer(stringr::word(tmp_positions, 1)), |
| matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1 |
| ) %>% |
| select(-tmp_positions) |
| |
| if (!is.list(collectedMatches)) { |
| collectedMatches <- currentMatches |
| } else { |
| collectedMatches <- bind_rows(collectedMatches, currentMatches) |
| } |
| |
| # Calculate total pages consistently using the fixed maxResultsPerPage |
| # (rather than the itemsPerPage reported by the API response) |
| total_pages <- ceiling(kqo@totalResults / maxResultsPerPage) |
| |
| # Calculate ETA using the centralized function from logging.R |
| current_page <- if (randomizePageOrder) page_index else display_page_number |
| total_pages_to_fetch <- if (!is.na(maxFetch)) { |
| # Account for offset - we can only fetch from the remaining results after offset |
| remaining_results_after_offset <- max(0, kqo@totalResults - offset) |
| min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage)) |
| } else { |
| total_pages |
| } |
| |
| eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time) |
| |
| # Extract timing information for display |
| time_per_page <- NA |
| if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) { |
| time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark))) |
| } |
| |
| # Create the page display string with proper formatting |
|  |
| # Absolute page number for display (0-based offset page + 1) |
| actual_display_number <- current_offset_page + 1 |
| |
| # Within each fetchNext call, page numbers are shown relative to the |
| # current batch (e.g. 1/2, 2/2 rather than 3/4, 4/4) |
| if (randomizePageOrder) { |
| relative_page_number <- page_index # In randomized mode, we start from 1 in each batch |
| } else { |
| relative_page_number <- display_page_number - (page_count_start - 1) |
| } |
| |
| # How many pages will we fetch in this batch? |
| # If maxFetch is specified, calculate the total pages for this fetch operation |
| pages_in_this_batch <- if (!is.na(maxFetch)) { |
| # Account for offset - we can only fetch from the remaining results after offset |
| remaining_results_after_offset <- max(0, kqo@totalResults - offset) |
| min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage)) |
| } else { |
| # Otherwise fetch all remaining pages |
| total_pages - page_count_start + 1 |
| } |
| |
| # The total pages to be shown in this batch |
| batch_total_pages <- pages_in_this_batch |
| |
| page_display <- paste0( |
| "Retrieved page ", |
| sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number), |
| "/", |
| sprintf("%d", batch_total_pages) |
| ) |
| |
| # If randomized, also show which actual page we fetched |
| if (randomizePageOrder) { |
| # Determine the maximum width needed for page numbers (based on total pages) |
| # This ensures consistent alignment |
| max_page_width <- nchar(as.character(total_pages)) |
| # Add the actual page number that was fetched (0-based + 1 for display) with proper padding |
| page_display <- paste0( |
| page_display, |
| sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1) |
| ) |
| } |
| # Always show the absolute page number and total pages (for clarity) |
| else { |
| # Show the absolute page number (out of total possible pages) |
| page_display <- paste0(page_display, sprintf( |
| " (page %d of %d total)", |
| actual_display_number, total_pages |
| )) |
| } |
| |
| # Add caching or timing information |
| if (!is.null(res$meta$cached)) { |
| page_display <- paste0(page_display, " [cached]") |
| } else { |
| page_display <- paste0( |
| page_display, |
| " in ", |
| if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?", |
| "s", |
| eta_info |
| ) |
| } |
| |
| log_info(verbose, paste0(page_display, "\n")) |
| |
| # Increment the appropriate counter based on mode |
| if (randomizePageOrder) { |
| page_index <- page_index + 1 |
| } else { |
| current_page_number <- current_page_number + 1 |
| } |
| results <- results + res$meta$itemsPerPage |
| if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) { |
| break |
| } |
| } |
| nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults) |
| KorAPQuery( |
| nextStartIndex = nextStartIndex, |
| korapConnection = kqo@korapConnection, |
| fields = kqo@fields, |
| requestUrl = kqo@requestUrl, |
| request = kqo@request, |
| totalResults = kqo@totalResults, |
| vc = kqo@vc, |
| webUIRequestUrl = kqo@webUIRequestUrl, |
| hasMoreMatches = (kqo@totalResults > nextStartIndex), |
| apiResponse = res, |
| collectedMatches = collectedMatches |
| ) |
| }) |
| |
| #' Fetch all results of a KorAP query. |
| #' |
| #' **`fetchAll`** fetches all results of a KorAP query. |
| #' |
| #' @family corpus search functions |
| #' @param kqo object obtained from [corpusQuery()] |
| #' @param verbose print progress information if true |
| #' @param ... further arguments passed to [fetchNext()] |
| #' @return The updated `kqo` object with all results in `@collectedMatches` |
| #' |
| #' @examples |
| #' \dontrun{ |
| #' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary |
| #' q <- KorAPConnection() |> |
| #' corpusQuery("Ameisenplage") |> |
| #' fetchAll() |
| #' q@collectedMatches |
| #' |
| #' # Fetch also all KWICs |
| #' q <- KorAPConnection() |> auth() |> |
| #' corpusQuery("Ameisenplage", metadataOnly = FALSE) |> |
| #' fetchAll() |
| #' q@collectedMatches |
| #' |
| #' # Retrieve title and text sigle metadata of all texts published on 1958-03-12 |
| #' q <- KorAPConnection() |> |
| #' corpusQuery("<base/s=t>", # this matches each text once |
| #' vc = "pubDate in 1958-03-12", |
| #' fields = c("textSigle", "title"), |
| #' ) |> |
| #' fetchAll() |
| #' q@collectedMatches |
| #' } |
| #' |
| #' @aliases fetchAll |
| #' @export |
| setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) { |
| return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...)) |
| }) |
| |
| #' Fetch the remaining results of a KorAP query. |
| #' |
| #' @param kqo object obtained from [corpusQuery()] |
| #' @param verbose print progress information if true |
| #' @param ... further arguments passed to [fetchNext()] |
| #' @return The updated `kqo` object with remaining results in `@collectedMatches` |
| #' |
| #' @examples |
| #' \dontrun{ |
| #' |
| #' q <- KorAPConnection() |> |
| #'   corpusQuery("Ameisenplage") |> |
| #'   fetchNext() |> |
| #'   fetchRest() |
| #' q@collectedMatches |
| #' } |
| #' |
| #' @aliases fetchRest |
| #' @export |
| setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) { |
| return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...)) |
| }) |
| |
| #' |
| #' Parse XML annotations into linguistic layers |
| #' |
| #' Internal helper function to extract linguistic annotations (lemma, POS, morphology) |
| #' from XML annotation snippets returned by the KorAP API. |
| #' |
| #' @param xml_snippet XML string containing annotation data |
| #' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph' |
| #' @keywords internal |
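| #' |
| #' @examples |
| #' \dontrun{ |
| #' # Minimal sketch of the expected input format: token spans carry their |
| #' # annotations in title attributes as foundry/layer:value pairs |
| #' # (hypothetical snippet, not taken from a real API response) |
| #' snippet <- paste0( |
| #'   '<span class="match">', |
| #'   '<span title="tt/l:Ameise tt/p:NN">Ameisen</span>', |
| #'   '</span>' |
| #' ) |
| #' parse_xml_annotations(snippet) |
| #' # should yield token "Ameisen", lemma "Ameise", pos "NN", morph NA |
| #' } |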
| parse_xml_annotations <- function(xml_snippet) { |
| if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") { |
| return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0))) |
| } |
| |
| # Extract content within <span class="match">...</span> using a more robust approach |
| if (grepl('<span class="match">', xml_snippet)) { |
| # Find the start of match span |
| start_pos <- regexpr('<span class="match">', xml_snippet) |
| if (start_pos > 0) { |
| # Find the end by counting nested spans |
| content_start <- start_pos + attr(start_pos, "match.length") |
| remaining <- substr(xml_snippet, content_start, nchar(xml_snippet)) |
| |
| # Simple approach: extract everything until we hit context-right or end |
| if (grepl('<span class="context-right">', remaining)) { |
| content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining) |
| } else { |
| # Find the closing </span> that matches our opening span |
| # For now, use a simpler approach - take everything until the last </span> sequence |
| content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining) |
| } |
| } else { |
| content_to_parse <- xml_snippet |
| } |
| } else { |
| content_to_parse <- xml_snippet |
| } |
| |
| # Initialize result vectors |
| tokens <- character(0) |
| lemmas <- character(0) |
| pos_tags <- character(0) |
| morph_tags <- character(0) |
| |
| # Split the content by </span> and process each meaningful part |
| parts <- unlist(strsplit(content_to_parse, '</span>')) |
| |
| for (part in parts) { |
| part <- trimws(part) |
| if (nchar(part) == 0) next |
| |
| # Look for parts that have title attributes and end with text |
| if (grepl('<span[^>]*title=', part)) { |
| # Extract the text content (everything after the last >) |
| text_content <- gsub('.*>([^<]*)$', '\\1', part) |
| text_content <- trimws(text_content) |
| |
| if (nchar(text_content) > 0 && !grepl('^<', text_content)) { |
| tokens <- c(tokens, text_content) |
| |
| # Extract all title attributes from this part |
| title_pattern <- 'title="([^"]*)"' |
| title_matches <- gregexpr(title_pattern, part) |
| |
| lemma <- NA |
| pos_tag <- NA |
| morph_tag <- NA |
| |
| if (title_matches[[1]][1] != -1) { |
| all_titles <- regmatches(part, title_matches)[[1]] |
| for (title_match in all_titles) { |
| title_content <- gsub(title_pattern, '\\1', title_match) |
| |
| # Split by spaces and process each annotation |
| annotations <- unlist(strsplit(title_content, "\\s+")) |
| for (annotation in annotations) { |
| if (grepl('^[^/]+/l:', annotation)) { |
| lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation) |
| } else if (grepl('^[^/]+/p:', annotation)) { |
| pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation) |
| } else if (grepl('^[^/]+/m:', annotation)) { |
| morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation) |
| } |
| } |
| } |
| } |
| |
| lemmas <- c(lemmas, lemma) |
| pos_tags <- c(pos_tags, pos_tag) |
| morph_tags <- c(morph_tags, morph_tag) |
| } |
| } |
| } |
| |
| # If no tokens found with the splitting approach, try a different method |
| if (length(tokens) == 0) { |
| # Look for the innermost spans that contain actual text |
| innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>' |
| innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE) |
| |
| if (innermost_matches[[1]][1] != -1) { |
| matches <- regmatches(content_to_parse, innermost_matches)[[1]] |
| |
| for (match in matches) { |
| title <- gsub(innermost_pattern, '\\1', match, perl = TRUE) |
| text <- gsub(innermost_pattern, '\\2', match, perl = TRUE) |
| text <- trimws(text) |
| |
| if (nchar(text) > 0) { |
| tokens <- c(tokens, text) |
| |
| # Parse space-separated annotations in title |
| lemma <- NA |
| pos_tag <- NA |
| morph_tag <- NA |
| |
| annotations <- unlist(strsplit(title, "\\s+")) |
| for (annotation in annotations) { |
| if (grepl('^[^/]+/l:', annotation)) { |
| lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation) |
| } else if (grepl('^[^/]+/p:', annotation)) { |
| pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation) |
| } else if (grepl('^[^/]+/m:', annotation)) { |
| morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation) |
| } |
| } |
| |
| lemmas <- c(lemmas, lemma) |
| pos_tags <- c(pos_tags, pos_tag) |
| morph_tags <- c(morph_tags, morph_tag) |
| } |
| } |
| } |
| } |
| |
| # Ensure all vectors have the same length |
| max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags)) |
| if (max_length > 0) { |
| tokens <- c(tokens, rep(NA, max_length - length(tokens))) |
| lemmas <- c(lemmas, rep(NA, max_length - length(lemmas))) |
| pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags))) |
| morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags))) |
| } |
| |
| return(list( |
| token = tokens, |
| lemma = lemmas, |
| pos = pos_tags, |
| morph = morph_tags |
| )) |
| } |
| |
| #' |
| #' Parse XML annotations into linguistic layers with left/match/right structure |
| #' |
| #' Internal helper function to extract linguistic annotations (lemma, POS, morphology) |
| #' from XML annotation snippets returned by the KorAP API, split into left context, |
| #' match, and right context sections like the tokens field. |
| #' |
| #' @param xml_snippet XML string containing annotation data |
| #' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph' |
| #' @keywords internal |
| parse_xml_annotations_structured <- function(xml_snippet) { |
| if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") { |
| empty_result <- list(left = character(0), match = character(0), right = character(0)) |
| return(list( |
| atokens = empty_result, |
| lemma = empty_result, |
| pos = empty_result, |
| morph = empty_result |
| )) |
| } |
| |
| # Helper function to extract annotations from a span section |
| extract_annotations_from_section <- function(section_content) { |
| # Handle both spaced tokens and nested single tokens |
| tokens <- character(0) |
| lemmas <- character(0) |
| pos_tags <- character(0) |
| morph_tags <- character(0) |
| |
| # First try to split by spaces between span groups (for multiple tokens) |
| # Look for spaces that separate token groups |
| if (grepl('</span>\\s+<span', section_content)) { |
| # Multiple tokens separated by spaces |
| token_groups <- unlist(strsplit(section_content, '(?<=</span>)\\s+(?=<span)', perl = TRUE)) |
| } else { |
| # Single token (or no spaces between tokens) |
| token_groups <- c(section_content) |
| } |
| |
| for (group in token_groups) { |
| group <- trimws(group) |
| if (nchar(group) == 0) next |
| |
| # Extract the actual text content (the innermost text) |
| text_match <- regexpr('>([^<>]+)</span>', group, perl = TRUE) |
| if (text_match > 0) { |
| # Find all possible text contents and take the last one (innermost) |
| all_texts <- regmatches(group, gregexpr('>([^<>]+)</span>', group, perl = TRUE))[[1]] |
| if (length(all_texts) > 0) { |
| # Take the last match (innermost text) |
| text_content <- sub('.*>([^<>]+)</span>.*', '\\1', all_texts[length(all_texts)], perl = TRUE) |
| text_content <- trimws(text_content) |
| |
| if (nchar(text_content) > 0 && !grepl('^<', text_content)) { |
| tokens <- c(tokens, text_content) |
| |
| # Extract all title attributes from this group |
| titles <- regmatches(group, gregexpr('title="([^"]*)"', group, perl = TRUE))[[1]] |
| |
| morph_features <- character(0) |
| lemma <- NA |
| pos_tag <- NA |
| |
| for (title in titles) { |
| content <- sub('title="([^"]*)"', '\\1', title, perl = TRUE) |
| |
| if (grepl('^[^/]+/l:', content)) { |
| lemma <- sub('^[^/]+/l:(.*)$', '\\1', content) |
| } else if (grepl('^[^/]+/p:', content)) { |
| pos_tag <- sub('^[^/]+/p:(.*)$', '\\1', content) |
| } else if (grepl('^[^/]+/m:', content)) { |
| morph_feature <- sub('^[^/]+/m:(.*)$', '\\1', content) |
| morph_features <- c(morph_features, morph_feature) |
| } |
| } |
| |
| lemmas <- c(lemmas, lemma) |
| pos_tags <- c(pos_tags, pos_tag) |
| morph_tag <- if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA |
| morph_tags <- c(morph_tags, morph_tag) |
| } |
| } |
| } |
| } |
| |
| # Ensure all vectors have the same length |
| max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags)) |
| if (max_length > 0) { |
| tokens <- c(tokens, rep(NA, max_length - length(tokens))) |
| lemmas <- c(lemmas, rep(NA, max_length - length(lemmas))) |
| pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags))) |
| morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags))) |
| } |
| |
| return(list( |
| tokens = tokens, |
| lemmas = lemmas, |
| pos_tags = pos_tags, |
| morph_tags = morph_tags |
| )) |
| } |
| |
| # Split the XML into three parts: left context, match content, and right context |
| # The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span> |
| |
| # First extract the content within the match span using DOTALL modifier |
| match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">' |
| match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE) |
| |
| if (match_span_match == -1) { |
| # Try alternative pattern if no context-right |
| match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$' |
| match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE) |
| } |
| |
| if (match_span_match > 0) { |
| match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE) |
| |
| # Now find the <mark> and </mark> positions within this content |
| mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE) |
| mark_end <- regexpr('</mark>', match_span_content, perl = TRUE) |
| |
| if (mark_start > 0 && mark_end > 0) { |
| # Left context: everything before <mark> |
| left_content <- substr(match_span_content, 1, mark_start - 1) |
| |
| # Match content: everything between <mark> and </mark> (including the mark tags for now) |
| match_content <- substr(match_span_content, mark_start, mark_end + attr(mark_end, "match.length") - 1) |
| |
| # Right context: everything after </mark> |
| right_content_start <- mark_end + attr(mark_end, "match.length") |
| right_content <- substr(match_span_content, right_content_start, nchar(match_span_content)) |
| } else { |
| # No mark tags found, treat entire match span as match content |
| left_content <- "" |
| match_content <- match_span_content |
| right_content <- "" |
| } |
| } else { |
| # No match span found, treat entire content as match |
| left_content <- "" |
| match_content <- xml_snippet |
| right_content <- "" |
| } |
| |
| # Process each section |
| left_annotations <- extract_annotations_from_section(left_content) |
| match_annotations <- extract_annotations_from_section(match_content) |
| right_annotations <- extract_annotations_from_section(right_content) |
| |
| return(list( |
| atokens = list( |
| left = left_annotations$tokens, |
| match = match_annotations$tokens, |
| right = right_annotations$tokens |
| ), |
| lemma = list( |
| left = left_annotations$lemmas, |
| match = match_annotations$lemmas, |
| right = right_annotations$lemmas |
| ), |
| pos = list( |
| left = left_annotations$pos_tags, |
| match = match_annotations$pos_tags, |
| right = right_annotations$pos_tags |
| ), |
| morph = list( |
| left = left_annotations$morph_tags, |
| match = match_annotations$morph_tags, |
| right = right_annotations$morph_tags |
| ) |
| )) |
| } |
| |
| #' Fetch annotations for all collected matches |
| #' |
| #' `r lifecycle::badge("experimental")` |
| #' |
| #' **`fetchAnnotations`** fetches annotations (only token annotations, for now) |
| #' for all matches in the `@collectedMatches` slot |
| #' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches` |
| #' data frame. The method uses the `matchID` from collected matches. |
| #' |
| #' **Important**: For copyright-restricted corpora, users must be authorized via [auth()] |
| #' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are |
| #' available for annotation parsing. |
| #' |
| #' The method parses XML snippet annotations and adds linguistic columns to the data frame: |
| #' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags |
| #' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas |
| #' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags |
| #' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations) |
| #' - `annotation_snippet`: original XML snippet from the annotation API |
| #' |
| #' @family corpus search functions |
| #' @concept Annotations |
| #' @aliases fetchAnnotations |
| #' |
| #' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work. |
| #' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger) |
| #' @param verbose print progress information if true |
| #' @return The updated `kqo` object with annotation columns |
| #' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`) |
| #' in the `@collectedMatches` slot. Each column is a data frame |
| #' with `left`, `match`, and `right` columns containing list vectors of annotations |
| #' for the left context, matched tokens, and right context, respectively. |
| #' The original XML snippet for each match is also stored in `annotation_snippet`. |
| #' |
| #' @examples |
| #' \dontrun{ |
| #' |
| #' # Fetch annotations for matches using Tree-Tagger foundry |
| #' # Note: Authorization required for copyright-restricted corpora |
| #' q <- KorAPConnection() |> |
| #' auth() |> |
| #' corpusQuery("Ameisenplage", metadataOnly = FALSE) |> |
| #' fetchNext(maxFetch = 10) |> |
| #' fetchAnnotations() |
| #' |
| #' # Access linguistic annotations for match i: |
| #' pos_tags <- q@collectedMatches$pos # Data frame with left/match/right columns for POS tags |
| #' lemmas <- q@collectedMatches$lemma # Data frame with left/match/right columns for lemmas |
| #' morphology <- q@collectedMatches$morph # Data frame with left/match/right columns for morphological tags |
| #' atokens <- q@collectedMatches$atokens # Data frame with left/match/right columns for annotation token text |
| #' raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i |
| #' |
| #' # Access specific components: |
| #' match_pos <- q@collectedMatches$pos$match[[i]] # POS tags for the matched tokens in match i |
| #' left_lemmas <- q@collectedMatches$lemma$left[[i]] # Lemmas for the left context in match i |
| #' right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i |
| #' |
| #' # Use a different foundry (e.g., MarMoT) |
| #' q <- KorAPConnection() |> |
| #' auth() |> |
| #' corpusQuery("Ameisenplage", metadataOnly = FALSE) |> |
| #' fetchNext(maxFetch = 10) |> |
| #' fetchAnnotations(foundry = "marmot") |
| #' q@collectedMatches$pos$left[[1]] # POS tags for the left context of the first match |
| #' } |
| #' @export |
| setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) { |
| if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) { |
| warning("No collected matches found. Please run fetchNext() or fetchAll() first.") |
| return(kqo) |
| } |
| |
| df <- kqo@collectedMatches |
| kco <- kqo@korapConnection |
| |
| # Initialize annotation columns as data frames (like tokens field) |
| # Create the structure more explicitly to avoid assignment issues |
| nrows <- nrow(df) |
| |
| df$pos <- data.frame( |
| left = I(replicate(nrows, character(0), simplify = FALSE)), |
| match = I(replicate(nrows, character(0), simplify = FALSE)), |
| right = I(replicate(nrows, character(0), simplify = FALSE)), |
| stringsAsFactors = FALSE |
| ) |
| |
| df$lemma <- data.frame( |
| left = I(replicate(nrows, character(0), simplify = FALSE)), |
| match = I(replicate(nrows, character(0), simplify = FALSE)), |
| right = I(replicate(nrows, character(0), simplify = FALSE)), |
| stringsAsFactors = FALSE |
| ) |
| |
| df$morph <- data.frame( |
| left = I(replicate(nrows, character(0), simplify = FALSE)), |
| match = I(replicate(nrows, character(0), simplify = FALSE)), |
| right = I(replicate(nrows, character(0), simplify = FALSE)), |
| stringsAsFactors = FALSE |
| ) |
| |
| df$atokens <- data.frame( |
| left = I(replicate(nrows, character(0), simplify = FALSE)), |
| match = I(replicate(nrows, character(0), simplify = FALSE)), |
| right = I(replicate(nrows, character(0), simplify = FALSE)), |
| stringsAsFactors = FALSE |
| ) |
| |
| df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE) |
| |
| # Initialize timing for ETA calculation |
| start_time <- Sys.time() |
| if (verbose) { |
| log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n")) |
| } |
| |
| for (i in seq_len(nrow(df))) { |
| # ETA logging |
| if (verbose && i > 1) { |
| eta_info <- calculate_eta(i, nrows, start_time) |
| log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n")) |
| } |
| # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd |
| if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) { |
| # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like |
| # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow" |
| # Extract document path and position, handling both regular and encrypted formats |
| |
| # More flexible regex to extract the document path with position and encryption |
| # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part |
| # We need to capture the entire path including the encrypted suffix |
| match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE) |
| |
| if (match_result > 0) { |
| # Extract the complete path including encryption (everything after "match-") |
| doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE) |
| # Convert the dash before position to slash, but keep everything after the position |
| match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption) |
| # Use httr2 to construct URL safely |
| base_url <- paste0(kco@apiUrl, "corpus/", match_path) |
| req <- httr2::url_modify(base_url, query = list(foundry = foundry)) |
| } else { |
| # If regex fails, fall back to the old method with httr2 |
| # Format numbers to avoid scientific notation |
| match_start <- format(df$matchStart[i], scientific = FALSE) |
| match_end <- format(df$matchEnd[i], scientific = FALSE) |
| base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end) |
| req <- httr2::url_modify(base_url, query = list(foundry = foundry)) |
| } |
| } else { |
| # Fallback to the old method with httr2 |
| # Format numbers to avoid scientific notation |
| match_start <- format(df$matchStart[i], scientific = FALSE) |
| match_end <- format(df$matchEnd[i], scientific = FALSE) |
| base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end) |
| req <- httr2::url_modify(base_url, query = list(foundry = foundry)) |
| } |
| |
| tryCatch({ |
| res <- apiCall(kco, req) |
| |
| if (!is.null(res)) { |
| # Store the raw annotation snippet |
| df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA |
| |
| # Parse XML annotations if snippet is available |
| if (is.list(res) && "snippet" %in% names(res)) { |
| parsed_annotations <- parse_xml_annotations_structured(res$snippet) |
| |
| # Store the parsed linguistic data in data frame format (like tokens) |
| # Use individual assignment to avoid data frame mismatch errors |
| tryCatch({ |
| # Assign POS annotations |
| df$pos$left[i] <- list(parsed_annotations$pos$left) |
| df$pos$match[i] <- list(parsed_annotations$pos$match) |
| df$pos$right[i] <- list(parsed_annotations$pos$right) |
| |
| # Assign lemma annotations |
| df$lemma$left[i] <- list(parsed_annotations$lemma$left) |
| df$lemma$match[i] <- list(parsed_annotations$lemma$match) |
| df$lemma$right[i] <- list(parsed_annotations$lemma$right) |
| |
| # Assign morphology annotations |
| df$morph$left[i] <- list(parsed_annotations$morph$left) |
| df$morph$match[i] <- list(parsed_annotations$morph$match) |
| df$morph$right[i] <- list(parsed_annotations$morph$right) |
| |
| # Assign token annotations |
| df$atokens$left[i] <- list(parsed_annotations$atokens$left) |
| df$atokens$match[i] <- list(parsed_annotations$atokens$match) |
| df$atokens$right[i] <- list(parsed_annotations$atokens$right) |
| }, error = function(assign_error) { |
| # Set empty character vectors on assignment error using list assignment |
| df$pos$left[i] <<- list(character(0)) |
| df$pos$match[i] <<- list(character(0)) |
| df$pos$right[i] <<- list(character(0)) |
| |
| df$lemma$left[i] <<- list(character(0)) |
| df$lemma$match[i] <<- list(character(0)) |
| df$lemma$right[i] <<- list(character(0)) |
| |
| df$morph$left[i] <<- list(character(0)) |
| df$morph$match[i] <<- list(character(0)) |
| df$morph$right[i] <<- list(character(0)) |
| |
| df$atokens$left[i] <<- list(character(0)) |
| df$atokens$match[i] <<- list(character(0)) |
| df$atokens$right[i] <<- list(character(0)) |
| }) |
| } else { |
| # No snippet available, store empty vectors |
| df$pos$left[i] <- list(character(0)) |
| df$pos$match[i] <- list(character(0)) |
| df$pos$right[i] <- list(character(0)) |
| |
| df$lemma$left[i] <- list(character(0)) |
| df$lemma$match[i] <- list(character(0)) |
| df$lemma$right[i] <- list(character(0)) |
| |
| df$morph$left[i] <- list(character(0)) |
| df$morph$match[i] <- list(character(0)) |
| df$morph$right[i] <- list(character(0)) |
| |
| df$atokens$left[i] <- list(character(0)) |
| df$atokens$match[i] <- list(character(0)) |
| df$atokens$right[i] <- list(character(0)) |
| } |
| } else { |
| # Store NAs for failed requests |
| df$pos$left[i] <- list(NA) |
| df$pos$match[i] <- list(NA) |
| df$pos$right[i] <- list(NA) |
| |
| df$lemma$left[i] <- list(NA) |
| df$lemma$match[i] <- list(NA) |
| df$lemma$right[i] <- list(NA) |
| |
| df$morph$left[i] <- list(NA) |
| df$morph$match[i] <- list(NA) |
| df$morph$right[i] <- list(NA) |
| |
| df$atokens$left[i] <- list(NA) |
| df$atokens$match[i] <- list(NA) |
| df$atokens$right[i] <- list(NA) |
| df$annotation_snippet[[i]] <- NA |
| } |
| }, error = function(e) { |
| # Store NAs for failed requests |
| df$pos$left[i] <- list(NA) |
| df$pos$match[i] <- list(NA) |
| df$pos$right[i] <- list(NA) |
| |
| df$lemma$left[i] <- list(NA) |
| df$lemma$match[i] <- list(NA) |
| df$lemma$right[i] <- list(NA) |
| |
| df$morph$left[i] <- list(NA) |
| df$morph$match[i] <- list(NA) |
| df$morph$right[i] <- list(NA) |
| |
| df$atokens$left[i] <- list(NA) |
| df$atokens$match[i] <- list(NA) |
| df$atokens$right[i] <- list(NA) |
| df$annotation_snippet[[i]] <- NA |
| }) |
| } |
| |
| # Validate data frame structure before assignment |
| if (nrow(df) != nrow(kqo@collectedMatches)) { |
| } |
| |
| # Update the collectedMatches with annotation data |
| tryCatch({ |
| kqo@collectedMatches <- df |
| }, error = function(assign_error) { |
| # Try a safer approach: add columns individually |
| tryCatch({ |
| kqo@collectedMatches$pos <- df$pos |
| kqo@collectedMatches$lemma <- df$lemma |
| kqo@collectedMatches$morph <- df$morph |
| kqo@collectedMatches$atokens <- df$atokens |
| kqo@collectedMatches$annotation_snippet <- df$annotation_snippet |
| }, error = function(col_error) { |
| warning("Failed to add annotation data to collectedMatches") |
| }) |
| }) |
| |
| if (verbose) { |
| elapsed_time <- Sys.time() - start_time |
| log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n")) |
| } |
| |
| return(kqo) |
| }) |
| |
| #' Query frequencies of search expressions in virtual corpora |
| #' |
| #' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and |
| #' [ci()] to compute a tibble with the absolute and relative frequencies and |
| #' confidence intervals of one or multiple search terms across one or multiple |
| #' virtual corpora. |
| #' |
| #' @family frequency analysis |
| #' @aliases frequencyQuery |
| #' @examples |
| #' \dontrun{ |
| #' |
| #' KorAPConnection(verbose = TRUE) |> |
| #' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003)) |
| #' } |
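| #' |
| #' \dontrun{ |
| #' |
| #' # Sketch: treat the query terms as alternatives, so that relative |
| #' # frequencies are computed against the summed hits of both terms |
| #' # rather than against the virtual corpus sizes |
| #' KorAPConnection(verbose = TRUE) |> |
| #'   frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003), |
| #'     as.alternatives = TRUE |
| #'   ) |
| #' } |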
| #' |
| # @inheritParams corpusQuery |
| #' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`) |
| #' @param query corpus query string(s) (can be a vector). The query language depends on the `ql` parameter. |
| #' @param vc virtual corpus definition(s) (can be a vector) |
| #' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]). |
| #' @param as.alternatives logical that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits is used as the total for the calculation of relative frequencies, instead of the respective vc token sizes (see the examples below). |
| #' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise. |
| #' @export |
| #' |
| #' @return A tibble, with each row containing the following result columns for query and vc combinations: |
| #' - **query**: the query string used for the frequency analysis. |
| #' - **totalResults**: absolute frequency of query matches in the vc. |
| #' - **vc**: virtual corpus used for the query. |
| #' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc. |
| #' - **total**: total number of words in vc. |
| #' - **f**: relative frequency of query matches in the vc. |
| #' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`. |
| #' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`. |
| |
| setMethod( |
| "frequencyQuery", "KorAPConnection", |
| function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) { |
| (if (as.alternatives) { |
| corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |> |
| group_by(vc) |> |
| mutate(total = sum(totalResults)) |
| } else { |
| corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |> |
| mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens) |
| }) |> |
| ci(conf.level = conf.level) |
| } |
| ) |
| |
| #' buildWebUIRequestUrlFromString |
| #' |
| #' @rdname KorAPQuery-class |
| #' @importFrom urltools url_encode |
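| #' @examples |
| #' # Sketch: build a KorAP web UI URL from its components |
| #' buildWebUIRequestUrlFromString( |
| #'   "https://korap.ids-mannheim.de/", |
| #'   query = "Ameise", |
| #'   vc = "pubDate since 2017" |
| #' ) |
| #' |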
| #' @export |
| buildWebUIRequestUrlFromString <- function(KorAPUrl, |
| query, |
| vc = "", |
| ql = "poliqarp") { |
| if ("KorAPConnection" %in% class(KorAPUrl)) { |
| KorAPUrl <- KorAPUrl@KorAPUrl |
| } |
| |
| request <- |
| paste0( |
| "?q=", |
| urltools::url_encode(enc2utf8(as.character(query))), |
| ifelse(vc != "", |
| paste0("&cq=", urltools::url_encode(enc2utf8(vc))), |
| "" |
| ), |
| "&ql=", |
| ql |
| ) |
| paste0(KorAPUrl, request) |
| } |
| |
| #' buildWebUIRequestUrl |
| #' |
| #' @rdname KorAPQuery-class |
| #' @importFrom httr2 url_parse |
| #' @export |
| buildWebUIRequestUrl <- function(kco, |
| query = if (missing(KorAPUrl)) { |
| stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE) |
| } else { |
| httr2::url_parse(KorAPUrl)$query$q |
| }, |
| vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq, |
| KorAPUrl, |
| ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) { |
| buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql) |
| } |
| |
| #' format() |
| #' @rdname KorAPQuery-class |
| #' @param x KorAPQuery object |
| #' @param ... further arguments passed to or from other methods |
| #' @importFrom urltools param_get url_decode |
| #' @export |
| format.KorAPQuery <- function(x, ...) { |
| cat("<KorAPQuery>\n") |
| q <- x |
| param <- urltools::param_get(q@request) |> lapply(urltools::url_decode) |
| cat(" Query: ", param$q, "\n") |
| if (!is.null(param$cq) && param$cq != "") { |
| cat(" Virtual corpus: ", param$cq, "\n") |
| } |
| if (!is.null(q@collectedMatches)) { |
| cat("==============================================================================================================", "\n") |
| print(summary(q@collectedMatches)) |
| cat("==============================================================================================================", "\n") |
| } |
| cat(" Total results: ", q@totalResults, "\n") |
| cat(" Fetched results: ", q@nextStartIndex, "\n") |
| if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) { |
| successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet)) |
| parsed_annotations <- sum(!is.na(q@collectedMatches$pos)) |
| cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches") |
| if (parsed_annotations > 0) { |
| cat(" (", parsed_annotations, " with parsed linguistic data)") |
| } |
| cat("\n") |
| } |
| } |
| |
| #' show() |
| #' |
| #' @rdname KorAPQuery-class |
| #' @param object KorAPQuery object |
| #' @export |
| setMethod("show", "KorAPQuery", function(object) { |
| format(object) |
| invisible(object) |
| }) |