#' KorAPQuery class (internal)
#'
#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
#'
#' @keywords internal
#' @include KorAPConnection.R
#' @include logging.R
#' @import httr2
#'
#' @include RKorAPClient-package.R
#' @export
KorAPQuery <- setClass("KorAPQuery", slots = c(
"korapConnection",
"request",
"vc",
"totalResults",
"nextStartIndex",
"fields",
"requestUrl",
"webUIRequestUrl",
"apiResponse",
"collectedMatches",
"hasMoreMatches"
))
#' Initialize KorAPQuery object
#' @keywords internal
#' @param .Object …
#' @param korapConnection KorAPConnection object
#' @param request query part of the request URL
#' @param vc definition of a virtual corpus
#' @param totalResults number of hits the query has yielded
#' @param nextStartIndex at what index to start the next fetch of query results
#' @param fields what data / metadata fields should be collected
#' @param requestUrl complete URL of the API request
#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
#' @param apiResponse data-frame representation of the JSON response of the API request
#' @param hasMoreMatches logical that signals if more query results can be fetched
#' @param collectedMatches matches already fetched from the KorAP-API-server
#'
#' @importFrom tibble tibble
#' @export
setMethod(
"initialize", "KorAPQuery",
function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
"corpusSigle", "textSigle", "pubDate", "pubPlace",
"availability", "textClass", "snippet", "tokens"
),
requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
.Object <- callNextMethod()
.Object@korapConnection <- korapConnection
.Object@request <- request
.Object@vc <- vc
.Object@totalResults <- totalResults
.Object@nextStartIndex <- nextStartIndex
.Object@fields <- fields
.Object@requestUrl <- requestUrl
.Object@webUIRequestUrl <- webUIRequestUrl
.Object@apiResponse <- apiResponse
.Object@hasMoreMatches <- hasMoreMatches
.Object@collectedMatches <- collectedMatches
.Object
}
)
setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
setGeneric("fetchAnnotations", function(kqo, ...) standardGeneric("fetchAnnotations"))
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
# Fixed page size used when fetching query results from the API
maxResultsPerPage <- 50
## quiets concerns of R CMD check re: the .'s that appear in pipelines
utils::globalVariables(c("."))
#' Search corpus for query terms
#'
#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
#'
#' @family corpus search functions
#' @aliases corpusQuery
#'
#' @importFrom urltools url_encode
#' @importFrom purrr pmap
#' @importFrom dplyr bind_rows group_by
#' @importFrom tidyr expand_grid
#'
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
#' If you want your corpus queries to return not only metadata, but also KWICs, you need to authorize
#' your RKorAPClient application as explained in the
#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
#' `FALSE`.
#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values).
#' @param fields character vector specifying which metadata fields to retrieve for each match.
#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
#' \describe{
#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
#' \item{**Administrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
#' }
#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
#' Default fields provide basic text identification and publication metadata. The actual text content fields (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
#' @param verbose print some info
#' @param as.df return result as data frame instead of as S4 object?
#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE` if `query` and `vc` have different lengths, and to `FALSE` otherwise.
#' @param context string that specifies the size of the left and the right context returned in `snippet`
#' (provided that `metadataOnly` is set to `FALSE` and that the necessary access rights are met).
#' The format of the context size specification (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
#' If the parameter is not set, the default context size specification of the KorAP server instance will be used.
#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
#' as this is typically legally motivated.
#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`.
#' Please make sure to check `$collection$rewrites` in `@apiResponse` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed (see the examples below).
#'
#' @examples
#' \dontrun{
#'
#' # Fetch basic metadata for "Ameisenplage"
#' KorAPConnection() |>
#' corpusQuery("Ameisenplage") |>
#' fetchAll()
#'
#' # Fetch specific metadata fields for bibliographic analysis
#' query <- KorAPConnection() |>
#' corpusQuery("Ameisenplage",
#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
#' results <- fetchAll(query)
#' results@collectedMatches
#' }
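#'
#' \dontrun{
#'
#' # Check whether the virtual corpus had to be rewritten due to access rights
#' # (a sketch: assumes the server reports rewrites under $collection$rewrites
#' # of the API response)
#' q <- KorAPConnection() |> corpusQuery("Ameisenplage")
#' q@apiResponse$collection$rewrites
#' }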
#'
#' \dontrun{
#'
#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
#' # and show the number of query hits (but don't fetch them).
#'
#' KorAPConnection(verbose = TRUE) |>
#' corpusQuery(
#' KorAPUrl =
#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
#' )
#' }
#'
#' \dontrun{
#'
#' # Plot the time/frequency curve of "Ameisenplage"
#' kco <- KorAPConnection(verbose = TRUE)
#' kco |>
#' corpusQuery("Ameisenplage") |>
#' fetchAll() |>
#' slot("collectedMatches") |>
#' mutate(year = lubridate::year(pubDate)) |>
#' dplyr::select(year) |>
#' group_by(year) |>
#' summarise(Count = dplyr::n()) |>
#' mutate(Freq = mapply(function(f, y) {
#' f / corpusStats(kco, paste("pubDate in", y))@tokens
#' }, Count, year)) |>
#' dplyr::select(-Count) |>
#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
#' plot(type = "l")
#' }
#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
#'
#' @references
#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
#'
#' @export
setMethod(
"corpusQuery", "KorAPConnection",
function(kco,
query = if (missing(KorAPUrl)) {
stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
} else {
httr2::url_parse(KorAPUrl)$query$q
},
vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
KorAPUrl,
metadataOnly = TRUE,
ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
fields = c(
"corpusSigle",
"textSigle",
"pubDate",
"pubPlace",
"availability",
"textClass",
"snippet",
"tokens"
),
accessRewriteFatal = TRUE,
verbose = kco@verbose,
expand = length(vc) != length(query),
as.df = FALSE,
context = NULL) {
if (length(query) > 1 || length(vc) > 1) {
grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
# Initialize timing variables for ETA calculation
total_queries <- nrow(grid)
current_query <- 0
start_time <- Sys.time()
results <- purrr::pmap(grid, function(query, vc, ...) {
current_query <<- current_query + 1
# Execute the single query directly (avoiding recursive call)
contentFields <- c("snippet", "tokens")
query_fields <- fields
if (metadataOnly) {
query_fields <- query_fields[!query_fields %in% contentFields]
}
if (!"textSigle" %in% query_fields) {
query_fields <- c(query_fields, "textSigle")
}
request <-
paste0(
"?q=",
url_encode(enc2utf8(query)),
ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
ifelse(!metadataOnly, "&show-tokens=true", ""),
"&ql=", ql
)
webUIRequestUrl <- paste0(kco@KorAPUrl, request)
requestUrl <- paste0(
kco@apiUrl,
"search",
request,
"&fields=",
paste(query_fields, collapse = ","),
if (metadataOnly) "&access-rewrite-disabled=true" else ""
)
# Show individual query progress
log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
res <- apiCall(kco, paste0(requestUrl, "&count=0"))
if (is.null(res)) {
log_info(verbose, ": API call failed\n")
totalResults <- 0
} else {
totalResults <- as.integer(res$meta$totalResults)
log_info(verbose, ": ", totalResults, " hits")
if (!is.null(res$meta$cached)) {
log_info(verbose, " [cached]")
} else if (!is.null(res$meta$benchmark)) {
if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
formatted_time <- paste0(round(time_value, 2), "s")
log_info(verbose, ", took ", formatted_time)
} else {
log_info(verbose, ", took ", res$meta$benchmark)
}
}
# Append ETA information on the same line if there is more than one query
if (verbose && total_queries > 1) {
eta_info <- calculate_eta(current_query, total_queries, start_time)
if (eta_info != "") {
log_info(verbose, eta_info)
}
}
log_info(verbose, "\n")
}
result <- data.frame(
query = query,
totalResults = totalResults,
vc = vc,
webUIRequestUrl = webUIRequestUrl,
stringsAsFactors = FALSE
)
return(result)
})
results %>% bind_rows()
} else {
contentFields <- c("snippet", "tokens")
if (metadataOnly) {
fields <- fields[!fields %in% contentFields]
}
if (!"textSigle" %in% fields) {
fields <- c(fields, "textSigle")
}
request <-
paste0(
"?q=",
url_encode(enc2utf8(query)),
ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
ifelse(!metadataOnly, "&show-tokens=true", ""),
"&ql=", ql
)
webUIRequestUrl <- paste0(kco@KorAPUrl, request)
requestUrl <- paste0(
kco@apiUrl,
"search",
request,
"&fields=",
paste(fields, collapse = ","),
if (metadataOnly) "&access-rewrite-disabled=true" else ""
)
log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
res <- apiCall(kco, paste0(requestUrl, "&count=0"))
if (is.null(res)) {
message("API call failed.")
totalResults <- 0
} else {
totalResults <- as.integer(res$meta$totalResults)
log_info(verbose, ": ", totalResults, " hits")
if (!is.null(res$meta$cached)) {
log_info(verbose, " [cached]\n")
} else if (!is.null(res$meta$benchmark)) {
# Round the benchmark time to 2 decimal places for better readability
# If it's a string ending with 's', extract the number, round it, and re-add 's'
if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
formatted_time <- paste0(round(time_value, 2), "s")
log_info(verbose, ", took ", formatted_time, "\n", sep = "")
} else {
# Fallback if the format is different than expected
log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
}
} else {
log_info(verbose, "\n")
}
}
if (as.df) {
data.frame(
query = query,
totalResults = totalResults,
vc = vc,
webUIRequestUrl = webUIRequestUrl,
stringsAsFactors = FALSE
)
} else {
KorAPQuery(
korapConnection = kco,
nextStartIndex = 0,
fields = fields,
requestUrl = requestUrl,
request = request,
totalResults = totalResults,
vc = vc,
apiResponse = res,
webUIRequestUrl = webUIRequestUrl,
hasMoreMatches = (totalResults > 0)
)
}
}
}
)
#' @importFrom purrr map
repair_data_structure <- function(x) {
if (is.list(x)) {
as.character(purrr::map(x, ~ if (length(.x) > 1) {
paste(.x, collapse = " ")
} else {
.x
}))
} else {
ifelse(is.na(x), "", x)
}
}
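# A minimal sketch of what repair_data_structure() does (hypothetical inputs):
#   repair_data_structure(list(c("a", "b"), "c"))  # -> c("a b", "c")
#   repair_data_structure(c("x", NA))              # -> c("x", "")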
#' Fetch the next bunch of results of a KorAP query.
#'
#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
#'
#' @family corpus search functions
#'
#' @param kqo object obtained from [corpusQuery()]
#' @param offset start offset for query results to fetch
#' @param maxFetch maximum number of query results to fetch
#' @param verbose print progress information if true
#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
#'
#' @examples
#' \dontrun{
#'
#' q <- KorAPConnection() |>
#' corpusQuery("Ameisenplage") |>
#' fetchNext()
#' q@collectedMatches
#' }
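#'
#' \dontrun{
#'
#' # Fetch up to 100 matches from pseudo-randomly ordered result pages;
#' # set.seed makes the page order reproducible
#' set.seed(42)
#' q <- KorAPConnection() |>
#'   corpusQuery("Ameisenplage") |>
#'   fetchNext(maxFetch = 100, randomizePageOrder = TRUE)
#' q@collectedMatches
#' }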
#'
#' @references
#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
#'
#' @aliases fetchNext
#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
#' @importFrom tibble enframe add_column
#' @importFrom stringr word
#' @importFrom tidyr unnest unchop pivot_wider
#' @importFrom purrr map
#' @export
setMethod("fetchNext", "KorAPQuery", function(kqo,
offset = kqo@nextStartIndex,
maxFetch = maxResultsPerPage,
verbose = kqo@korapConnection@verbose,
randomizePageOrder = FALSE) {
# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
results <- key <- name <- tmp_positions <- 0
if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
return(kqo)
}
use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
collectedMatches <- kqo@collectedMatches
# Track start time for ETA calculation
start_time <- Sys.time()
# For randomized page order, generate a list of randomized page indices
if (randomizePageOrder) {
# Calculate how many pages we need to fetch based on maxFetch
total_pages_to_fetch <- if (!is.na(maxFetch)) {
# Either limited by maxFetch or total results, whichever is smaller
min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
} else {
# All pages
ceiling(kqo@totalResults / maxResultsPerPage)
}
# Generate randomized page indices (0-based for API)
pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
page_index <- 1 # Index to track which page in the randomized list we're on
}
if (is.null(collectedMatches)) {
collectedMatches <- data.frame()
}
# Initialize the page counter properly based on nextStartIndex and any previously fetched results
# We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
# For first call, this will be 1, for subsequent calls, it will reflect our actual position
current_page_number <- ceiling(offset / maxResultsPerPage) + 1
# For sequential fetches, keep track of which global page we're on
# This is important for correctly showing page numbers in subsequent fetchNext calls
page_count_start <- current_page_number
repeat {
# Determine which page to fetch next
if (randomizePageOrder) {
# In randomized mode, get the page from our randomized list using the page_index
# Make sure we don't exceed the array bounds
if (page_index > length(pages)) {
break # No more pages to fetch in randomized mode
}
current_offset_page <- pages[page_index]
# For display purposes in randomized mode, show which page out of the total we're fetching
display_page_number <- page_index
} else {
# In sequential mode, use the current_page_number to calculate the offset
current_offset_page <- (current_page_number - 1)
display_page_number <- current_page_number
}
# Calculate the actual offset in tokens
currentOffset <- current_offset_page * maxResultsPerPage
# Build the query with the appropriate count and offset using httr2
count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
# Parse existing URL to preserve all query parameters
parsed_url <- httr2::url_parse(kqo@requestUrl)
existing_query <- parsed_url$query
# Add/update count and offset parameters
existing_query$count <- count_param
existing_query$offset <- currentOffset
# Rebuild the URL with all parameters
query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
res <- apiCall(kqo@korapConnection, query)
if (length(res$matches) == 0) {
break
}
if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
log_info(verbose, "Using fields API: ")
currentMatches <- res$matches$fields %>%
purrr::map(~ mutate(.x, value = repair_data_structure(value))) %>%
tibble::enframe() %>%
tidyr::unnest(cols = value) %>%
tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
dplyr::select(-name)
if ("snippet" %in% colnames(res$matches)) {
currentMatches$snippet <- res$matches$snippet
}
if ("tokens" %in% colnames(res$matches)) {
currentMatches$tokens <- res$matches$tokens
}
} else {
currentMatches <- res$matches
}
for (field in kqo@fields) {
if (!field %in% colnames(currentMatches)) {
currentMatches[, field] <- NA
}
}
currentMatches <- currentMatches %>%
select(dplyr::all_of(kqo@fields)) %>%
mutate(
matchID = res$matches$matchID,
tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
matchStart = as.integer(stringr::word(tmp_positions, 1)),
matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
) %>%
select(-tmp_positions)
if (!is.list(collectedMatches)) {
collectedMatches <- currentMatches
} else {
collectedMatches <- bind_rows(collectedMatches, currentMatches)
}
# Total pages are calculated consistently from the fixed maxResultsPerPage
total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
# Calculate ETA using the centralized function from logging.R
current_page <- if (randomizePageOrder) page_index else display_page_number
total_pages_to_fetch <- if (!is.na(maxFetch)) {
# Account for offset - we can only fetch from the remaining results after offset
remaining_results_after_offset <- max(0, kqo@totalResults - offset)
min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
} else {
total_pages
}
eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
# Extract timing information for display
time_per_page <- NA
if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
}
# Create the page display string with proper formatting
# For global page tracking, calculate the absolute page number
# The absolute page number is the 0-based offset page + 1, in both modes
actual_display_number <- current_offset_page + 1
# Within each fetchNext call, show page numbers relative to the current
# batch (e.g. 1/2, 2/2 rather than 3/4, 4/4 on a subsequent call)
if (randomizePageOrder) {
relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
} else {
relative_page_number <- display_page_number - (page_count_start - 1)
}
# How many pages will we fetch in this batch?
# If maxFetch is specified, calculate the total pages for this fetch operation
pages_in_this_batch <- if (!is.na(maxFetch)) {
# Account for offset - we can only fetch from the remaining results after offset
remaining_results_after_offset <- max(0, kqo@totalResults - offset)
min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
} else {
# Otherwise fetch all remaining pages
total_pages - page_count_start + 1
}
# The total pages to be shown in this batch
batch_total_pages <- pages_in_this_batch
page_display <- paste0(
"Retrieved page ",
sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
"/",
sprintf("%d", batch_total_pages)
)
# If randomized, also show which actual page we fetched
if (randomizePageOrder) {
# Determine the maximum width needed for page numbers (based on total pages)
# This ensures consistent alignment
max_page_width <- nchar(as.character(total_pages))
# Add the actual page number that was fetched (0-based + 1 for display) with proper padding
page_display <- paste0(
page_display,
sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
)
} else {
# In sequential mode, also show the absolute page number and the total
# number of pages for clarity
page_display <- paste0(page_display, sprintf(
" (page %d of %d total)",
actual_display_number, total_pages
))
}
# Add caching or timing information
if (!is.null(res$meta$cached)) {
page_display <- paste0(page_display, " [cached]")
} else {
page_display <- paste0(
page_display,
" in ",
if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
"s",
eta_info
)
}
log_info(verbose, paste0(page_display, "\n"))
# Increment the appropriate counter based on mode
if (randomizePageOrder) {
page_index <- page_index + 1
} else {
current_page_number <- current_page_number + 1
}
results <- results + res$meta$itemsPerPage
if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
break
}
}
nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
KorAPQuery(
nextStartIndex = nextStartIndex,
korapConnection = kqo@korapConnection,
fields = kqo@fields,
requestUrl = kqo@requestUrl,
request = kqo@request,
totalResults = kqo@totalResults,
vc = kqo@vc,
webUIRequestUrl = kqo@webUIRequestUrl,
hasMoreMatches = (kqo@totalResults > nextStartIndex),
apiResponse = res,
collectedMatches = collectedMatches
)
})
#' Fetch all results of a KorAP query.
#'
#' **`fetchAll`** fetches all results of a KorAP query.
#'
#' @family corpus search functions
#' @param kqo object obtained from [corpusQuery()]
#' @param verbose print progress information if true
#' @param ... further arguments passed to [fetchNext()]
#' @return The updated `kqo` object with all results in `@collectedMatches`
#'
#' @examples
#' \dontrun{
#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
#' q <- KorAPConnection() |>
#' corpusQuery("Ameisenplage") |>
#' fetchAll()
#' q@collectedMatches
#'
#' # Fetch also all KWICs
#' q <- KorAPConnection() |> auth() |>
#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#' fetchAll()
#' q@collectedMatches
#'
#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
#' q <- KorAPConnection() |>
#' corpusQuery("<base/s=t>", # this matches each text once
#' vc = "pubDate in 1958-03-12",
#' fields = c("textSigle", "title"),
#' ) |>
#' fetchAll()
#' q@collectedMatches
#' }
#'
#' @aliases fetchAll
#' @export
setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
})
#' Fetch the remaining results of a KorAP query.
#'
#' @param kqo object obtained from [corpusQuery()]
#' @param verbose print progress information if true
#' @param ... further arguments passed to [fetchNext()]
#' @return The updated `kqo` object with remaining results in `@collectedMatches`
#'
#' @examples
#' \dontrun{
#'
#' q <- KorAPConnection() |>
#' corpusQuery("Ameisenplage") |>
#' fetchRest()
#' q@collectedMatches
#' }
#'
#' @aliases fetchRest
#' @export
setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
})
#'
#' Parse XML annotations into linguistic layers
#'
#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
#' from XML annotation snippets returned by the KorAP API.
#'
#' @param xml_snippet XML string containing annotation data
#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
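#' @examples
#' \dontrun{
#' # A sketch with a hypothetical, simplified annotation snippet; snippets
#' # returned by a real KorAP server are more deeply nested
#' snippet <- paste0(
#'   '<span class="match">',
#'   '<span title="tt/l:Ameise tt/p:NN">Ameisen</span>',
#'   '</span>'
#' )
#' parse_xml_annotations(snippet)
#' # list(token = "Ameisen", lemma = "Ameise", pos = "NN", morph = NA)
#' }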
#' @keywords internal
parse_xml_annotations <- function(xml_snippet) {
if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
}
# Extract content within <span class="match">...</span> using a more robust approach
if (grepl('<span class="match">', xml_snippet)) {
# Find the start of match span
start_pos <- regexpr('<span class="match">', xml_snippet)
if (start_pos > 0) {
# Find the end by counting nested spans
content_start <- start_pos + attr(start_pos, "match.length")
remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
# Simple approach: extract everything until we hit context-right or end
if (grepl('<span class="context-right">', remaining)) {
content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining)
} else {
# Find the closing </span> that matches our opening span
# For now, use a simpler approach - take everything until the last </span> sequence
content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining)
}
} else {
content_to_parse <- xml_snippet
}
} else {
content_to_parse <- xml_snippet
}
# Initialize result vectors
tokens <- character(0)
lemmas <- character(0)
pos_tags <- character(0)
morph_tags <- character(0)
# Split the content by </span> and process each meaningful part
parts <- unlist(strsplit(content_to_parse, '</span>'))
for (part in parts) {
part <- trimws(part)
if (nchar(part) == 0) next
# Look for parts that have title attributes and end with text
if (grepl('<span[^>]*title=', part)) {
# Extract the text content (everything after the last >)
text_content <- gsub('.*>([^<]*)$', '\\1', part)
text_content <- trimws(text_content)
if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
tokens <- c(tokens, text_content)
# Extract all title attributes from this part
title_pattern <- 'title="([^"]*)"'
title_matches <- gregexpr(title_pattern, part)
lemma <- NA
pos_tag <- NA
morph_tag <- NA
if (title_matches[[1]][1] != -1) {
all_titles <- regmatches(part, title_matches)[[1]]
for (title_match in all_titles) {
title_content <- gsub(title_pattern, '\\1', title_match)
# Split by spaces and process each annotation
annotations <- unlist(strsplit(title_content, "\\s+"))
for (annotation in annotations) {
if (grepl('^[^/]+/l:', annotation)) {
lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
} else if (grepl('^[^/]+/p:', annotation)) {
pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
} else if (grepl('^[^/]+/m:', annotation)) {
morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
}
}
}
}
lemmas <- c(lemmas, lemma)
pos_tags <- c(pos_tags, pos_tag)
morph_tags <- c(morph_tags, morph_tag)
}
}
}
# If no tokens found with the splitting approach, try a different method
if (length(tokens) == 0) {
# Look for the innermost spans that contain actual text
innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
if (innermost_matches[[1]][1] != -1) {
matches <- regmatches(content_to_parse, innermost_matches)[[1]]
for (match in matches) {
title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
text <- trimws(text)
if (nchar(text) > 0) {
tokens <- c(tokens, text)
# Parse space-separated annotations in title
lemma <- NA
pos_tag <- NA
morph_tag <- NA
annotations <- unlist(strsplit(title, "\\s+"))
for (annotation in annotations) {
if (grepl('^[^/]+/l:', annotation)) {
lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
} else if (grepl('^[^/]+/p:', annotation)) {
pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
} else if (grepl('^[^/]+/m:', annotation)) {
morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
}
}
lemmas <- c(lemmas, lemma)
pos_tags <- c(pos_tags, pos_tag)
morph_tags <- c(morph_tags, morph_tag)
}
}
}
}
# Ensure all vectors have the same length
max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
if (max_length > 0) {
tokens <- c(tokens, rep(NA, max_length - length(tokens)))
lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
}
return(list(
token = tokens,
lemma = lemmas,
pos = pos_tags,
morph = morph_tags
))
}
#'
#' Parse XML annotations into linguistic layers with left/match/right structure
#'
#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
#' from XML annotation snippets returned by the KorAP API, split into left context,
#' match, and right context sections like the tokens field.
#'
#' @param xml_snippet XML string containing annotation data
#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
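#' @examples
#' \dontrun{
#' # A sketch with a hypothetical snippet: one left-context token and a
#' # marked match (real server snippets are more complex)
#' snippet <- paste0(
#'   '<span class="match">',
#'   '<span title="tt/l:die tt/p:ART">Die</span> ',
#'   '<mark><span title="tt/l:Ameise tt/p:NN">Ameise</span></mark>',
#'   '</span><span class="context-right"></span>'
#' )
#' res <- parse_xml_annotations_structured(snippet)
#' res$lemma$left # "die"
#' res$lemma$match # "Ameise"
#' }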
#' @keywords internal
parse_xml_annotations_structured <- function(xml_snippet) {
if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
empty_result <- list(left = character(0), match = character(0), right = character(0))
return(list(
atokens = empty_result,
lemma = empty_result,
pos = empty_result,
morph = empty_result
))
}
# Helper function to extract annotations from a span section
extract_annotations_from_section <- function(section_content) {
# Handle both spaced tokens and nested single tokens
tokens <- character(0)
lemmas <- character(0)
pos_tags <- character(0)
morph_tags <- character(0)
# First try to split by spaces between span groups (for multiple tokens)
# Look for spaces that separate token groups
if (grepl('</span>\\s+<span', section_content)) {
# Multiple tokens separated by spaces
token_groups <- unlist(strsplit(section_content, '(?<=</span>)\\s+(?=<span)', perl = TRUE))
} else {
# Single token (or no spaces between tokens)
token_groups <- c(section_content)
}
for (group in token_groups) {
group <- trimws(group)
if (nchar(group) == 0) next
# Extract the actual text content (the innermost text)
text_match <- regexpr('>([^<>]+)</span>', group, perl = TRUE)
if (text_match > 0) {
# Find all possible text contents and take the last one (innermost)
all_texts <- regmatches(group, gregexpr('>([^<>]+)</span>', group, perl = TRUE))[[1]]
if (length(all_texts) > 0) {
# Take the last match (innermost text)
text_content <- sub('.*>([^<>]+)</span>.*', '\\1', all_texts[length(all_texts)], perl = TRUE)
text_content <- trimws(text_content)
if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
tokens <- c(tokens, text_content)
# Extract all title attributes from this group
titles <- regmatches(group, gregexpr('title="([^"]*)"', group, perl = TRUE))[[1]]
morph_features <- character(0)
lemma <- NA
pos_tag <- NA
for (title in titles) {
content <- sub('title="([^"]*)"', '\\1', title, perl = TRUE)
if (grepl('^[^/]+/l:', content)) {
lemma <- sub('^[^/]+/l:(.*)$', '\\1', content)
} else if (grepl('^[^/]+/p:', content)) {
pos_tag <- sub('^[^/]+/p:(.*)$', '\\1', content)
} else if (grepl('^[^/]+/m:', content)) {
morph_feature <- sub('^[^/]+/m:(.*)$', '\\1', content)
morph_features <- c(morph_features, morph_feature)
}
}
lemmas <- c(lemmas, lemma)
pos_tags <- c(pos_tags, pos_tag)
morph_tag <- if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA
morph_tags <- c(morph_tags, morph_tag)
}
}
}
}
# Ensure all vectors have the same length
max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
if (max_length > 0) {
tokens <- c(tokens, rep(NA, max_length - length(tokens)))
lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
}
return(list(
tokens = tokens,
lemmas = lemmas,
pos_tags = pos_tags,
morph_tags = morph_tags
))
}
# Split the XML into three parts: left context, match content, and right context
# The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
# First extract the content within the match span using DOTALL modifier
match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">'
match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
if (match_span_match == -1) {
# Try alternative pattern if no context-right
match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$'
match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
}
if (match_span_match > 0) {
match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
# Now find the <mark> and </mark> positions within this content
mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
mark_end <- regexpr('</mark>', match_span_content, perl = TRUE)
if (mark_start > 0 && mark_end > 0) {
# Left context: everything before <mark>
left_content <- substr(match_span_content, 1, mark_start - 1)
# Match content: everything between <mark> and </mark> (including the mark tags for now)
match_content <- substr(match_span_content, mark_start, mark_end + attr(mark_end, "match.length") - 1)
# Right context: everything after </mark>
right_content_start <- mark_end + attr(mark_end, "match.length")
right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
} else {
# No mark tags found, treat entire match span as match content
left_content <- ""
match_content <- match_span_content
right_content <- ""
}
} else {
# No match span found, treat entire content as match
left_content <- ""
match_content <- xml_snippet
right_content <- ""
}
# Process each section
left_annotations <- extract_annotations_from_section(left_content)
match_annotations <- extract_annotations_from_section(match_content)
right_annotations <- extract_annotations_from_section(right_content)
return(list(
atokens = list(
left = left_annotations$tokens,
match = match_annotations$tokens,
right = right_annotations$tokens
),
lemma = list(
left = left_annotations$lemmas,
match = match_annotations$lemmas,
right = right_annotations$lemmas
),
pos = list(
left = left_annotations$pos_tags,
match = match_annotations$pos_tags,
right = right_annotations$pos_tags
),
morph = list(
left = left_annotations$morph_tags,
match = match_annotations$morph_tags,
right = right_annotations$morph_tags
)
))
}
#' Fetch annotations for all collected matches
#'
#' **`fetchAnnotations`** fetches annotations for all matches in the `@collectedMatches` slot
#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
#' data frame. The method automatically uses the `matchID` from collected matches when
#' available for safer and more reliable annotation retrieval, falling back to constructing
#' URLs from `matchStart` and `matchEnd` if necessary.
#'
#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
#' available for annotation parsing.
#'
#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
#' - `annotation_snippet`: original XML snippet from the annotation API
#'
#' @family corpus search functions
#' @aliases fetchAnnotations
#'
#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
#' @param verbose print progress information if true
#' @return The updated `kqo` object with annotation columns added to `@collectedMatches`
#'
#' @examples
#' \dontrun{
#'
#' # Fetch annotations for matches using Tree-Tagger foundry
#' # Note: Authorization required for copyright-restricted corpora
#' q <- KorAPConnection() |>
#' auth() |>
#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#' fetchNext(maxFetch = 10) |>
#' fetchAnnotations()
#'
#' # Access linguistic annotations for match i:
#' pos_tags <- q@collectedMatches$pos # Data frame with left/match/right columns for POS tags
#' lemmas <- q@collectedMatches$lemma # Data frame with left/match/right columns for lemmas
#' morphology <- q@collectedMatches$morph # Data frame with left/match/right columns for morphological tags
#' atokens <- q@collectedMatches$atokens # Data frame with left/match/right columns for annotation token text
#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i
#'
#' # Access specific components:
#' match_pos <- q@collectedMatches$pos$match[[i]] # POS tags for the matched tokens in match i
#' left_lemmas <- q@collectedMatches$lemma$left[[i]] # Lemmas for the left context in match i
#' right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i
#'
#' # Use a different foundry (e.g., mate-parser)
#' q <- KorAPConnection() |>
#' auth() |>
#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#' fetchNext(maxFetch = 10) |>
#' fetchAnnotations(foundry = "mate")
#' q@collectedMatches
#' }
#' @export
setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) {
if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
return(kqo)
}
df <- kqo@collectedMatches
kco <- kqo@korapConnection
# Initialize annotation columns as data frames with left/match/right list
# columns (mirroring the structure of the tokens field)
nrows <- nrow(df)
emptyAnnotationFrame <- function(n) {
data.frame(
left = I(replicate(n, character(0), simplify = FALSE)),
match = I(replicate(n, character(0), simplify = FALSE)),
right = I(replicate(n, character(0), simplify = FALSE)),
stringsAsFactors = FALSE
)
}
df$pos <- emptyAnnotationFrame(nrows)
df$lemma <- emptyAnnotationFrame(nrows)
df$morph <- emptyAnnotationFrame(nrows)
df$atokens <- emptyAnnotationFrame(nrows)
df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)
for (i in seq_len(nrow(df))) {
# Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
# matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
# "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
# Extract document path and position, handling both regular and encrypted formats
# More flexible regex to extract the document path with position and encryption
# Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
# We need to capture the entire path including the encrypted suffix
match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
if (match_result > 0) {
# Extract the complete path including encryption (everything after "match-")
doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
# Convert the dash before position to slash, but keep everything after the position
match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
req <- paste0(kco@apiUrl, "corpus/", match_path, "?foundry=", foundry)
} else {
# If regex fails, fall back to the old method
req <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", df$matchStart[i], "-", df$matchEnd[i], "?foundry=", foundry)
}
} else {
# Fallback to the old method
req <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", df$matchStart[i], "-", df$matchEnd[i], "?foundry=", foundry)
}
tryCatch({
res <- apiCall(kco, req)
if (!is.null(res)) {
# Store the raw annotation snippet
df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
# Parse XML annotations if snippet is available
if (is.list(res) && "snippet" %in% names(res)) {
parsed_annotations <- parse_xml_annotations_structured(res$snippet)
# Store the parsed linguistic data in data frame format (like tokens)
# Use individual assignment to avoid data frame mismatch errors
tryCatch({
# Assign POS annotations
df$pos$left[i] <- list(parsed_annotations$pos$left)
df$pos$match[i] <- list(parsed_annotations$pos$match)
df$pos$right[i] <- list(parsed_annotations$pos$right)
# Assign lemma annotations
df$lemma$left[i] <- list(parsed_annotations$lemma$left)
df$lemma$match[i] <- list(parsed_annotations$lemma$match)
df$lemma$right[i] <- list(parsed_annotations$lemma$right)
# Assign morphology annotations
df$morph$left[i] <- list(parsed_annotations$morph$left)
df$morph$match[i] <- list(parsed_annotations$morph$match)
df$morph$right[i] <- list(parsed_annotations$morph$right)
# Assign token annotations
df$atokens$left[i] <- list(parsed_annotations$atokens$left)
df$atokens$match[i] <- list(parsed_annotations$atokens$match)
df$atokens$right[i] <- list(parsed_annotations$atokens$right)
}, error = function(assign_error) {
# Set empty character vectors on assignment error using list assignment
df$pos$left[i] <<- list(character(0))
df$pos$match[i] <<- list(character(0))
df$pos$right[i] <<- list(character(0))
df$lemma$left[i] <<- list(character(0))
df$lemma$match[i] <<- list(character(0))
df$lemma$right[i] <<- list(character(0))
df$morph$left[i] <<- list(character(0))
df$morph$match[i] <<- list(character(0))
df$morph$right[i] <<- list(character(0))
df$atokens$left[i] <<- list(character(0))
df$atokens$match[i] <<- list(character(0))
df$atokens$right[i] <<- list(character(0))
})
} else {
# No snippet available, store empty vectors
df$pos$left[i] <- list(character(0))
df$pos$match[i] <- list(character(0))
df$pos$right[i] <- list(character(0))
df$lemma$left[i] <- list(character(0))
df$lemma$match[i] <- list(character(0))
df$lemma$right[i] <- list(character(0))
df$morph$left[i] <- list(character(0))
df$morph$match[i] <- list(character(0))
df$morph$right[i] <- list(character(0))
df$atokens$left[i] <- list(character(0))
df$atokens$match[i] <- list(character(0))
df$atokens$right[i] <- list(character(0))
}
} else {
# Store NAs for failed requests
df$pos$left[i] <- list(NA)
df$pos$match[i] <- list(NA)
df$pos$right[i] <- list(NA)
df$lemma$left[i] <- list(NA)
df$lemma$match[i] <- list(NA)
df$lemma$right[i] <- list(NA)
df$morph$left[i] <- list(NA)
df$morph$match[i] <- list(NA)
df$morph$right[i] <- list(NA)
df$atokens$left[i] <- list(NA)
df$atokens$match[i] <- list(NA)
df$atokens$right[i] <- list(NA)
df$annotation_snippet[[i]] <- NA
}
}, error = function(e) {
# Store NAs for failed requests; assign into the enclosing frame, since a
# plain <- inside the handler would only modify a local copy of df
df$pos$left[i] <<- list(NA)
df$pos$match[i] <<- list(NA)
df$pos$right[i] <<- list(NA)
df$lemma$left[i] <<- list(NA)
df$lemma$match[i] <<- list(NA)
df$lemma$right[i] <<- list(NA)
df$morph$left[i] <<- list(NA)
df$morph$match[i] <<- list(NA)
df$morph$right[i] <<- list(NA)
df$atokens$left[i] <<- list(NA)
df$atokens$match[i] <<- list(NA)
df$atokens$right[i] <<- list(NA)
df$annotation_snippet[[i]] <<- NA
})
}
# Validate data frame structure before assignment: adding annotations must
# not change the number of matches
if (nrow(df) != nrow(kqo@collectedMatches)) {
warning("Row count changed unexpectedly while adding annotation columns.")
}
# Update the collectedMatches with annotation data
tryCatch({
kqo@collectedMatches <- df
}, error = function(assign_error) {
# Fallback: add the columns individually; assign into the enclosing frame,
# otherwise the handler would only modify a local copy of kqo
tryCatch({
kqo@collectedMatches$pos <<- df$pos
kqo@collectedMatches$lemma <<- df$lemma
kqo@collectedMatches$morph <<- df$morph
kqo@collectedMatches$atokens <<- df$atokens
kqo@collectedMatches$annotation_snippet <<- df$annotation_snippet
}, error = function(col_error) {
warning("Failed to add annotation data to collectedMatches")
})
})
return(kqo)
})
#' Query frequencies of search expressions in virtual corpora
#'
#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
#' [ci()] to compute a tibble with the absolute and relative frequencies and
#' confidence intervals of one or multiple search terms across one or multiple
#' virtual corpora.
#'
#' @family frequency analysis
#' @aliases frequencyQuery
#' @examples
#' \dontrun{
#'
#' KorAPConnection(verbose = TRUE) |>
#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
#' }
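#'
#' \dontrun{
#'
#' # Treat the query terms as alternatives: the sum of their hits, instead of
#' # the vc token counts, is used as the total for the relative frequencies
#' KorAPConnection(verbose = TRUE) |>
#'   frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003),
#'     as.alternatives = TRUE
#'   )
#' }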
#'
# @inheritParams corpusQuery
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
#' @param query corpus query string(s) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
#' @param vc virtual corpus definition(s) (can be a vector)
#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
#' @param as.alternatives logical that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes, is used as the total for the calculation of relative frequencies.
#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
#' @export
#'
#' @return A tibble, with each row containing the following result columns for query and vc combinations:
#' - **query**: the query string used for the frequency analysis.
#' - **totalResults**: absolute frequency of query matches in the vc.
#' - **vc**: virtual corpus used for the query.
#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
#' - **total**: total number of words in vc.
#' - **f**: relative frequency of query matches in the vc.
#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
setMethod(
"frequencyQuery", "KorAPConnection",
function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
(if (as.alternatives) {
corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
group_by(vc) |>
mutate(total = sum(totalResults))
} else {
corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
}) |>
ci(conf.level = conf.level)
}
)
#' buildWebUIRequestUrlFromString
#'
#' @rdname KorAPQuery-class
#' @importFrom urltools url_encode
#' @export
buildWebUIRequestUrlFromString <- function(KorAPUrl,
query,
vc = "",
ql = "poliqarp") {
if ("KorAPConnection" %in% class(KorAPUrl)) {
KorAPUrl <- KorAPUrl@KorAPUrl
}
request <-
paste0(
"?q=",
urltools::url_encode(enc2utf8(as.character(query))),
ifelse(vc != "",
paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
""
),
"&ql=",
ql
)
paste0(KorAPUrl, request)
}
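# A usage sketch (hypothetical values):
#   buildWebUIRequestUrlFromString("https://korap.ids-mannheim.de/",
#     query = "Ameise", vc = "pubDate since 2017"
#   )
# # -> "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate%20since%202017&ql=poliqarp"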
#' buildWebUIRequestUrl
#'
#' @rdname KorAPQuery-class
#' @importFrom httr2 url_parse
#' @export
buildWebUIRequestUrl <- function(kco,
query = if (missing(KorAPUrl)) {
stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
} else {
httr2::url_parse(KorAPUrl)$query$q
},
vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
KorAPUrl,
ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
}
#' format()
#' @rdname KorAPQuery-class
#' @param x KorAPQuery object
#' @param ... further arguments passed to or from other methods
#' @importFrom urltools param_get url_decode
#' @export
format.KorAPQuery <- function(x, ...) {
cat("<KorAPQuery>\n")
q <- x
param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
cat(" Query: ", param$q, "\n")
if (!is.null(param$cq) && param$cq != "") {
cat(" Virtual corpus: ", param$cq, "\n")
}
if (!is.null(q@collectedMatches)) {
cat("==============================================================================================================", "\n")
print(summary(q@collectedMatches))
cat("==============================================================================================================", "\n")
}
cat(" Total results: ", q@totalResults, "\n")
cat(" Fetched results: ", q@nextStartIndex, "\n")
if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
parsed_annotations <- sum(vapply(q@collectedMatches$pos$match, function(x) length(x) > 0 && !all(is.na(x)), logical(1)))
cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
if (parsed_annotations > 0) {
cat(" (", parsed_annotations, " with parsed linguistic data)")
}
cat("\n")
}
}
#' show()
#'
#' @rdname KorAPQuery-class
#' @param object KorAPQuery object
#' @export
setMethod("show", "KorAPQuery", function(object) {
format(object)
invisible(object)
})