blob: fa788849659337d6dd9158910f1f95c08158f1b6 [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
Marc Kupietza29f3d42025-07-18 10:14:43 +020024 "hasMoreMatches"
Marc Kupietze95108e2019-09-18 13:23:58 +020025))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
Marc Kupietza29f3d42025-07-18 10:14:43 +020050 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
Marc Kupietzd8851222025-05-01 10:57:19 +020051 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietz0af75932025-09-09 18:14:16 +020071setGeneric(
72 "fetchAnnotations",
73 function(kqo,
74 foundry = "tt",
75 overwrite = FALSE,
76 verbose = kqo@korapConnection@verbose) standardGeneric("fetchAnnotations")
77)
Marc Kupietzd8851222025-05-01 10:57:19 +020078setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020079
80maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020081
Marc Kupietz4de53ec2019-10-04 09:12:00 +020082## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010083utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020084
Marc Kupietza8c40f42025-06-24 15:49:52 +020085#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020087#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020088#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020089#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020090#' @aliases corpusQuery
91#'
92#' @importFrom urltools url_encode
93#' @importFrom purrr pmap
Marc Kupietzea34b812025-06-25 15:49:00 +020094#' @importFrom dplyr bind_rows group_by
Marc Kupietzdbd431a2021-08-29 12:17:45 +020095#'
Marc Kupietz617266d2025-02-27 10:43:07 +010096#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020097#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020098#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020099#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +0200100#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
101#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
102#' your RKorAPClient application as explained in the
103#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
104#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
105#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200106#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200107#' @param fields character vector specifying which metadata fields to retrieve for each match.
108#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
109#' \describe{
110#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
111#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
112#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
113#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
114#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
115#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
116#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
117#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
118#' }
119#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
120#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100121#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200122#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200123#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200124#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200125#' @param context string that specifies the size of the left and the right context returned in `snippet`
126#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
127#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
128#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
129#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
130#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200131#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200132#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
133#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200134#'
135#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200136#' \dontrun{
137#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200138#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200139#' KorAPConnection() |>
140#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200141#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200142#'
143#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200144#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200145#' corpusQuery("Ameisenplage",
146#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
147#' results <- fetchAll(query)
148#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100149#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200150#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200151#' \dontrun{
152#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200153#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
154#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200155#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200156#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200157#' corpusQuery(
158#' KorAPUrl =
159#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
160#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200161#' }
162#'
163#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200164#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200165#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200166#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200167#' {
168#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200169#' } |>
170#' corpusQuery("Ameisenplage") |>
171#' fetchAll() |>
172#' slot("collectedMatches") |>
173#' mutate(year = lubridate::year(pubDate)) |>
174#' dplyr::select(year) |>
175#' group_by(year) |>
176#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200177#' mutate(Freq = mapply(function(f, y) {
178#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200179#' }, Count, year)) |>
180#' dplyr::select(-Count) |>
181#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200182#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100183#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200184#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200185#'
186#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200187#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200188#'
189#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200190setMethod(
191 "corpusQuery", "KorAPConnection",
192 function(kco,
193 query = if (missing(KorAPUrl)) {
194 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
195 } else {
196 httr2::url_parse(KorAPUrl)$query$q
197 },
198 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
199 KorAPUrl,
200 metadataOnly = TRUE,
201 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
202 fields = c(
203 "corpusSigle",
204 "textSigle",
205 "pubDate",
206 "pubPlace",
207 "availability",
208 "textClass",
209 "snippet",
210 "tokens"
211 ),
212 accessRewriteFatal = TRUE,
213 verbose = kco@verbose,
214 expand = length(vc) != length(query),
215 as.df = FALSE,
216 context = NULL) {
217 if (length(query) > 1 || length(vc) > 1) {
218 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200219
220 # Initialize timing variables for ETA calculation
221 total_queries <- nrow(grid)
222 current_query <- 0
223 start_time <- Sys.time()
224
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200225 results <- purrr::pmap(grid, function(query, vc, ...) {
226 current_query <<- current_query + 1
227
228 # Execute the single query directly (avoiding recursive call)
229 contentFields <- c("snippet", "tokens")
230 query_fields <- fields
231 if (metadataOnly) {
232 query_fields <- query_fields[!query_fields %in% contentFields]
233 }
234 if (!"textSigle" %in% query_fields) {
235 query_fields <- c(query_fields, "textSigle")
236 }
237 request <-
238 paste0(
239 "?q=",
240 url_encode(enc2utf8(query)),
241 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
242 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
243 ifelse(!metadataOnly, "&show-tokens=true", ""),
244 "&ql=", ql
245 )
246 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
247 requestUrl <- paste0(
248 kco@apiUrl,
249 "search",
250 request,
251 "&fields=",
252 paste(query_fields, collapse = ","),
253 if (metadataOnly) "&access-rewrite-disabled=true" else ""
254 )
255
256 # Show individual query progress
257 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
258 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
259 if (is.null(res)) {
260 log_info(verbose, ": API call failed\n")
261 totalResults <- 0
262 } else {
Marc Kupietz336c85d2025-07-24 13:52:03 +0200263 # Check for query rewrites and warn the user
264 warnOnRewrites(res)
265
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200266 totalResults <- as.integer(res$meta$totalResults)
267 log_info(verbose, ": ", totalResults, " hits")
268 if (!is.null(res$meta$cached)) {
269 log_info(verbose, " [cached]")
270 } else if (!is.null(res$meta$benchmark)) {
271 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
272 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
273 formatted_time <- paste0(round(time_value, 2), "s")
274 log_info(verbose, ", took ", formatted_time)
275 } else {
276 log_info(verbose, ", took ", res$meta$benchmark)
277 }
278 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200279
280 # Calculate and display ETA information on the same line if verbose and we have more than one query
281 if (verbose && total_queries > 1) {
282 eta_info <- calculate_eta(current_query, total_queries, start_time)
283 if (eta_info != "") {
284 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
285 avg_time_per_query <- elapsed_time / current_query
286
287 # Add ETA info to the same line - remove the leading ". " for cleaner formatting
288 clean_eta_info <- sub("^\\. ", ". ", eta_info)
289 log_info(verbose, clean_eta_info)
290 }
291 }
292
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200293 log_info(verbose, "\n")
294 }
295
296 result <- data.frame(
297 query = query,
298 totalResults = totalResults,
299 vc = vc,
300 webUIRequestUrl = webUIRequestUrl,
301 stringsAsFactors = FALSE
302 )
303
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200304 return(result)
305 })
306
307 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200308 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200309 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100310 if (metadataOnly) {
311 fields <- fields[!fields %in% contentFields]
312 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100313 if (!"textSigle" %in% fields) {
314 fields <- c(fields, "textSigle")
315 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100316 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200317 paste0(
318 "?q=",
319 url_encode(enc2utf8(query)),
320 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
321 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
322 ifelse(!metadataOnly, "&show-tokens=true", ""),
323 "&ql=", ql
324 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100325 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
326 requestUrl <- paste0(
327 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200328 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100329 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200330 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100331 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200332 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100333 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200334 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
335 sep =
336 ""
337 )
338 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100339 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100340 message("API call failed.")
341 totalResults <- 0
342 } else {
Marc Kupietz336c85d2025-07-24 13:52:03 +0200343 # Check for query rewrites and warn the user
344 warnOnRewrites(res)
345
Marc Kupietzd8851222025-05-01 10:57:19 +0200346 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200347 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200348 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200349 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200350 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200351 # Round the benchmark time to 2 decimal places for better readability.
352 # Be robust to locales using comma as decimal separator (e.g., "0,12s").
Marc Kupietz7638ca42025-05-25 13:18:16 +0200353 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200354 bench_str <- sub("s$", "", res$meta$benchmark)
355 bench_num <- suppressWarnings(as.numeric(gsub(",", ".", bench_str)))
356 if (!is.na(bench_num)) {
357 formatted_time <- paste0(round(bench_num, 2), "s")
358 } else {
359 formatted_time <- res$meta$benchmark
360 }
Marc Kupietz7638ca42025-05-25 13:18:16 +0200361 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
362 } else {
363 # Fallback if the format is different than expected
364 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
365 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200366 } else {
367 log_info(verbose, "\n")
368 }
Marc Kupietza4675722022-02-23 23:55:15 +0100369 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200370 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100371 data.frame(
372 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100373 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100374 vc = vc,
375 webUIRequestUrl = webUIRequestUrl,
376 stringsAsFactors = FALSE
377 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200378 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100379 KorAPQuery(
380 korapConnection = kco,
381 nextStartIndex = 0,
382 fields = fields,
383 requestUrl = requestUrl,
384 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100385 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100386 vc = vc,
387 apiResponse = res,
388 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100389 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100390 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200391 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100392 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200393 }
394)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200395
Marc Kupietz336c85d2025-07-24 13:52:03 +0200396# Helper function to check if a query rewrite warning should be shown
397warnOnRewrites <- function(res) {
398 if (!is.null(res$collection$rewrites)) {
399 comment <- res$collection$rewrites$`_comment`
400 # Only show warning if it's not just the standard policy message
401 if (!is.null(comment) && comment != "All corpus access policy has been added.") {
402 warning(res$collection$rewrites$editor, " had to rewrite your query: ", comment)
403 }
404 }
405}
406
Marc Kupietz05a60792024-12-07 16:23:31 +0100407#' @importFrom purrr map
408repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200409 if (is.list(x)) {
410 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100411 paste(.x, collapse = " ")
412 } else {
413 .x
414 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200415 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100416 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200417 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100418}
419
Marc Kupietz62da2b52019-09-12 17:43:34 +0200420#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200421#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200422#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200423#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200424#' @family corpus search functions
425#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200426#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200427#' @param offset start offset for query results to fetch
428#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200429#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200430#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
431#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200432#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100433#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200434#' \dontrun{
435#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200436#' q <- KorAPConnection() |>
437#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200438#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100439#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100440#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100441#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200442#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200443#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200444#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200445#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200446#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100447#' @importFrom tibble enframe add_column
448#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200449#' @importFrom tidyr unnest unchop pivot_wider
450#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200451#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200452setMethod("fetchNext", "KorAPQuery", function(kqo,
453 offset = kqo@nextStartIndex,
454 maxFetch = maxResultsPerPage,
455 verbose = kqo@korapConnection@verbose,
456 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100457 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200458 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100459
Marc Kupietze95108e2019-09-18 13:23:58 +0200460 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
461 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200462 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200463 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200464 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200465 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200466
Marc Kupietz24799fd2025-06-25 14:15:36 +0200467 # Track start time for ETA calculation
468 start_time <- Sys.time()
469
Marc Kupietz623d7122025-05-25 12:46:12 +0200470 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200471 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200472 # Calculate how many pages we need to fetch based on maxFetch
473 total_pages_to_fetch <- if (!is.na(maxFetch)) {
474 # Either limited by maxFetch or total results, whichever is smaller
475 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
476 } else {
477 # All pages
478 ceiling(kqo@totalResults / maxResultsPerPage)
479 }
480
481 # Generate randomized page indices (0-based for API)
482 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
483 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200484 }
485
Marc Kupietzd8851222025-05-01 10:57:19 +0200486 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200487 collectedMatches <- data.frame()
488 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200489
490 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
491 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
492 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
493 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
494
495 # For sequential fetches, keep track of which global page we're on
496 # This is important for correctly showing page numbers in subsequent fetchNext calls
497 page_count_start <- current_page_number
498
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200499 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200500 # Determine which page to fetch next
501 if (randomizePageOrder) {
502 # In randomized mode, get the page from our randomized list using the page_index
503 # Make sure we don't exceed the array bounds
504 if (page_index > length(pages)) {
505 break # No more pages to fetch in randomized mode
506 }
507 current_offset_page <- pages[page_index]
508 # For display purposes in randomized mode, show which page out of the total we're fetching
509 display_page_number <- page_index
510 } else {
511 # In sequential mode, use the current_page_number to calculate the offset
512 current_offset_page <- (current_page_number - 1)
513 display_page_number <- current_page_number
514 }
515
516 # Calculate the actual offset in tokens
517 currentOffset <- current_offset_page * maxResultsPerPage
518
Marc Kupietzef0e9392025-06-18 12:21:49 +0200519 # Build the query with the appropriate count and offset using httr2
520 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200521
Marc Kupietzef0e9392025-06-18 12:21:49 +0200522 # Parse existing URL to preserve all query parameters
523 parsed_url <- httr2::url_parse(kqo@requestUrl)
524 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200525
Marc Kupietzef0e9392025-06-18 12:21:49 +0200526 # Add/update count and offset parameters
527 existing_query$count <- count_param
528 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200529
Marc Kupietzef0e9392025-06-18 12:21:49 +0200530 # Rebuild the URL with all parameters
531 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz336c85d2025-07-24 13:52:03 +0200532
Marc Kupietz68170952021-06-30 09:37:21 +0200533 res <- apiCall(kqo@korapConnection, query)
534 if (length(res$matches) == 0) {
535 break
536 }
537
Marc Kupietz336c85d2025-07-24 13:52:03 +0200538 # Check for query rewrites and warn the user
539 warnOnRewrites(res)
540
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200541 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100542 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100543 currentMatches <- res$matches$fields %>%
544 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
545 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200546 tidyr::unnest(cols = value) %>%
547 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200548 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200549 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200550 currentMatches$snippet <- res$matches$snippet
551 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100552 if ("tokens" %in% colnames(res$matches)) {
553 currentMatches$tokens <- res$matches$tokens
554 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200555 } else {
556 currentMatches <- res$matches
557 }
558
Marc Kupietze95108e2019-09-18 13:23:58 +0200559 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200560 if (!field %in% colnames(currentMatches)) {
561 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200562 }
563 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100564 currentMatches <- currentMatches %>%
565 select(kqo@fields) %>%
566 mutate(
Marc Kupietzff712a92025-07-18 09:07:23 +0200567 matchID = res$matches$matchID,
Marc Kupietz0447da02025-01-08 20:51:09 +0100568 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100569 matchStart = as.integer(stringr::word(tmp_positions, 1)),
570 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
571 ) %>%
572 select(-tmp_positions)
573
Marc Kupietz62da2b52019-09-12 17:43:34 +0200574 if (!is.list(collectedMatches)) {
575 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200576 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200577 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200578 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200579
Marc Kupietz336c85d2025-07-24 13:52:03 +0200580
Marc Kupietz623d7122025-05-25 12:46:12 +0200581 # Get the actual items per page from the API response
582 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200583
Marc Kupietz623d7122025-05-25 12:46:12 +0200584 # Calculate total pages consistently using fixed maxResultsPerPage
585 # This ensures consistent page counting across the function
586 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
587
Marc Kupietz24799fd2025-06-25 14:15:36 +0200588 # Calculate ETA using the centralized function from logging.R
589 current_page <- if (randomizePageOrder) page_index else display_page_number
590 total_pages_to_fetch <- if (!is.na(maxFetch)) {
591 # Account for offset - we can only fetch from the remaining results after offset
592 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
593 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
594 } else {
595 total_pages
596 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200597
Marc Kupietz24799fd2025-06-25 14:15:36 +0200598 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz365660e2025-06-25 15:09:55 +0200599
Marc Kupietz24799fd2025-06-25 14:15:36 +0200600 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200601 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200602 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200603 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200604 }
605
Marc Kupietz623d7122025-05-25 12:46:12 +0200606 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200607
Marc Kupietz623d7122025-05-25 12:46:12 +0200608 # For global page tracking, calculate the absolute page number
609 actual_display_number <- if (randomizePageOrder) {
610 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
611 } else {
612 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
613 current_offset_page + 1
614 }
615
616 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
617 # based on the current batch being fetched
618
619 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
620 # Simply count from 1 within the current batch
621
622 # The relative page number is simply the current position in this batch
623 if (randomizePageOrder) {
624 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
625 } else {
626 relative_page_number <- display_page_number - (page_count_start - 1)
627 }
628
629 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200630 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200631 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200632 # Account for offset - we can only fetch from the remaining results after offset
633 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
634 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200635 } else {
636 # Otherwise fetch all remaining pages
637 total_pages - page_count_start + 1
638 }
639
640 # The total pages to be shown in this batch
641 batch_total_pages <- pages_in_this_batch
642
643 page_display <- paste0(
644 "Retrieved page ",
645 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
646 "/",
647 sprintf("%d", batch_total_pages)
648 )
649
650 # If randomized, also show which actual page we fetched
651 if (randomizePageOrder) {
652 # Determine the maximum width needed for page numbers (based on total pages)
653 # This ensures consistent alignment
654 max_page_width <- nchar(as.character(total_pages))
655 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200656 page_display <- paste0(
657 page_display,
658 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
659 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200660 }
661 # Always show the absolute page number and total pages (for clarity)
662 else {
663 # Show the absolute page number (out of total possible pages)
664 page_display <- paste0(page_display, sprintf(
665 " (page %d of %d total)",
666 actual_display_number, total_pages
667 ))
668 }
669
670 # Add caching or timing information
671 if (!is.null(res$meta$cached)) {
672 page_display <- paste0(page_display, " [cached]")
673 } else {
674 page_display <- paste0(
675 page_display,
676 " in ",
677 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200678 "s",
679 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200680 )
681 }
682
683 log_info(verbose, paste0(page_display, "\n"))
684
685 # Increment the appropriate counter based on mode
686 if (randomizePageOrder) {
687 page_index <- page_index + 1
688 } else {
689 current_page_number <- current_page_number + 1
690 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200691 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200692 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200693 break
694 }
695 }
Marc Kupietz68170952021-06-30 09:37:21 +0200696 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200697 KorAPQuery(
698 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200699 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200700 fields = kqo@fields,
701 requestUrl = kqo@requestUrl,
702 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200703 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200704 vc = kqo@vc,
705 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200706 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200707 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200708 collectedMatches = collectedMatches
709 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200710})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200711
712#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200713#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200714#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100715#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200716#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200717#' @param kqo object obtained from [corpusQuery()]
718#' @param verbose print progress information if true
719#' @param ... further arguments passed to [fetchNext()]
720#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200721#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200722#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200723#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200724#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
725#' q <- KorAPConnection() |>
726#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200727#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200728#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200729#'
730#' # Fetch also all KWICs
731#' q <- KorAPConnection() |> auth() |>
732#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
733#' fetchAll()
734#' q@collectedMatches
735#'
736#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
737#' q <- KorAPConnection() |>
738#' corpusQuery("<base/s=t>", # this matches each text once
739#' vc = "pubDate in 1958-03-12",
740#' fields = c("textSigle", "title"),
741#' ) |>
742#' fetchAll()
743#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100744#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200745#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200746#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200747#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200748setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
749 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200750})
751
752#' Fetches the remaining results of a KorAP query.
753#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200754#' @param kqo object obtained from [corpusQuery()]
755#' @param verbose print progress information if true
756#' @param ... further arguments passed to [fetchNext()]
757#' @return The updated `kqo` object with remaining results in `@collectedMatches`
758#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200759#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200760#' \dontrun{
761#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200762#' q <- KorAPConnection() |>
763#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200764#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200765#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100766#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200767#'
768#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200769#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200770setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
771 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200772})
773
Marc Kupietzbdedd022025-10-09 14:14:15 +0200774# Helper to collapse multiple annotation values while preserving order
775collapse_features <- function(values) {
776 if (length(values) == 0) {
777 return(NA_character_)
778 }
779 unique_values <- values[!duplicated(values)]
780 paste(unique_values, collapse = "|")
781}
782
783# Extract token-level annotations from a DOM node
784collect_token_annotations <- function(parent_node) {
785 if (inherits(parent_node, "xml_missing")) {
786 return(list(
787 node = list(),
788 token = character(0),
789 lemma = character(0),
790 pos = character(0),
791 morph = character(0)
792 ))
793 }
794
795 leaf_nodes <- xml2::xml_find_all(parent_node, ".//span[not(.//span)]")
796
797 if (length(leaf_nodes) == 0) {
798 return(list(
799 node = list(),
800 token = character(0),
801 lemma = character(0),
802 pos = character(0),
803 morph = character(0)
804 ))
805 }
806
807 tokens <- character(0)
808 lemmas <- character(0)
809 pos_tags <- character(0)
810 morph_tags <- character(0)
811 kept_nodes <- list()
812
813 for (idx in seq_along(leaf_nodes)) {
814 leaf <- leaf_nodes[[idx]]
815 token_text <- trimws(xml2::xml_text(leaf))
816 if (identical(token_text, "")) {
817 next
818 }
819
820 kept_nodes[[length(kept_nodes) + 1]] <- leaf
821 tokens <- c(tokens, token_text)
822
823 ancestors <- xml2::xml_find_all(leaf, "ancestor-or-self::span")
824 titles <- xml2::xml_attr(ancestors, "title")
825 titles <- titles[!is.na(titles)]
826
827 feature_pieces <- if (length(titles) > 0) unlist(strsplit(titles, "[[:space:]]+")) else character(0)
828
829 lemma_values <- sub('.*?/l:(.*)$', '\\1', feature_pieces[grepl('/l:', feature_pieces)], perl = TRUE)
830 pos_values <- sub('.*?/p:(.*)$', '\\1', feature_pieces[grepl('/p:', feature_pieces)], perl = TRUE)
831 morph_values <- sub('.*?/m:(.*)$', '\\1', feature_pieces[grepl('/m:', feature_pieces)], perl = TRUE)
832
833 lemmas <- c(lemmas, collapse_features(lemma_values))
834 pos_tags <- c(pos_tags, collapse_features(pos_values))
835 morph_tags <- c(morph_tags, collapse_features(morph_values))
836 }
837
838 list(
839 node = kept_nodes,
840 token = tokens,
841 lemma = lemmas,
842 pos = pos_tags,
843 morph = morph_tags
844 )
845}
846
Marc Kupietza29f3d42025-07-18 10:14:43 +0200847#'
848#' Parse XML annotations into linguistic layers
849#'
850#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
851#' from XML annotation snippets returned by the KorAP API.
852#'
853#' @param xml_snippet XML string containing annotation data
854#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
855#' @keywords internal
856parse_xml_annotations <- function(xml_snippet) {
857 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
858 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
859 }
860
Marc Kupietzbdedd022025-10-09 14:14:15 +0200861 doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
862 if (is.null(doc)) {
863 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
Marc Kupietzcd452182025-10-09 13:28:41 +0200864 }
865
Marc Kupietzbdedd022025-10-09 14:14:15 +0200866 match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
867 if (inherits(match_node, "xml_missing")) {
868 match_node <- xml2::xml_find_first(doc, ".//span")
869 if (inherits(match_node, "xml_missing")) {
870 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200871 }
872 }
873
Marc Kupietzbdedd022025-10-09 14:14:15 +0200874 token_info <- collect_token_annotations(match_node)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200875
Marc Kupietzbdedd022025-10-09 14:14:15 +0200876 list(
877 token = token_info$token,
878 lemma = token_info$lemma,
879 pos = token_info$pos,
880 morph = token_info$morph
881 )
Marc Kupietza29f3d42025-07-18 10:14:43 +0200882}
883
884#'
885#' Parse XML annotations into linguistic layers with left/match/right structure
886#'
887#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
888#' from XML annotation snippets returned by the KorAP API, split into left context,
889#' match, and right context sections like the tokens field.
890#'
891#' @param xml_snippet XML string containing annotation data
892#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
893#' @keywords internal
894parse_xml_annotations_structured <- function(xml_snippet) {
895 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
896 empty_result <- list(left = character(0), match = character(0), right = character(0))
897 return(list(
898 atokens = empty_result,
899 lemma = empty_result,
900 pos = empty_result,
901 morph = empty_result
902 ))
903 }
904
Marc Kupietzbdedd022025-10-09 14:14:15 +0200905 doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
906 if (is.null(doc)) {
907 empty_result <- list(left = character(0), match = character(0), right = character(0))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200908 return(list(
Marc Kupietzbdedd022025-10-09 14:14:15 +0200909 atokens = empty_result,
910 lemma = empty_result,
911 pos = empty_result,
912 morph = empty_result
Marc Kupietza29f3d42025-07-18 10:14:43 +0200913 ))
914 }
915
Marc Kupietzbdedd022025-10-09 14:14:15 +0200916 match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
917 if (inherits(match_node, "xml_missing")) {
918 empty_result <- list(left = character(0), match = character(0), right = character(0))
919 return(list(
920 atokens = empty_result,
921 lemma = empty_result,
922 pos = empty_result,
923 morph = empty_result
924 ))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200925 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200926
Marc Kupietzbdedd022025-10-09 14:14:15 +0200927 token_info <- collect_token_annotations(match_node)
928 tokens <- token_info$token
929 lemmas <- token_info$lemma
930 pos_tags <- token_info$pos
931 morph_tags <- token_info$morph
932 nodes <- token_info$node
Marc Kupietzc643a122025-07-18 18:18:36 +0200933
Marc Kupietzbdedd022025-10-09 14:14:15 +0200934 if (length(tokens) == 0) {
935 empty_result <- list(left = character(0), match = character(0), right = character(0))
936 return(list(
937 atokens = empty_result,
938 lemma = empty_result,
939 pos = empty_result,
940 morph = empty_result
941 ))
942 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200943
Marc Kupietzbdedd022025-10-09 14:14:15 +0200944 mark_flags <- vapply(nodes, function(n) {
945 !inherits(xml2::xml_find_first(n, "ancestor::mark"), "xml_missing")
946 }, logical(1))
Marc Kupietzc643a122025-07-18 18:18:36 +0200947
Marc Kupietzbdedd022025-10-09 14:14:15 +0200948 if (any(mark_flags)) {
949 first_idx <- which(mark_flags)[1]
950 last_idx <- tail(which(mark_flags), 1)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200951 } else {
Marc Kupietzbdedd022025-10-09 14:14:15 +0200952 first_idx <- 1
953 last_idx <- length(tokens)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200954 }
955
Marc Kupietzbdedd022025-10-09 14:14:15 +0200956 sections <- rep("match", length(tokens))
957 if (first_idx > 1) {
958 sections[seq_len(first_idx - 1)] <- "left"
959 }
960 if (last_idx < length(tokens)) {
961 sections[seq(from = last_idx + 1, to = length(tokens))] <- "right"
962 }
Marc Kupietza29f3d42025-07-18 10:14:43 +0200963
Marc Kupietzbdedd022025-10-09 14:14:15 +0200964 subset_by_section <- function(values, section) {
965 idx <- sections == section
966 if (!any(idx)) {
967 return(character(0))
968 }
969 values[idx]
970 }
971
972 atokens <- list(
973 left = subset_by_section(tokens, "left"),
974 match = subset_by_section(tokens, "match"),
975 right = subset_by_section(tokens, "right")
976 )
977
978 lemma <- list(
979 left = subset_by_section(lemmas, "left"),
980 match = subset_by_section(lemmas, "match"),
981 right = subset_by_section(lemmas, "right")
982 )
983
984 pos <- list(
985 left = subset_by_section(pos_tags, "left"),
986 match = subset_by_section(pos_tags, "match"),
987 right = subset_by_section(pos_tags, "right")
988 )
989
990 morph <- list(
991 left = subset_by_section(morph_tags, "left"),
992 match = subset_by_section(morph_tags, "match"),
993 right = subset_by_section(morph_tags, "right")
994 )
995
996 list(
997 atokens = atokens,
998 lemma = lemma,
999 pos = pos,
1000 morph = morph
1001 )
Marc Kupietza29f3d42025-07-18 10:14:43 +02001002}
1003
Marc Kupietze52b2952025-07-17 16:53:02 +02001004#' Fetch annotations for all collected matches
1005#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001006#' `r lifecycle::badge("experimental")`
1007#'
1008#' **`fetchAnnotations`** fetches annotations (only token annotations, for now)
1009#' for all matches in the `@collectedMatches` slot
Marc Kupietzc643a122025-07-18 18:18:36 +02001010#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
Marc Kupietz89f796e2025-07-19 09:05:06 +02001011#' data frame. The method uses the `matchID` from collected matches.
Marc Kupietza29f3d42025-07-18 10:14:43 +02001012#'
1013#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
1014#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
1015#' available for annotation parsing.
1016#'
1017#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
1018#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
1019#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
1020#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
1021#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
1022#' - `annotation_snippet`: original XML snippet from the annotation API
Marc Kupietze52b2952025-07-17 16:53:02 +02001023#'
1024#' @family corpus search functions
Marc Kupietz89f796e2025-07-19 09:05:06 +02001025#' @concept Annotations
Marc Kupietze52b2952025-07-17 16:53:02 +02001026#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001027#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
Marc Kupietze52b2952025-07-17 16:53:02 +02001028#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
Marc Kupietz93787d52025-09-03 13:33:25 +02001029#' @param overwrite logical; if TRUE, re-fetch and replace any existing
1030#' annotation columns. If FALSE (default), only add missing annotation layers
1031#' and preserve already fetched ones (e.g., keep POS/lemma from a previous
1032#' foundry while adding morph from another).
Marc Kupietze52b2952025-07-17 16:53:02 +02001033#' @param verbose print progress information if true
Marc Kupietz0af75932025-09-09 18:14:16 +02001034#' @return The updated `kqo` object with annotation columns
Marc Kupietz336c85d2025-07-24 13:52:03 +02001035#' @return The updated `kqo` object with annotation columns
Marc Kupietz89f796e2025-07-19 09:05:06 +02001036#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
1037#' in the `@collectedMatches` slot. Each column is a data frame
1038#' with `left`, `match`, and `right` columns containing list vectors of annotations
1039#' for the left context, matched tokens, and right context, respectively.
1040#' The original XML snippet for each match is also stored in `annotation_snippet`.
Marc Kupietze52b2952025-07-17 16:53:02 +02001041#'
1042#' @examples
1043#' \dontrun{
1044#'
1045#' # Fetch annotations for matches using Tree-Tagger foundry
Marc Kupietza29f3d42025-07-18 10:14:43 +02001046#' # Note: Authorization required for copyright-restricted corpora
Marc Kupietze52b2952025-07-17 16:53:02 +02001047#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001048#' auth() |>
1049#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001050#' fetchNext(maxFetch = 10) |>
1051#' fetchAnnotations()
Marc Kupietze52b2952025-07-17 16:53:02 +02001052#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001053#' # Access linguistic annotations for match i:
Marc Kupietz6aa5a0d2025-09-08 17:51:47 +02001054#' pos_tags <- q@collectedMatches$pos
1055#' # Data frame with left/match/right columns for POS tags
1056#' lemmas <- q@collectedMatches$lemma
1057#' # Data frame with left/match/right columns for lemmas
1058#' morphology <- q@collectedMatches$morph
1059#' # Data frame with left/match/right columns for morphological tags
1060#' atokens <- q@collectedMatches$atokens
1061#' # Data frame with left/match/right columns for annotation token text
Marc Kupietz0af75932025-09-09 18:14:16 +02001062#' # Original XML snippet for match i
1063#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]]
Marc Kupietzc643a122025-07-18 18:18:36 +02001064#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001065#' # Access specific components:
Marc Kupietz0af75932025-09-09 18:14:16 +02001066#' # POS tags for the matched tokens in match i
1067#' match_pos <- q@collectedMatches$pos$match[[i]]
1068#' # Lemmas for the left context in match i
1069#' left_lemmas <- q@collectedMatches$lemma$left[[i]]
1070#' # Token text for the right context in match i
1071#' right_tokens <- q@collectedMatches$atokens$right[[i]]
Marc Kupietza29f3d42025-07-18 10:14:43 +02001072#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001073#' # Use a different foundry (e.g., MarMoT)
Marc Kupietze52b2952025-07-17 16:53:02 +02001074#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001075#' auth() |>
1076#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001077#' fetchNext(maxFetch = 10) |>
Marc Kupietz89f796e2025-07-19 09:05:06 +02001078#' fetchAnnotations(foundry = "marmot")
1079#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
Marc Kupietze52b2952025-07-17 16:53:02 +02001080#' }
Marc Kupietze52b2952025-07-17 16:53:02 +02001081#' @export
Marc Kupietz0af75932025-09-09 18:14:16 +02001082setMethod("fetchAnnotations", "KorAPQuery", function(kqo,
1083 foundry = "tt",
1084 overwrite = FALSE,
1085 verbose = kqo@korapConnection@verbose) {
1086 if (is.null(kqo@collectedMatches) ||
1087 nrow(kqo@collectedMatches) == 0) {
1088 warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
1089 return(kqo)
1090 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001091
Marc Kupietze52b2952025-07-17 16:53:02 +02001092 df <- kqo@collectedMatches
1093 kco <- kqo@korapConnection
Marc Kupietza29f3d42025-07-18 10:14:43 +02001094
Marc Kupietza29f3d42025-07-18 10:14:43 +02001095 # Initialize annotation columns as data frames (like tokens field)
1096 # Create the structure more explicitly to avoid assignment issues
1097 nrows <- nrow(df)
Marc Kupietzc643a122025-07-18 18:18:36 +02001098
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001099 # Pre-compute the empty character vector list to avoid repeated computation
1100 empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
Marc Kupietz0af75932025-09-09 18:14:16 +02001101
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001102 # Helper function to create annotation data frame structure
1103 create_annotation_df <- function(empty_list) {
1104 data.frame(
1105 left = empty_list,
1106 match = empty_list,
1107 right = empty_list,
1108 stringsAsFactors = FALSE
1109 )
1110 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001111
Marc Kupietz93787d52025-09-03 13:33:25 +02001112 # Track which annotation columns already existed to decide overwrite behavior
1113 existing_types <- list(
1114 pos = "pos" %in% colnames(df),
1115 lemma = "lemma" %in% colnames(df),
1116 morph = "morph" %in% colnames(df),
1117 atokens = "atokens" %in% colnames(df),
1118 annotation_snippet = "annotation_snippet" %in% colnames(df)
1119 )
1120
1121 # Initialize annotation columns using the helper function
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001122 annotation_types <- c("pos", "lemma", "morph", "atokens")
1123 for (type in annotation_types) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001124 if (overwrite || !existing_types[[type]]) {
1125 df[[type]] <- create_annotation_df(empty_char_list)
1126 }
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001127 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001128
Marc Kupietz93787d52025-09-03 13:33:25 +02001129 if (overwrite || !existing_types$annotation_snippet) {
feldmuellera02f1932025-09-15 16:38:06 +02001130 df$annotation_snippet <- rep(NA_character_, nrows) # Fixed line
Marc Kupietz93787d52025-09-03 13:33:25 +02001131 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001132
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001133 # Initialize timing for ETA calculation
1134 start_time <- Sys.time()
1135 if (verbose) {
1136 log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
1137 }
1138
Marc Kupietz93787d52025-09-03 13:33:25 +02001139 # Helper to decide if existing annotation row is effectively empty
1140 is_empty_annotation_row <- function(ann_df, row_index) {
1141 if (is.null(ann_df) || nrow(ann_df) < row_index) return(TRUE)
1142 left_val <- ann_df$left[[row_index]]
1143 match_val <- ann_df$match[[row_index]]
1144 right_val <- ann_df$right[[row_index]]
1145 all(
1146 (is.null(left_val) || (length(left_val) == 0) || all(is.na(left_val))),
1147 (is.null(match_val) || (length(match_val) == 0) || all(is.na(match_val))),
1148 (is.null(right_val) || (length(right_val) == 0) || all(is.na(right_val)))
1149 )
1150 }
1151
Marc Kupietze52b2952025-07-17 16:53:02 +02001152 for (i in seq_len(nrow(df))) {
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001153 # ETA logging
1154 if (verbose && i > 1) {
1155 eta_info <- calculate_eta(i, nrows, start_time)
1156 log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
1157 }
Marc Kupietzff712a92025-07-18 09:07:23 +02001158 # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
1159 if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001160 # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
1161 # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
1162 # Extract document path and position, handling both regular and encrypted formats
Marc Kupietzc643a122025-07-18 18:18:36 +02001163
Marc Kupietza29f3d42025-07-18 10:14:43 +02001164 # More flexible regex to extract the document path with position and encryption
1165 # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
1166 # We need to capture the entire path including the encrypted suffix
1167 match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001168
Marc Kupietza29f3d42025-07-18 10:14:43 +02001169 if (match_result > 0) {
1170 # Extract the complete path including encryption (everything after "match-")
1171 doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
1172 # Convert the dash before position to slash, but keep everything after the position
1173 match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
Marc Kupietz25121302025-07-19 08:45:43 +02001174 # Use httr2 to construct URL safely
1175 base_url <- paste0(kco@apiUrl, "corpus/", match_path)
1176 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietza29f3d42025-07-18 10:14:43 +02001177 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001178 # If regex fails, fall back to the old method with httr2
1179 # Format numbers to avoid scientific notation
1180 match_start <- format(df$matchStart[i], scientific = FALSE)
1181 match_end <- format(df$matchEnd[i], scientific = FALSE)
1182 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1183 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001184 }
1185 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001186 # Fallback to the old method with httr2
1187 # Format numbers to avoid scientific notation
1188 match_start <- format(df$matchStart[i], scientific = FALSE)
1189 match_end <- format(df$matchEnd[i], scientific = FALSE)
1190 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1191 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001192 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001193
Marc Kupietze52b2952025-07-17 16:53:02 +02001194 tryCatch({
1195 res <- apiCall(kco, req)
Marc Kupietzc643a122025-07-18 18:18:36 +02001196
Marc Kupietze52b2952025-07-17 16:53:02 +02001197 if (!is.null(res)) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001198 # Store the raw annotation snippet (respect overwrite flag)
1199 if (overwrite || !existing_types$annotation_snippet || is.null(df$annotation_snippet[[i]]) || is.na(df$annotation_snippet[[i]])) {
1200 df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
1201 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001202
1203 # Parse XML annotations if snippet is available
1204 if (is.list(res) && "snippet" %in% names(res)) {
1205 parsed_annotations <- parse_xml_annotations_structured(res$snippet)
1206
1207 # Store the parsed linguistic data in data frame format (like tokens)
1208 # Use individual assignment to avoid data frame mismatch errors
1209 tryCatch({
1210 # Assign POS annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001211 if (overwrite || !existing_types$pos || is_empty_annotation_row(df$pos, i)) {
1212 df$pos$left[i] <- list(parsed_annotations$pos$left)
1213 df$pos$match[i] <- list(parsed_annotations$pos$match)
1214 df$pos$right[i] <- list(parsed_annotations$pos$right)
1215 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001216
Marc Kupietza29f3d42025-07-18 10:14:43 +02001217 # Assign lemma annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001218 if (overwrite || !existing_types$lemma || is_empty_annotation_row(df$lemma, i)) {
1219 df$lemma$left[i] <- list(parsed_annotations$lemma$left)
1220 df$lemma$match[i] <- list(parsed_annotations$lemma$match)
1221 df$lemma$right[i] <- list(parsed_annotations$lemma$right)
1222 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001223
Marc Kupietza29f3d42025-07-18 10:14:43 +02001224 # Assign morphology annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001225 if (overwrite || !existing_types$morph || is_empty_annotation_row(df$morph, i)) {
1226 df$morph$left[i] <- list(parsed_annotations$morph$left)
1227 df$morph$match[i] <- list(parsed_annotations$morph$match)
1228 df$morph$right[i] <- list(parsed_annotations$morph$right)
1229 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001230
Marc Kupietza29f3d42025-07-18 10:14:43 +02001231 # Assign token annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001232 if (overwrite || !existing_types$atokens || is_empty_annotation_row(df$atokens, i)) {
1233 df$atokens$left[i] <- list(parsed_annotations$atokens$left)
1234 df$atokens$match[i] <- list(parsed_annotations$atokens$match)
1235 df$atokens$right[i] <- list(parsed_annotations$atokens$right)
1236 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001237 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001238 # Set empty character vectors on assignment error using list assignment
Marc Kupietz93787d52025-09-03 13:33:25 +02001239 if (overwrite || !existing_types$pos) {
1240 df$pos$left[i] <<- list(character(0))
1241 df$pos$match[i] <<- list(character(0))
1242 df$pos$right[i] <<- list(character(0))
1243 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001244
Marc Kupietz93787d52025-09-03 13:33:25 +02001245 if (overwrite || !existing_types$lemma) {
1246 df$lemma$left[i] <<- list(character(0))
1247 df$lemma$match[i] <<- list(character(0))
1248 df$lemma$right[i] <<- list(character(0))
1249 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001250
Marc Kupietz93787d52025-09-03 13:33:25 +02001251 if (overwrite || !existing_types$morph) {
1252 df$morph$left[i] <<- list(character(0))
1253 df$morph$match[i] <<- list(character(0))
1254 df$morph$right[i] <<- list(character(0))
1255 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001256
Marc Kupietz93787d52025-09-03 13:33:25 +02001257 if (overwrite || !existing_types$atokens) {
1258 df$atokens$left[i] <<- list(character(0))
1259 df$atokens$match[i] <<- list(character(0))
1260 df$atokens$right[i] <<- list(character(0))
1261 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001262 })
Marc Kupietza29f3d42025-07-18 10:14:43 +02001263 } else {
1264 # No snippet available, store empty vectors
Marc Kupietz93787d52025-09-03 13:33:25 +02001265 if (overwrite || !existing_types$pos) {
1266 df$pos$left[i] <- list(character(0))
1267 df$pos$match[i] <- list(character(0))
1268 df$pos$right[i] <- list(character(0))
1269 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001270
Marc Kupietz93787d52025-09-03 13:33:25 +02001271 if (overwrite || !existing_types$lemma) {
1272 df$lemma$left[i] <- list(character(0))
1273 df$lemma$match[i] <- list(character(0))
1274 df$lemma$right[i] <- list(character(0))
1275 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001276
Marc Kupietz93787d52025-09-03 13:33:25 +02001277 if (overwrite || !existing_types$morph) {
1278 df$morph$left[i] <- list(character(0))
1279 df$morph$match[i] <- list(character(0))
1280 df$morph$right[i] <- list(character(0))
1281 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001282
Marc Kupietz93787d52025-09-03 13:33:25 +02001283 if (overwrite || !existing_types$atokens) {
1284 df$atokens$left[i] <- list(character(0))
1285 df$atokens$match[i] <- list(character(0))
1286 df$atokens$right[i] <- list(character(0))
1287 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001288 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001289 } else {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001290 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001291 if (overwrite || !existing_types$pos) {
1292 df$pos$left[i] <- list(NA)
1293 df$pos$match[i] <- list(NA)
1294 df$pos$right[i] <- list(NA)
1295 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001296
Marc Kupietz93787d52025-09-03 13:33:25 +02001297 if (overwrite || !existing_types$lemma) {
1298 df$lemma$left[i] <- list(NA)
1299 df$lemma$match[i] <- list(NA)
1300 df$lemma$right[i] <- list(NA)
1301 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001302
Marc Kupietz93787d52025-09-03 13:33:25 +02001303 if (overwrite || !existing_types$morph) {
1304 df$morph$left[i] <- list(NA)
1305 df$morph$match[i] <- list(NA)
1306 df$morph$right[i] <- list(NA)
1307 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001308
Marc Kupietz93787d52025-09-03 13:33:25 +02001309 if (overwrite || !existing_types$atokens) {
1310 df$atokens$left[i] <- list(NA)
1311 df$atokens$match[i] <- list(NA)
1312 df$atokens$right[i] <- list(NA)
1313 }
1314 if (overwrite || !existing_types$annotation_snippet) {
1315 df$annotation_snippet[[i]] <- NA
1316 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001317 }
1318 }, error = function(e) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001319 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001320 if (overwrite || !existing_types$pos) {
1321 df$pos$left[i] <- list(NA)
1322 df$pos$match[i] <- list(NA)
1323 df$pos$right[i] <- list(NA)
1324 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001325
Marc Kupietz93787d52025-09-03 13:33:25 +02001326 if (overwrite || !existing_types$lemma) {
1327 df$lemma$left[i] <- list(NA)
1328 df$lemma$match[i] <- list(NA)
1329 df$lemma$right[i] <- list(NA)
1330 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001331
Marc Kupietz93787d52025-09-03 13:33:25 +02001332 if (overwrite || !existing_types$morph) {
1333 df$morph$left[i] <- list(NA)
1334 df$morph$match[i] <- list(NA)
1335 df$morph$right[i] <- list(NA)
1336 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001337
Marc Kupietz93787d52025-09-03 13:33:25 +02001338 if (overwrite || !existing_types$atokens) {
1339 df$atokens$left[i] <- list(NA)
1340 df$atokens$match[i] <- list(NA)
1341 df$atokens$right[i] <- list(NA)
1342 }
1343 if (overwrite || !existing_types$annotation_snippet) {
1344 df$annotation_snippet[[i]] <- NA
1345 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001346 })
1347 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001348
Marc Kupietza29f3d42025-07-18 10:14:43 +02001349 # Validate data frame structure before assignment
1350 if (nrow(df) != nrow(kqo@collectedMatches)) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001351 }
1352
1353 # Update the collectedMatches with annotation data
1354 tryCatch({
1355 kqo@collectedMatches <- df
1356 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001357 # Try a safer approach: add columns individually
1358 tryCatch({
1359 kqo@collectedMatches$pos <- df$pos
Marc Kupietzc643a122025-07-18 18:18:36 +02001360 kqo@collectedMatches$lemma <- df$lemma
Marc Kupietza29f3d42025-07-18 10:14:43 +02001361 kqo@collectedMatches$morph <- df$morph
1362 kqo@collectedMatches$atokens <- df$atokens
1363 kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
1364 }, error = function(col_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001365 warning("Failed to add annotation data to collectedMatches")
1366 })
1367 })
1368
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001369 if (verbose) {
1370 elapsed_time <- Sys.time() - start_time
1371 log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
1372 }
1373
Marc Kupietze52b2952025-07-17 16:53:02 +02001374 return(kqo)
1375})
1376
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001377#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +02001378#'
Marc Kupietz67edcb52021-09-20 21:54:24 +02001379#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001380#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +02001381#' confidence intervals of one ore multiple search terms across one or multiple
1382#' virtual corpora.
1383#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02001384#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +02001385#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +02001386#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +02001387#' \dontrun{
1388#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001389#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +02001390#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +01001391#' }
Marc Kupietz3f575282019-10-04 14:46:04 +02001392#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001393# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +01001394#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001395#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
1396#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +02001397#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
1398#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001399#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +02001400#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001401#'
1402#' @return A tibble, with each row containing the following result columns for query and vc combinations:
1403#' - **query**: the query string used for the frequency analysis.
1404#' - **totalResults**: absolute frequency of query matches in the vc.
1405#' - **vc**: virtual corpus used for the query.
1406#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
1407#' - **total**: total number of words in vc.
1408#' - **f**: relative frequency of query matches in the vc.
1409#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
1410#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
1411
Marc Kupietzd8851222025-05-01 10:57:19 +02001412setMethod(
1413 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +01001414 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +02001415 (if (as.alternatives) {
1416 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietzea34b812025-06-25 15:49:00 +02001417 group_by(vc) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +01001418 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +02001419 } else {
1420 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
1421 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietzea34b812025-06-25 15:49:00 +02001422 }) |>
Marc Kupietz0c29cea2019-10-09 08:44:36 +02001423 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +02001424 }
1425)
Marc Kupietz3f575282019-10-04 14:46:04 +02001426
Marc Kupietz38a9d682024-12-06 16:17:09 +01001427#' buildWebUIRequestUrlFromString
1428#'
1429#' @rdname KorAPQuery-class
1430#' @importFrom urltools url_encode
1431#' @export
1432buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +02001433 query,
1434 vc = "",
1435 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001436 if ("KorAPConnection" %in% class(KorAPUrl)) {
1437 KorAPUrl <- KorAPUrl@KorAPUrl
1438 }
1439
1440 request <-
1441 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +02001442 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001443 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +02001444 ifelse(vc != "",
1445 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
1446 ""
1447 ),
1448 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001449 ql
1450 )
1451 paste0(KorAPUrl, request)
1452}
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001453
1454#' buildWebUIRequestUrl
1455#'
1456#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +01001457#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001458#' @export
1459buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +02001460 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001461 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +02001462 } else {
1463 httr2::url_parse(KorAPUrl)$query$q
1464 },
Marc Kupietzf9129592025-01-26 19:17:54 +01001465 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001466 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +01001467 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001468 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001469}
1470
Marc Kupietzd8851222025-05-01 10:57:19 +02001471#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +02001472#' @rdname KorAPQuery-class
1473#' @param x KorAPQuery object
1474#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001475#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +02001476#' @export
1477format.KorAPQuery <- function(x, ...) {
1478 cat("<KorAPQuery>\n")
1479 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +02001480 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001481 cat(" Query: ", param$q, "\n")
1482 if (!is.null(param$cq) && param$cq != "") {
1483 cat(" Virtual corpus: ", param$cq, "\n")
1484 }
1485 if (!is.null(q@collectedMatches)) {
1486 cat("==============================================================================================================", "\n")
1487 print(summary(q@collectedMatches))
1488 cat("==============================================================================================================", "\n")
1489 }
1490 cat(" Total results: ", q@totalResults, "\n")
1491 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietza29f3d42025-07-18 10:14:43 +02001492 if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
1493 successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
1494 parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
1495 cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
1496 if (parsed_annotations > 0) {
1497 cat(" (", parsed_annotations, " with parsed linguistic data)")
1498 }
1499 cat("\n")
Marc Kupietze52b2952025-07-17 16:53:02 +02001500 }
Marc Kupietz62da2b52019-09-12 17:43:34 +02001501}
1502
Marc Kupietze95108e2019-09-18 13:23:58 +02001503#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +02001504#'
Marc Kupietze95108e2019-09-18 13:23:58 +02001505#' @rdname KorAPQuery-class
1506#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +02001507#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +02001508setMethod("show", "KorAPQuery", function(object) {
1509 format(object)
Marc Kupietzc643a122025-07-18 18:18:36 +02001510 invisible(object)
Marc Kupietze95108e2019-09-18 13:23:58 +02001511})