blob: 5abd28105615cfb5d1ace22efcd4b5aa81721afc [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
50 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
51 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
71setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020072
73maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020074
Marc Kupietz4de53ec2019-10-04 09:12:00 +020075## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010076utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020077
Marc Kupietza8c40f42025-06-24 15:49:52 +020078#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020079#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020080#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020081#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020082#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' @aliases corpusQuery
84#'
85#' @importFrom urltools url_encode
86#' @importFrom purrr pmap
87#' @importFrom dplyr bind_rows
88#'
Marc Kupietz617266d2025-02-27 10:43:07 +010089#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020090#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020091#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020092#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020093#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
94#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
95#' your RKorAPClient application as explained in the
96#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
97#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
98#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +020099#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200100#' @param fields character vector specifying which metadata fields to retrieve for each match.
101#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
102#' \describe{
103#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
104#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
105#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
106#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
107#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
108#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
109#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
110#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
111#' }
112#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
113#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100114#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200115#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200116#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200117#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200118#' @param context string that specifies the size of the left and the right context returned in `snippet`
119#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
120#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
121#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
122#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
123#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200124#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200125#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
126#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200127#'
128#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200129#' \dontrun{
130#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200131#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200132#' KorAPConnection() |>
133#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200134#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200135#'
136#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200137#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200138#' corpusQuery("Ameisenplage",
139#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
140#' results <- fetchAll(query)
141#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100142#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200143#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200144#' \dontrun{
145#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200146#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
147#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200148#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200149#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200150#' corpusQuery(
151#' KorAPUrl =
152#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
153#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200154#' }
155#'
156#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200157#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200158#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200159#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200160#' {
161#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200162#' } |>
163#' corpusQuery("Ameisenplage") |>
164#' fetchAll() |>
165#' slot("collectedMatches") |>
166#' mutate(year = lubridate::year(pubDate)) |>
167#' dplyr::select(year) |>
168#' group_by(year) |>
169#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200170#' mutate(Freq = mapply(function(f, y) {
171#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200172#' }, Count, year)) |>
173#' dplyr::select(-Count) |>
174#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200175#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100176#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200177#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200178#'
179#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200180#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200181#'
182#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200183setMethod(
184 "corpusQuery", "KorAPConnection",
185 function(kco,
186 query = if (missing(KorAPUrl)) {
187 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
188 } else {
189 httr2::url_parse(KorAPUrl)$query$q
190 },
191 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
192 KorAPUrl,
193 metadataOnly = TRUE,
194 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
195 fields = c(
196 "corpusSigle",
197 "textSigle",
198 "pubDate",
199 "pubPlace",
200 "availability",
201 "textClass",
202 "snippet",
203 "tokens"
204 ),
205 accessRewriteFatal = TRUE,
206 verbose = kco@verbose,
207 expand = length(vc) != length(query),
208 as.df = FALSE,
209 context = NULL) {
210 if (length(query) > 1 || length(vc) > 1) {
211 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200212
213 # Initialize timing variables for ETA calculation
214 total_queries <- nrow(grid)
215 current_query <- 0
216 start_time <- Sys.time()
217
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200218 results <- purrr::pmap(grid, function(query, vc, ...) {
219 current_query <<- current_query + 1
220
221 # Execute the single query directly (avoiding recursive call)
222 contentFields <- c("snippet", "tokens")
223 query_fields <- fields
224 if (metadataOnly) {
225 query_fields <- query_fields[!query_fields %in% contentFields]
226 }
227 if (!"textSigle" %in% query_fields) {
228 query_fields <- c(query_fields, "textSigle")
229 }
230 request <-
231 paste0(
232 "?q=",
233 url_encode(enc2utf8(query)),
234 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
235 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
236 ifelse(!metadataOnly, "&show-tokens=true", ""),
237 "&ql=", ql
238 )
239 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
240 requestUrl <- paste0(
241 kco@apiUrl,
242 "search",
243 request,
244 "&fields=",
245 paste(query_fields, collapse = ","),
246 if (metadataOnly) "&access-rewrite-disabled=true" else ""
247 )
248
249 # Show individual query progress
250 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
251 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
252 if (is.null(res)) {
253 log_info(verbose, ": API call failed\n")
254 totalResults <- 0
255 } else {
256 totalResults <- as.integer(res$meta$totalResults)
257 log_info(verbose, ": ", totalResults, " hits")
258 if (!is.null(res$meta$cached)) {
259 log_info(verbose, " [cached]")
260 } else if (!is.null(res$meta$benchmark)) {
261 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
262 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
263 formatted_time <- paste0(round(time_value, 2), "s")
264 log_info(verbose, ", took ", formatted_time)
265 } else {
266 log_info(verbose, ", took ", res$meta$benchmark)
267 }
268 }
269 log_info(verbose, "\n")
270 }
271
272 result <- data.frame(
273 query = query,
274 totalResults = totalResults,
275 vc = vc,
276 webUIRequestUrl = webUIRequestUrl,
277 stringsAsFactors = FALSE
278 )
279
280 # Calculate and display ETA information if verbose and we have more than one query
281 if (verbose && total_queries > 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200282 eta_info <- calculate_eta(current_query, total_queries, start_time)
283 if (eta_info != "") {
284 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200285 avg_time_per_query <- elapsed_time / current_query
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200286
287 # Create progress display
288 progress_display <- paste0(
289 "Query ",
290 sprintf(paste0("%", nchar(total_queries), "d"), current_query),
291 "/",
292 sprintf("%d", total_queries),
293 " completed. Avg: ",
294 sprintf("%.1f", avg_time_per_query),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200295 "s/query",
296 eta_info
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200297 )
298
299 log_info(verbose, progress_display, "\n")
300 }
301 }
302
303 return(result)
304 })
305
306 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200307 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200308 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100309 if (metadataOnly) {
310 fields <- fields[!fields %in% contentFields]
311 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100312 if (!"textSigle" %in% fields) {
313 fields <- c(fields, "textSigle")
314 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100315 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200316 paste0(
317 "?q=",
318 url_encode(enc2utf8(query)),
319 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
320 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
321 ifelse(!metadataOnly, "&show-tokens=true", ""),
322 "&ql=", ql
323 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100324 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
325 requestUrl <- paste0(
326 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200327 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100328 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200329 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100330 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200331 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100332 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200333 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
334 sep =
335 ""
336 )
337 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100338 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100339 message("API call failed.")
340 totalResults <- 0
341 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200342 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200343 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200344 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200345 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200346 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz7638ca42025-05-25 13:18:16 +0200347 # Round the benchmark time to 2 decimal places for better readability
348 # If it's a string ending with 's', extract the number, round it, and re-add 's'
349 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
350 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
351 formatted_time <- paste0(round(time_value, 2), "s")
352 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
353 } else {
354 # Fallback if the format is different than expected
355 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
356 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200357 } else {
358 log_info(verbose, "\n")
359 }
Marc Kupietza4675722022-02-23 23:55:15 +0100360 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200361 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100362 data.frame(
363 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100364 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100365 vc = vc,
366 webUIRequestUrl = webUIRequestUrl,
367 stringsAsFactors = FALSE
368 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200369 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100370 KorAPQuery(
371 korapConnection = kco,
372 nextStartIndex = 0,
373 fields = fields,
374 requestUrl = requestUrl,
375 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100376 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100377 vc = vc,
378 apiResponse = res,
379 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100380 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100381 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200382 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100383 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200384 }
385)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200386
Marc Kupietz05a60792024-12-07 16:23:31 +0100387#' @importFrom purrr map
388repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200389 if (is.list(x)) {
390 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100391 paste(.x, collapse = " ")
392 } else {
393 .x
394 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200395 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100396 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200397 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100398}
399
Marc Kupietz62da2b52019-09-12 17:43:34 +0200400#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200401#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200402#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200403#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200404#' @family corpus search functions
405#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200406#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200407#' @param offset start offset for query results to fetch
408#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200409#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200410#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
411#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200412#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100413#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200414#' \dontrun{
415#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200416#' q <- KorAPConnection() |>
417#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200418#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100419#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100420#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100421#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200422#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200423#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200424#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200425#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200426#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100427#' @importFrom tibble enframe add_column
428#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200429#' @importFrom tidyr unnest unchop pivot_wider
430#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200431#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200432setMethod("fetchNext", "KorAPQuery", function(kqo,
433 offset = kqo@nextStartIndex,
434 maxFetch = maxResultsPerPage,
435 verbose = kqo@korapConnection@verbose,
436 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100437 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200438 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100439
Marc Kupietze95108e2019-09-18 13:23:58 +0200440 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
441 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200442 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200443 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200444 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200445 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200446
Marc Kupietz24799fd2025-06-25 14:15:36 +0200447 # Track start time for ETA calculation
448 start_time <- Sys.time()
449
Marc Kupietz623d7122025-05-25 12:46:12 +0200450 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200451 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200452 # Calculate how many pages we need to fetch based on maxFetch
453 total_pages_to_fetch <- if (!is.na(maxFetch)) {
454 # Either limited by maxFetch or total results, whichever is smaller
455 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
456 } else {
457 # All pages
458 ceiling(kqo@totalResults / maxResultsPerPage)
459 }
460
461 # Generate randomized page indices (0-based for API)
462 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
463 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200464 }
465
Marc Kupietzd8851222025-05-01 10:57:19 +0200466 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200467 collectedMatches <- data.frame()
468 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200469
470 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
471 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
472 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
473 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
474
475 # For sequential fetches, keep track of which global page we're on
476 # This is important for correctly showing page numbers in subsequent fetchNext calls
477 page_count_start <- current_page_number
478
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200479 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200480 # Determine which page to fetch next
481 if (randomizePageOrder) {
482 # In randomized mode, get the page from our randomized list using the page_index
483 # Make sure we don't exceed the array bounds
484 if (page_index > length(pages)) {
485 break # No more pages to fetch in randomized mode
486 }
487 current_offset_page <- pages[page_index]
488 # For display purposes in randomized mode, show which page out of the total we're fetching
489 display_page_number <- page_index
490 } else {
491 # In sequential mode, use the current_page_number to calculate the offset
492 current_offset_page <- (current_page_number - 1)
493 display_page_number <- current_page_number
494 }
495
496 # Calculate the actual offset in tokens
497 currentOffset <- current_offset_page * maxResultsPerPage
498
Marc Kupietzef0e9392025-06-18 12:21:49 +0200499 # Build the query with the appropriate count and offset using httr2
500 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200501
Marc Kupietzef0e9392025-06-18 12:21:49 +0200502 # Parse existing URL to preserve all query parameters
503 parsed_url <- httr2::url_parse(kqo@requestUrl)
504 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200505
Marc Kupietzef0e9392025-06-18 12:21:49 +0200506 # Add/update count and offset parameters
507 existing_query$count <- count_param
508 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200509
Marc Kupietzef0e9392025-06-18 12:21:49 +0200510 # Rebuild the URL with all parameters
511 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200512 res <- apiCall(kqo@korapConnection, query)
513 if (length(res$matches) == 0) {
514 break
515 }
516
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200517 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100518 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100519 currentMatches <- res$matches$fields %>%
520 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
521 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200522 tidyr::unnest(cols = value) %>%
523 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200524 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200525 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200526 currentMatches$snippet <- res$matches$snippet
527 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100528 if ("tokens" %in% colnames(res$matches)) {
529 currentMatches$tokens <- res$matches$tokens
530 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200531 } else {
532 currentMatches <- res$matches
533 }
534
Marc Kupietze95108e2019-09-18 13:23:58 +0200535 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200536 if (!field %in% colnames(currentMatches)) {
537 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200538 }
539 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100540 currentMatches <- currentMatches %>%
541 select(kqo@fields) %>%
542 mutate(
Marc Kupietz0447da02025-01-08 20:51:09 +0100543 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100544 matchStart = as.integer(stringr::word(tmp_positions, 1)),
545 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
546 ) %>%
547 select(-tmp_positions)
548
Marc Kupietz62da2b52019-09-12 17:43:34 +0200549 if (!is.list(collectedMatches)) {
550 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200551 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200552 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200553 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200554
Marc Kupietz623d7122025-05-25 12:46:12 +0200555 # Get the actual items per page from the API response
556 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200557
Marc Kupietz623d7122025-05-25 12:46:12 +0200558 # Calculate total pages consistently using fixed maxResultsPerPage
559 # This ensures consistent page counting across the function
560 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
561
Marc Kupietz24799fd2025-06-25 14:15:36 +0200562 # Calculate ETA using the centralized function from logging.R
563 current_page <- if (randomizePageOrder) page_index else display_page_number
564 total_pages_to_fetch <- if (!is.na(maxFetch)) {
565 # Account for offset - we can only fetch from the remaining results after offset
566 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
567 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
568 } else {
569 total_pages
570 }
571
572 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
573
574 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200575 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200576 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200577 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200578 }
579
Marc Kupietz623d7122025-05-25 12:46:12 +0200580 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200581
Marc Kupietz623d7122025-05-25 12:46:12 +0200582 # For global page tracking, calculate the absolute page number
583 actual_display_number <- if (randomizePageOrder) {
584 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
585 } else {
586 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
587 current_offset_page + 1
588 }
589
590 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
591 # based on the current batch being fetched
592
593 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
594 # Simply count from 1 within the current batch
595
596 # The relative page number is simply the current position in this batch
597 if (randomizePageOrder) {
598 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
599 } else {
600 relative_page_number <- display_page_number - (page_count_start - 1)
601 }
602
603 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200604 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200605 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200606 # Account for offset - we can only fetch from the remaining results after offset
607 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
608 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200609 } else {
610 # Otherwise fetch all remaining pages
611 total_pages - page_count_start + 1
612 }
613
614 # The total pages to be shown in this batch
615 batch_total_pages <- pages_in_this_batch
616
617 page_display <- paste0(
618 "Retrieved page ",
619 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
620 "/",
621 sprintf("%d", batch_total_pages)
622 )
623
624 # If randomized, also show which actual page we fetched
625 if (randomizePageOrder) {
626 # Determine the maximum width needed for page numbers (based on total pages)
627 # This ensures consistent alignment
628 max_page_width <- nchar(as.character(total_pages))
629 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200630 page_display <- paste0(
631 page_display,
632 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
633 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200634 }
635 # Always show the absolute page number and total pages (for clarity)
636 else {
637 # Show the absolute page number (out of total possible pages)
638 page_display <- paste0(page_display, sprintf(
639 " (page %d of %d total)",
640 actual_display_number, total_pages
641 ))
642 }
643
644 # Add caching or timing information
645 if (!is.null(res$meta$cached)) {
646 page_display <- paste0(page_display, " [cached]")
647 } else {
648 page_display <- paste0(
649 page_display,
650 " in ",
651 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200652 "s",
653 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200654 )
655 }
656
657 log_info(verbose, paste0(page_display, "\n"))
658
659 # Increment the appropriate counter based on mode
660 if (randomizePageOrder) {
661 page_index <- page_index + 1
662 } else {
663 current_page_number <- current_page_number + 1
664 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200665 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200666 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200667 break
668 }
669 }
Marc Kupietz68170952021-06-30 09:37:21 +0200670 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200671 KorAPQuery(
672 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200673 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200674 fields = kqo@fields,
675 requestUrl = kqo@requestUrl,
676 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200677 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200678 vc = kqo@vc,
679 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200680 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200681 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200682 collectedMatches = collectedMatches
683 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200684})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200685
686#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200687#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200688#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100689#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200690#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200691#' @param kqo object obtained from [corpusQuery()]
692#' @param verbose print progress information if true
693#' @param ... further arguments passed to [fetchNext()]
694#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200695#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200696#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200697#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200698#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
699#' q <- KorAPConnection() |>
700#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200701#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200702#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200703#'
704#' # Fetch also all KWICs
705#' q <- KorAPConnection() |> auth() |>
706#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
707#' fetchAll()
708#' q@collectedMatches
709#'
710#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
711#' q <- KorAPConnection() |>
712#' corpusQuery("<base/s=t>", # this matches each text once
713#' vc = "pubDate in 1958-03-12",
714#' fields = c("textSigle", "title"),
715#' ) |>
716#' fetchAll()
717#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100718#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200719#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200720#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200721#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200722setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
723 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200724})
725
726#' Fetches the remaining results of a KorAP query.
727#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200728#' @param kqo object obtained from [corpusQuery()]
729#' @param verbose print progress information if true
730#' @param ... further arguments passed to [fetchNext()]
731#' @return The updated `kqo` object with remaining results in `@collectedMatches`
732#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200733#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200734#' \dontrun{
735#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200736#' q <- KorAPConnection() |>
737#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200738#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200739#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100740#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200741#'
742#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200743#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200744setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
745 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200746})
747
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200748#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +0200749#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200750#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200751#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200752#' confidence intervals of one ore multiple search terms across one or multiple
753#' virtual corpora.
754#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200755#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +0200756#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +0200757#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200758#' \dontrun{
759#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200760#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +0200761#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100762#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200763#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200764# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +0100765#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200766#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
767#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +0200768#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
769#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200770#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +0200771#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200772#'
773#' @return A tibble, with each row containing the following result columns for query and vc combinations:
774#' - **query**: the query string used for the frequency analysis.
775#' - **totalResults**: absolute frequency of query matches in the vc.
776#' - **vc**: virtual corpus used for the query.
777#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
778#' - **total**: total number of words in vc.
779#' - **f**: relative frequency of query matches in the vc.
780#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
781#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
782
Marc Kupietzd8851222025-05-01 10:57:19 +0200783setMethod(
784 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100785 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200786 (if (as.alternatives) {
787 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100788 group_by(vc) %>%
789 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +0200790 } else {
791 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
792 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
793 }) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200794 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +0200795 }
796)
Marc Kupietz3f575282019-10-04 14:46:04 +0200797
Marc Kupietz38a9d682024-12-06 16:17:09 +0100798#' buildWebUIRequestUrlFromString
799#'
800#' @rdname KorAPQuery-class
801#' @importFrom urltools url_encode
802#' @export
803buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200804 query,
805 vc = "",
806 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100807 if ("KorAPConnection" %in% class(KorAPUrl)) {
808 KorAPUrl <- KorAPUrl@KorAPUrl
809 }
810
811 request <-
812 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +0200813 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100814 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +0200815 ifelse(vc != "",
816 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
817 ""
818 ),
819 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100820 ql
821 )
822 paste0(KorAPUrl, request)
823}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200824
825#' buildWebUIRequestUrl
826#'
827#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +0100828#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200829#' @export
830buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +0200831 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200832 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +0200833 } else {
834 httr2::url_parse(KorAPUrl)$query$q
835 },
Marc Kupietzf9129592025-01-26 19:17:54 +0100836 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200837 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +0100838 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100839 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200840}
841
Marc Kupietzd8851222025-05-01 10:57:19 +0200842#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +0200843#' @rdname KorAPQuery-class
844#' @param x KorAPQuery object
845#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100846#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +0200847#' @export
848format.KorAPQuery <- function(x, ...) {
849 cat("<KorAPQuery>\n")
850 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +0200851 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100852 cat(" Query: ", param$q, "\n")
853 if (!is.null(param$cq) && param$cq != "") {
854 cat(" Virtual corpus: ", param$cq, "\n")
855 }
856 if (!is.null(q@collectedMatches)) {
857 cat("==============================================================================================================", "\n")
858 print(summary(q@collectedMatches))
859 cat("==============================================================================================================", "\n")
860 }
861 cat(" Total results: ", q@totalResults, "\n")
862 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200863}
864
Marc Kupietze95108e2019-09-18 13:23:58 +0200865#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200866#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200867#' @rdname KorAPQuery-class
868#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200869#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200870setMethod("show", "KorAPQuery", function(object) {
871 format(object)
872})