blob: 93f545813fb34c3f3cf9794ea635e697463fb5dd [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' This class provides methods to perform different kinds of queries on the KorAP API server.
Marc Kupietz67edcb52021-09-20 21:54:24 +02004#' `KorAPQuery` objects, which are typically created by the [corpusQuery()] method,
Marc Kupietza6e4ee62021-03-05 09:00:15 +01005#' represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02006#'
7#' @include KorAPConnection.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietze95108e2019-09-18 13:23:58 +020027#' Method initialize
28#'
29#' @rdname KorAPQuery-class
30#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020042#'
43#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020044#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020045setMethod(
46 "initialize", "KorAPQuery",
47 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
48 "corpusSigle", "textSigle", "pubDate", "pubPlace",
49 "availability", "textClass", "snippet", "tokens"
50 ),
51 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
52 .Object <- callNextMethod()
53 .Object@korapConnection <- korapConnection
54 .Object@request <- request
55 .Object@vc <- vc
56 .Object@totalResults <- totalResults
57 .Object@nextStartIndex <- nextStartIndex
58 .Object@fields <- fields
59 .Object@requestUrl <- requestUrl
60 .Object@webUIRequestUrl <- webUIRequestUrl
61 .Object@apiResponse <- apiResponse
62 .Object@hasMoreMatches <- hasMoreMatches
63 .Object@collectedMatches <- collectedMatches
64 .Object
65 }
66)
Marc Kupietz632cbd42019-09-06 16:04:51 +020067
Marc Kupietzd8851222025-05-01 10:57:19 +020068setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
69setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
70setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
71setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
72setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020073
74maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020075
Marc Kupietz4de53ec2019-10-04 09:12:00 +020076## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010077utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020078
Marc Kupietzdbd431a2021-08-29 12:17:45 +020079#' Corpus query
80#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020081#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020082#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' @rdname KorAPQuery-class
84#' @aliases corpusQuery
85#'
86#' @importFrom urltools url_encode
87#' @importFrom purrr pmap
88#' @importFrom dplyr bind_rows
89#'
Marc Kupietz617266d2025-02-27 10:43:07 +010090#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020091#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020092#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020093#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020094#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
95#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
96#' your RKorAPClient application as explained in the
97#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
98#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
99#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200100#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +0200101#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100102#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200103#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200104#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200105#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200106#' @param context string that specifies the size of the left and the right context returned in `snippet`
107#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
108#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
109#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
110#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
111#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200112#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200113#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
114#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200115#'
116#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200117#' \dontrun{
118#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200119#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietzd8851222025-05-01 10:57:19 +0200120#' KorAPConnection() %>%
121#' corpusQuery("Ameisenplage") %>%
122#' fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100123#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200124#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200125#' \dontrun{
126#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200127#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
128#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200129#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100130#' KorAPConnection(verbose = TRUE) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200131#' corpusQuery(
132#' KorAPUrl =
133#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
134#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200135#' }
136#'
137#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200138#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200139#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd8851222025-05-01 10:57:19 +0200140#' KorAPConnection(verbose = TRUE) %>%
141#' {
142#' . ->> kco
143#' } %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200144#' corpusQuery("Ameisenplage") %>%
145#' fetchAll() %>%
146#' slot("collectedMatches") %>%
147#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200148#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200149#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200150#' summarise(Count = dplyr::n()) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200151#' mutate(Freq = mapply(function(f, y) {
152#' f / corpusStats(kco, paste("pubDate in", y))@tokens
153#' }, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200154#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200155#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
156#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100157#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200158#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200159#'
160#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200161#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200162#'
163#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200164setMethod(
165 "corpusQuery", "KorAPConnection",
166 function(kco,
167 query = if (missing(KorAPUrl)) {
168 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
169 } else {
170 httr2::url_parse(KorAPUrl)$query$q
171 },
172 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
173 KorAPUrl,
174 metadataOnly = TRUE,
175 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
176 fields = c(
177 "corpusSigle",
178 "textSigle",
179 "pubDate",
180 "pubPlace",
181 "availability",
182 "textClass",
183 "snippet",
184 "tokens"
185 ),
186 accessRewriteFatal = TRUE,
187 verbose = kco@verbose,
188 expand = length(vc) != length(query),
189 as.df = FALSE,
190 context = NULL) {
191 if (length(query) > 1 || length(vc) > 1) {
192 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200193
194 # Initialize timing variables for ETA calculation
195 total_queries <- nrow(grid)
196 current_query <- 0
197 start_time <- Sys.time()
198
199 # Helper function to format duration
200 format_duration <- function(seconds) {
201 if (is.na(seconds) || seconds < 0) {
202 return("00s")
203 }
204 days <- floor(seconds / (24 * 3600))
205 seconds <- seconds %% (24 * 3600)
206 hours <- floor(seconds / 3600)
207 seconds <- seconds %% 3600
208 minutes <- floor(seconds / 60)
209 seconds <- floor(seconds %% 60)
210 paste0(
211 if (days > 0) paste0(days, "d ") else "",
212 if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
213 if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
214 paste0(sprintf("%02d", seconds), "s")
215 )
216 }
217
218 results <- purrr::pmap(grid, function(query, vc, ...) {
219 current_query <<- current_query + 1
220
221 # Execute the single query directly (avoiding recursive call)
222 contentFields <- c("snippet", "tokens")
223 query_fields <- fields
224 if (metadataOnly) {
225 query_fields <- query_fields[!query_fields %in% contentFields]
226 }
227 if (!"textSigle" %in% query_fields) {
228 query_fields <- c(query_fields, "textSigle")
229 }
230 request <-
231 paste0(
232 "?q=",
233 url_encode(enc2utf8(query)),
234 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
235 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
236 ifelse(!metadataOnly, "&show-tokens=true", ""),
237 "&ql=", ql
238 )
239 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
240 requestUrl <- paste0(
241 kco@apiUrl,
242 "search",
243 request,
244 "&fields=",
245 paste(query_fields, collapse = ","),
246 if (metadataOnly) "&access-rewrite-disabled=true" else ""
247 )
248
249 # Show individual query progress
250 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
251 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
252 if (is.null(res)) {
253 log_info(verbose, ": API call failed\n")
254 totalResults <- 0
255 } else {
256 totalResults <- as.integer(res$meta$totalResults)
257 log_info(verbose, ": ", totalResults, " hits")
258 if (!is.null(res$meta$cached)) {
259 log_info(verbose, " [cached]")
260 } else if (!is.null(res$meta$benchmark)) {
261 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
262 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
263 formatted_time <- paste0(round(time_value, 2), "s")
264 log_info(verbose, ", took ", formatted_time)
265 } else {
266 log_info(verbose, ", took ", res$meta$benchmark)
267 }
268 }
269 log_info(verbose, "\n")
270 }
271
272 result <- data.frame(
273 query = query,
274 totalResults = totalResults,
275 vc = vc,
276 webUIRequestUrl = webUIRequestUrl,
277 stringsAsFactors = FALSE
278 )
279
280 # Calculate and display ETA information if verbose and we have more than one query
281 if (verbose && total_queries > 1) {
282 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
283
284 if (current_query > 1) { # Only calculate ETA after the first query
285 avg_time_per_query <- elapsed_time / current_query
286 remaining_queries <- total_queries - current_query
287 estimated_remaining_seconds <- remaining_queries * avg_time_per_query
288 estimated_completion_time <- Sys.time() + estimated_remaining_seconds
289
290 eta_str <- format_duration(estimated_remaining_seconds)
291 completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
292
293 # Create progress display
294 progress_display <- paste0(
295 "Query ",
296 sprintf(paste0("%", nchar(total_queries), "d"), current_query),
297 "/",
298 sprintf("%d", total_queries),
299 " completed. Avg: ",
300 sprintf("%.1f", avg_time_per_query),
301 "s/query. ETA: ",
302 eta_str,
303 " (", completion_time_str, ")"
304 )
305
306 log_info(verbose, progress_display, "\n")
307 }
308 }
309
310 return(result)
311 })
312
313 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200314 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200315 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100316 if (metadataOnly) {
317 fields <- fields[!fields %in% contentFields]
318 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100319 if (!"textSigle" %in% fields) {
320 fields <- c(fields, "textSigle")
321 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100322 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200323 paste0(
324 "?q=",
325 url_encode(enc2utf8(query)),
326 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
327 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
328 ifelse(!metadataOnly, "&show-tokens=true", ""),
329 "&ql=", ql
330 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100331 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
332 requestUrl <- paste0(
333 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200334 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100335 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200336 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100337 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200338 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100339 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200340 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
341 sep =
342 ""
343 )
344 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100345 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100346 message("API call failed.")
347 totalResults <- 0
348 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200349 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200350 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200351 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200352 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200353 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz7638ca42025-05-25 13:18:16 +0200354 # Round the benchmark time to 2 decimal places for better readability
355 # If it's a string ending with 's', extract the number, round it, and re-add 's'
356 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
357 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
358 formatted_time <- paste0(round(time_value, 2), "s")
359 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
360 } else {
361 # Fallback if the format is different than expected
362 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
363 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200364 } else {
365 log_info(verbose, "\n")
366 }
Marc Kupietza4675722022-02-23 23:55:15 +0100367 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200368 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100369 data.frame(
370 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100371 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100372 vc = vc,
373 webUIRequestUrl = webUIRequestUrl,
374 stringsAsFactors = FALSE
375 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200376 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100377 KorAPQuery(
378 korapConnection = kco,
379 nextStartIndex = 0,
380 fields = fields,
381 requestUrl = requestUrl,
382 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100383 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100384 vc = vc,
385 apiResponse = res,
386 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100387 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100388 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200389 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100390 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200391 }
392)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200393
Marc Kupietz05a60792024-12-07 16:23:31 +0100394#' @importFrom purrr map
395repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200396 if (is.list(x)) {
397 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100398 paste(.x, collapse = " ")
399 } else {
400 .x
401 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200402 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100403 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200404 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100405}
406
Marc Kupietz62da2b52019-09-12 17:43:34 +0200407#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200408#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200409#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200410#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200411#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200412#' @param offset start offset for query results to fetch
413#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200414#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200415#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
416#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200417#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100418#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200419#' \dontrun{
420#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200421#' q <- KorAPConnection() %>%
422#' corpusQuery("Ameisenplage") %>%
423#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100424#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100425#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100426#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200427#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200428#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200429#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200430#' @aliases fetchNext
431#' @rdname KorAPQuery-class
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200432#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100433#' @importFrom tibble enframe add_column
434#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200435#' @importFrom tidyr unnest unchop pivot_wider
436#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200437#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200438setMethod("fetchNext", "KorAPQuery", function(kqo,
439 offset = kqo@nextStartIndex,
440 maxFetch = maxResultsPerPage,
441 verbose = kqo@korapConnection@verbose,
442 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100443 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200444 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100445
Marc Kupietze95108e2019-09-18 13:23:58 +0200446 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
447 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200448 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200449 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200450 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200451 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200452
Marc Kupietz623d7122025-05-25 12:46:12 +0200453 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200454 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200455 # Calculate how many pages we need to fetch based on maxFetch
456 total_pages_to_fetch <- if (!is.na(maxFetch)) {
457 # Either limited by maxFetch or total results, whichever is smaller
458 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
459 } else {
460 # All pages
461 ceiling(kqo@totalResults / maxResultsPerPage)
462 }
463
464 # Generate randomized page indices (0-based for API)
465 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
466 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200467 }
468
Marc Kupietzd8851222025-05-01 10:57:19 +0200469 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200470 collectedMatches <- data.frame()
471 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200472
473 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
474 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
475 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
476 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
477
478 # For sequential fetches, keep track of which global page we're on
479 # This is important for correctly showing page numbers in subsequent fetchNext calls
480 page_count_start <- current_page_number
481
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200482 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200483 # Determine which page to fetch next
484 if (randomizePageOrder) {
485 # In randomized mode, get the page from our randomized list using the page_index
486 # Make sure we don't exceed the array bounds
487 if (page_index > length(pages)) {
488 break # No more pages to fetch in randomized mode
489 }
490 current_offset_page <- pages[page_index]
491 # For display purposes in randomized mode, show which page out of the total we're fetching
492 display_page_number <- page_index
493 } else {
494 # In sequential mode, use the current_page_number to calculate the offset
495 current_offset_page <- (current_page_number - 1)
496 display_page_number <- current_page_number
497 }
498
499 # Calculate the actual offset in tokens
500 currentOffset <- current_offset_page * maxResultsPerPage
501
502 # Build the query with the appropriate count and offset
Marc Kupietzd8851222025-05-01 10:57:19 +0200503 query <- paste0(kqo@requestUrl, "&count=", min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage), "&offset=", currentOffset, "&cutoff=true")
Marc Kupietz68170952021-06-30 09:37:21 +0200504 res <- apiCall(kqo@korapConnection, query)
505 if (length(res$matches) == 0) {
506 break
507 }
508
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200509 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100510 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100511 currentMatches <- res$matches$fields %>%
512 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
513 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200514 tidyr::unnest(cols = value) %>%
515 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200516 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200517 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200518 currentMatches$snippet <- res$matches$snippet
519 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100520 if ("tokens" %in% colnames(res$matches)) {
521 currentMatches$tokens <- res$matches$tokens
522 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200523 } else {
524 currentMatches <- res$matches
525 }
526
Marc Kupietze95108e2019-09-18 13:23:58 +0200527 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200528 if (!field %in% colnames(currentMatches)) {
529 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200530 }
531 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100532 currentMatches <- currentMatches %>%
533 select(kqo@fields) %>%
534 mutate(
Marc Kupietz0447da02025-01-08 20:51:09 +0100535 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100536 matchStart = as.integer(stringr::word(tmp_positions, 1)),
537 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
538 ) %>%
539 select(-tmp_positions)
540
Marc Kupietz62da2b52019-09-12 17:43:34 +0200541 if (!is.list(collectedMatches)) {
542 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200543 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200544 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200545 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200546
Marc Kupietz623d7122025-05-25 12:46:12 +0200547 # Get the actual items per page from the API response
548 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200549
Marc Kupietz623d7122025-05-25 12:46:12 +0200550 # Calculate total pages consistently using fixed maxResultsPerPage
551 # This ensures consistent page counting across the function
552 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
553
554 # Calculate the total pages based on what we've already fetched plus what we'll fetch
555 # This ensures the correct denominator is displayed for subsequent fetchNext calls
556
557 # Calculate the total number of pages for the entire result set
558 # This calculation is kept for reference and for showing in parentheses
Marc Kupietz669114b2025-05-02 22:02:20 +0200559
Marc Kupietzae9b6172025-05-02 15:50:01 +0200560 # Estimate remaining time
561 time_per_page <- NA
562 eta_str <- "N/A"
563 completion_time_str <- "N/A"
Marc Kupietzacbaab02025-05-01 10:56:35 +0200564
Marc Kupietzae9b6172025-05-02 15:50:01 +0200565 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
566 # benchmark looks like "0.123s"
567 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
568 if (!is.na(time_per_page)) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200569 # First determine our current global position for ETA calculation
570 current_global_position <- if (randomizePageOrder) {
571 # In randomized mode, this is how many pages we've processed so far in this batch
572 page_index - 1 # -1 because we're calculating remaining
573 } else {
574 page_count_start + (current_page_number - 1) - 1 # -1 because we're calculating remaining
575 }
576
577 # Calculate remaining pages based on maxFetch if specified
578 if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
579 # We need to fetch up to maxFetch results
580 remaining_items_to_fetch <- maxFetch - nrow(collectedMatches)
581 remaining_pages <- ceiling(remaining_items_to_fetch / maxResultsPerPage)
582 } else {
583 # We need to fetch all results - account for our actual global position
584 # For randomized order, calculate remaining pages based on the randomized list or maxFetch
585 if (randomizePageOrder) {
586 if (exists("pages") && length(pages) > 0) {
587 remaining_pages <- length(pages) - page_index
588 } else if (!is.na(maxFetch)) {
589 # If pages is not available, use maxFetch to estimate remaining pages
590 remaining_pages <- ceiling(maxFetch / maxResultsPerPage) - page_index
591 } else {
592 # Fallback to a reasonable default
593 remaining_pages <- 1
594 }
595 } else {
596 # For sequential order, use the current global position
597 remaining_pages <- total_pages - current_global_position
598 }
599 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200600
601 estimated_remaining_seconds <- remaining_pages * time_per_page
602 estimated_completion_time <- Sys.time() + estimated_remaining_seconds
603
604 # Format time nicely
605 format_duration <- function(seconds) {
606 if (is.na(seconds) || seconds < 0) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200607 # Instead of "N/A", return "00s" as a fallback
608 return("00s")
Marc Kupietzae9b6172025-05-02 15:50:01 +0200609 }
610 days <- floor(seconds / (24 * 3600))
611 seconds <- seconds %% (24 * 3600)
612 hours <- floor(seconds / 3600)
613 seconds <- seconds %% 3600
614 minutes <- floor(seconds / 60)
615 seconds <- floor(seconds %% 60)
616 paste0(
617 if (days > 0) paste0(days, "d ") else "",
618 if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
619 if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
620 paste0(sprintf("%02d", seconds), "s")
621 )
622 }
623
624 eta_str <- format_duration(estimated_remaining_seconds)
625 completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
Marc Kupietzacbaab02025-05-01 10:56:35 +0200626 }
Marc Kupietzacbaab02025-05-01 10:56:35 +0200627 }
628
Marc Kupietz623d7122025-05-25 12:46:12 +0200629 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200630
Marc Kupietz623d7122025-05-25 12:46:12 +0200631 # For global page tracking, calculate the absolute page number
632 actual_display_number <- if (randomizePageOrder) {
633 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
634 } else {
635 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
636 current_offset_page + 1
637 }
638
639 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
640 # based on the current batch being fetched
641
642 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
643 # Simply count from 1 within the current batch
644
645 # The relative page number is simply the current position in this batch
646 if (randomizePageOrder) {
647 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
648 } else {
649 relative_page_number <- display_page_number - (page_count_start - 1)
650 }
651
652 # How many pages will we fetch in this batch?
653 # If maxFetch is specified, calculate based on it
654 pages_in_this_batch <- if (!is.na(maxFetch)) {
655 ceiling(maxFetch / maxResultsPerPage)
656 } else {
657 # Otherwise fetch all remaining pages
658 total_pages - page_count_start + 1
659 }
660
661 # The total pages to be shown in this batch
662 batch_total_pages <- pages_in_this_batch
663
664 page_display <- paste0(
665 "Retrieved page ",
666 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
667 "/",
668 sprintf("%d", batch_total_pages)
669 )
670
671 # If randomized, also show which actual page we fetched
672 if (randomizePageOrder) {
673 # Determine the maximum width needed for page numbers (based on total pages)
674 # This ensures consistent alignment
675 max_page_width <- nchar(as.character(total_pages))
676 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200677 page_display <- paste0(
678 page_display,
679 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
680 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200681 }
682 # Always show the absolute page number and total pages (for clarity)
683 else {
684 # Show the absolute page number (out of total possible pages)
685 page_display <- paste0(page_display, sprintf(
686 " (page %d of %d total)",
687 actual_display_number, total_pages
688 ))
689 }
690
691 # Add caching or timing information
692 if (!is.null(res$meta$cached)) {
693 page_display <- paste0(page_display, " [cached]")
694 } else {
695 page_display <- paste0(
696 page_display,
697 " in ",
698 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
699 "s. ETA: ",
700 # Display ETA for both randomized and sequential modes
701 eta_str,
702 # Show completion time for both modes
703 paste0(" (", completion_time_str, ")")
704 )
705 }
706
707 log_info(verbose, paste0(page_display, "\n"))
708
709 # Increment the appropriate counter based on mode
710 if (randomizePageOrder) {
711 page_index <- page_index + 1
712 } else {
713 current_page_number <- current_page_number + 1
714 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200715 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200716 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200717 break
718 }
719 }
Marc Kupietz68170952021-06-30 09:37:21 +0200720 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200721 KorAPQuery(
722 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200723 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200724 fields = kqo@fields,
725 requestUrl = kqo@requestUrl,
726 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200727 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200728 vc = kqo@vc,
729 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200730 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200731 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200732 collectedMatches = collectedMatches
733 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200734})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200735
736#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200737#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200738#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100739#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200740#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200741#' \dontrun{
742#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200743#' q <- KorAPConnection() %>%
744#' corpusQuery("Ameisenplage") %>%
745#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200746#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100747#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200748#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200749#' @aliases fetchAll
750#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200751#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200752setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
753 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200754})
755
756#' Fetches the remaining results of a KorAP query.
757#'
758#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200759#' \dontrun{
760#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200761#' q <- KorAPConnection() %>%
762#' corpusQuery("Ameisenplage") %>%
763#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200764#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100765#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200766#'
767#' @aliases fetchRest
768#' @rdname KorAPQuery-class
769#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200770setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
771 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200772})
773
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200774#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +0200775#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200776#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200777#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200778#' confidence intervals of one ore multiple search terms across one or multiple
779#' virtual corpora.
780#'
781#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +0200782#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200783#' \dontrun{
784#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200785#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +0200786#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100787#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200788#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200789# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +0100790#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200791#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
792#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +0200793#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
794#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200795#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +0200796#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200797#'
798#' @return A tibble, with each row containing the following result columns for query and vc combinations:
799#' - **query**: the query string used for the frequency analysis.
800#' - **totalResults**: absolute frequency of query matches in the vc.
801#' - **vc**: virtual corpus used for the query.
802#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
803#' - **total**: total number of words in vc.
804#' - **f**: relative frequency of query matches in the vc.
805#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
806#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
807
Marc Kupietzd8851222025-05-01 10:57:19 +0200808setMethod(
809 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100810 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200811 (if (as.alternatives) {
812 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100813 group_by(vc) %>%
814 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +0200815 } else {
816 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
817 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
818 }) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200819 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +0200820 }
821)
Marc Kupietz3f575282019-10-04 14:46:04 +0200822
Marc Kupietz38a9d682024-12-06 16:17:09 +0100823#' buildWebUIRequestUrlFromString
824#'
825#' @rdname KorAPQuery-class
826#' @importFrom urltools url_encode
827#' @export
828buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200829 query,
830 vc = "",
831 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100832 if ("KorAPConnection" %in% class(KorAPUrl)) {
833 KorAPUrl <- KorAPUrl@KorAPUrl
834 }
835
836 request <-
837 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +0200838 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100839 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +0200840 ifelse(vc != "",
841 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
842 ""
843 ),
844 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100845 ql
846 )
847 paste0(KorAPUrl, request)
848}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200849
850#' buildWebUIRequestUrl
851#'
852#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +0100853#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200854#' @export
855buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +0200856 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200857 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +0200858 } else {
859 httr2::url_parse(KorAPUrl)$query$q
860 },
Marc Kupietzf9129592025-01-26 19:17:54 +0100861 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200862 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +0100863 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100864 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200865}
866
Marc Kupietzd8851222025-05-01 10:57:19 +0200867#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +0200868#' @rdname KorAPQuery-class
869#' @param x KorAPQuery object
870#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100871#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +0200872#' @export
873format.KorAPQuery <- function(x, ...) {
874 cat("<KorAPQuery>\n")
875 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +0200876 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100877 cat(" Query: ", param$q, "\n")
878 if (!is.null(param$cq) && param$cq != "") {
879 cat(" Virtual corpus: ", param$cq, "\n")
880 }
881 if (!is.null(q@collectedMatches)) {
882 cat("==============================================================================================================", "\n")
883 print(summary(q@collectedMatches))
884 cat("==============================================================================================================", "\n")
885 }
886 cat(" Total results: ", q@totalResults, "\n")
887 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200888}
889
Marc Kupietze95108e2019-09-18 13:23:58 +0200890#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200891#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200892#' @rdname KorAPQuery-class
893#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200894#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200895setMethod("show", "KorAPQuery", function(object) {
896 format(object)
897})