blob: 45425b6e7d68c3542dc5c358b847e50360921317 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' This class provides methods to perform different kinds of queries on the KorAP API server.
Marc Kupietz67edcb52021-09-20 21:54:24 +02004#' `KorAPQuery` objects, which are typically created by the [corpusQuery()] method,
Marc Kupietza6e4ee62021-03-05 09:00:15 +01005#' represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02006#'
7#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02008#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01009#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +020010#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010011#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020012
Marc Kupietze95108e2019-09-18 13:23:58 +020013#' @export
14KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020015 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020016 "request",
17 "vc",
18 "totalResults",
19 "nextStartIndex",
20 "fields",
21 "requestUrl",
22 "webUIRequestUrl",
23 "apiResponse",
24 "collectedMatches",
25 "hasMoreMatches"
26))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020027
Marc Kupietze95108e2019-09-18 13:23:58 +020028#' Method initialize
29#'
30#' @rdname KorAPQuery-class
31#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020032#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020033#' @param request query part of the request URL
34#' @param vc definition of a virtual corpus
35#' @param totalResults number of hits the query has yielded
36#' @param nextStartIndex at what index to start the next fetch of query results
37#' @param fields what data / metadata fields should be collected
38#' @param requestUrl complete URL of the API request
39#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
40#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020041#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020042#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020043#'
44#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020045#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020046setMethod(
47 "initialize", "KorAPQuery",
48 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
49 "corpusSigle", "textSigle", "pubDate", "pubPlace",
50 "availability", "textClass", "snippet", "tokens"
51 ),
52 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
53 .Object <- callNextMethod()
54 .Object@korapConnection <- korapConnection
55 .Object@request <- request
56 .Object@vc <- vc
57 .Object@totalResults <- totalResults
58 .Object@nextStartIndex <- nextStartIndex
59 .Object@fields <- fields
60 .Object@requestUrl <- requestUrl
61 .Object@webUIRequestUrl <- webUIRequestUrl
62 .Object@apiResponse <- apiResponse
63 .Object@hasMoreMatches <- hasMoreMatches
64 .Object@collectedMatches <- collectedMatches
65 .Object
66 }
67)
Marc Kupietz632cbd42019-09-06 16:04:51 +020068
Marc Kupietzd8851222025-05-01 10:57:19 +020069setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
70setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
71setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
72setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
73setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020074
75maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020076
Marc Kupietz4de53ec2019-10-04 09:12:00 +020077## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010078utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020079
Marc Kupietzdbd431a2021-08-29 12:17:45 +020080#' Corpus query
81#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020082#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020083#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020084#' @rdname KorAPQuery-class
85#' @aliases corpusQuery
86#'
87#' @importFrom urltools url_encode
88#' @importFrom purrr pmap
89#' @importFrom dplyr bind_rows
90#'
Marc Kupietz617266d2025-02-27 10:43:07 +010091#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020092#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020093#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020094#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020095#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
96#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
97#' your RKorAPClient application as explained in the
98#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
99#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
100#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200101#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +0200102#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100103#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200104#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200105#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200106#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200107#' @param context string that specifies the size of the left and the right context returned in `snippet`
108#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
109#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
110#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
111#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
112#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200113#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200114#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
115#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200116#'
117#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200118#' \dontrun{
119#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200120#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietzd8851222025-05-01 10:57:19 +0200121#' KorAPConnection() %>%
122#' corpusQuery("Ameisenplage") %>%
123#' fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100124#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200125#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200126#' \dontrun{
127#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200128#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
129#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200130#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100131#' KorAPConnection(verbose = TRUE) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200132#' corpusQuery(
133#' KorAPUrl =
134#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
135#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200136#' }
137#'
138#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200139#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200140#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd8851222025-05-01 10:57:19 +0200141#' KorAPConnection(verbose = TRUE) %>%
142#' {
143#' . ->> kco
144#' } %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200145#' corpusQuery("Ameisenplage") %>%
146#' fetchAll() %>%
147#' slot("collectedMatches") %>%
148#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200149#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200150#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200151#' summarise(Count = dplyr::n()) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200152#' mutate(Freq = mapply(function(f, y) {
153#' f / corpusStats(kco, paste("pubDate in", y))@tokens
154#' }, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200155#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200156#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
157#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100158#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200159#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200160#'
161#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200162#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200163#'
164#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200165setMethod(
166 "corpusQuery", "KorAPConnection",
167 function(kco,
168 query = if (missing(KorAPUrl)) {
169 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
170 } else {
171 httr2::url_parse(KorAPUrl)$query$q
172 },
173 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
174 KorAPUrl,
175 metadataOnly = TRUE,
176 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
177 fields = c(
178 "corpusSigle",
179 "textSigle",
180 "pubDate",
181 "pubPlace",
182 "availability",
183 "textClass",
184 "snippet",
185 "tokens"
186 ),
187 accessRewriteFatal = TRUE,
188 verbose = kco@verbose,
189 expand = length(vc) != length(query),
190 as.df = FALSE,
191 context = NULL) {
192 if (length(query) > 1 || length(vc) > 1) {
193 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200194
195 # Initialize timing variables for ETA calculation
196 total_queries <- nrow(grid)
197 current_query <- 0
198 start_time <- Sys.time()
199
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200200 results <- purrr::pmap(grid, function(query, vc, ...) {
201 current_query <<- current_query + 1
202
203 # Execute the single query directly (avoiding recursive call)
204 contentFields <- c("snippet", "tokens")
205 query_fields <- fields
206 if (metadataOnly) {
207 query_fields <- query_fields[!query_fields %in% contentFields]
208 }
209 if (!"textSigle" %in% query_fields) {
210 query_fields <- c(query_fields, "textSigle")
211 }
212 request <-
213 paste0(
214 "?q=",
215 url_encode(enc2utf8(query)),
216 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
217 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
218 ifelse(!metadataOnly, "&show-tokens=true", ""),
219 "&ql=", ql
220 )
221 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
222 requestUrl <- paste0(
223 kco@apiUrl,
224 "search",
225 request,
226 "&fields=",
227 paste(query_fields, collapse = ","),
228 if (metadataOnly) "&access-rewrite-disabled=true" else ""
229 )
230
231 # Show individual query progress
232 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
233 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
234 if (is.null(res)) {
235 log_info(verbose, ": API call failed\n")
236 totalResults <- 0
237 } else {
238 totalResults <- as.integer(res$meta$totalResults)
239 log_info(verbose, ": ", totalResults, " hits")
240 if (!is.null(res$meta$cached)) {
241 log_info(verbose, " [cached]")
242 } else if (!is.null(res$meta$benchmark)) {
243 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
244 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
245 formatted_time <- paste0(round(time_value, 2), "s")
246 log_info(verbose, ", took ", formatted_time)
247 } else {
248 log_info(verbose, ", took ", res$meta$benchmark)
249 }
250 }
251 log_info(verbose, "\n")
252 }
253
254 result <- data.frame(
255 query = query,
256 totalResults = totalResults,
257 vc = vc,
258 webUIRequestUrl = webUIRequestUrl,
259 stringsAsFactors = FALSE
260 )
261
262 # Calculate and display ETA information if verbose and we have more than one query
263 if (verbose && total_queries > 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200264 eta_info <- calculate_eta(current_query, total_queries, start_time)
265 if (eta_info != "") {
266 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200267 avg_time_per_query <- elapsed_time / current_query
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200268
269 # Create progress display
270 progress_display <- paste0(
271 "Query ",
272 sprintf(paste0("%", nchar(total_queries), "d"), current_query),
273 "/",
274 sprintf("%d", total_queries),
275 " completed. Avg: ",
276 sprintf("%.1f", avg_time_per_query),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200277 "s/query",
278 eta_info
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200279 )
280
281 log_info(verbose, progress_display, "\n")
282 }
283 }
284
285 return(result)
286 })
287
288 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200289 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200290 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100291 if (metadataOnly) {
292 fields <- fields[!fields %in% contentFields]
293 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100294 if (!"textSigle" %in% fields) {
295 fields <- c(fields, "textSigle")
296 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100297 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200298 paste0(
299 "?q=",
300 url_encode(enc2utf8(query)),
301 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
302 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
303 ifelse(!metadataOnly, "&show-tokens=true", ""),
304 "&ql=", ql
305 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100306 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
307 requestUrl <- paste0(
308 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200309 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100310 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200311 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100312 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200313 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100314 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200315 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
316 sep =
317 ""
318 )
319 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100320 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100321 message("API call failed.")
322 totalResults <- 0
323 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200324 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200325 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200326 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200327 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200328 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz7638ca42025-05-25 13:18:16 +0200329 # Round the benchmark time to 2 decimal places for better readability
330 # If it's a string ending with 's', extract the number, round it, and re-add 's'
331 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
332 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
333 formatted_time <- paste0(round(time_value, 2), "s")
334 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
335 } else {
336 # Fallback if the format is different than expected
337 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
338 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200339 } else {
340 log_info(verbose, "\n")
341 }
Marc Kupietza4675722022-02-23 23:55:15 +0100342 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200343 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100344 data.frame(
345 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100346 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100347 vc = vc,
348 webUIRequestUrl = webUIRequestUrl,
349 stringsAsFactors = FALSE
350 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200351 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100352 KorAPQuery(
353 korapConnection = kco,
354 nextStartIndex = 0,
355 fields = fields,
356 requestUrl = requestUrl,
357 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100358 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100359 vc = vc,
360 apiResponse = res,
361 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100362 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100363 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200364 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100365 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200366 }
367)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200368
Marc Kupietz05a60792024-12-07 16:23:31 +0100369#' @importFrom purrr map
370repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200371 if (is.list(x)) {
372 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100373 paste(.x, collapse = " ")
374 } else {
375 .x
376 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200377 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100378 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200379 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100380}
381
Marc Kupietz62da2b52019-09-12 17:43:34 +0200382#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200383#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200384#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200385#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200386#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200387#' @param offset start offset for query results to fetch
388#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200389#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200390#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
391#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200392#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100393#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200394#' \dontrun{
395#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200396#' q <- KorAPConnection() %>%
397#' corpusQuery("Ameisenplage") %>%
398#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100399#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100400#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100401#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200402#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200403#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200404#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200405#' @aliases fetchNext
406#' @rdname KorAPQuery-class
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200407#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100408#' @importFrom tibble enframe add_column
409#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200410#' @importFrom tidyr unnest unchop pivot_wider
411#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200412#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200413setMethod("fetchNext", "KorAPQuery", function(kqo,
414 offset = kqo@nextStartIndex,
415 maxFetch = maxResultsPerPage,
416 verbose = kqo@korapConnection@verbose,
417 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100418 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200419 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100420
Marc Kupietze95108e2019-09-18 13:23:58 +0200421 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
422 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200423 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200424 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200425 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200426 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200427
Marc Kupietz623d7122025-05-25 12:46:12 +0200428 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200429 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200430 # Calculate how many pages we need to fetch based on maxFetch
431 total_pages_to_fetch <- if (!is.na(maxFetch)) {
432 # Either limited by maxFetch or total results, whichever is smaller
433 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
434 } else {
435 # All pages
436 ceiling(kqo@totalResults / maxResultsPerPage)
437 }
438
439 # Generate randomized page indices (0-based for API)
440 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
441 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200442 }
443
Marc Kupietzd8851222025-05-01 10:57:19 +0200444 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200445 collectedMatches <- data.frame()
446 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200447
448 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
449 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
450 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
451 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
452
453 # For sequential fetches, keep track of which global page we're on
454 # This is important for correctly showing page numbers in subsequent fetchNext calls
455 page_count_start <- current_page_number
456
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200457 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200458 # Determine which page to fetch next
459 if (randomizePageOrder) {
460 # In randomized mode, get the page from our randomized list using the page_index
461 # Make sure we don't exceed the array bounds
462 if (page_index > length(pages)) {
463 break # No more pages to fetch in randomized mode
464 }
465 current_offset_page <- pages[page_index]
466 # For display purposes in randomized mode, show which page out of the total we're fetching
467 display_page_number <- page_index
468 } else {
469 # In sequential mode, use the current_page_number to calculate the offset
470 current_offset_page <- (current_page_number - 1)
471 display_page_number <- current_page_number
472 }
473
474 # Calculate the actual offset in tokens
475 currentOffset <- current_offset_page * maxResultsPerPage
476
Marc Kupietzef0e9392025-06-18 12:21:49 +0200477 # Build the query with the appropriate count and offset using httr2
478 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
479
480 # Parse existing URL to preserve all query parameters
481 parsed_url <- httr2::url_parse(kqo@requestUrl)
482 existing_query <- parsed_url$query
483
484 # Add/update count and offset parameters
485 existing_query$count <- count_param
486 existing_query$offset <- currentOffset
487
488 # Rebuild the URL with all parameters
489 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200490 res <- apiCall(kqo@korapConnection, query)
491 if (length(res$matches) == 0) {
492 break
493 }
494
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200495 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100496 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100497 currentMatches <- res$matches$fields %>%
498 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
499 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200500 tidyr::unnest(cols = value) %>%
501 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200502 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200503 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200504 currentMatches$snippet <- res$matches$snippet
505 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100506 if ("tokens" %in% colnames(res$matches)) {
507 currentMatches$tokens <- res$matches$tokens
508 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200509 } else {
510 currentMatches <- res$matches
511 }
512
Marc Kupietze95108e2019-09-18 13:23:58 +0200513 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200514 if (!field %in% colnames(currentMatches)) {
515 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200516 }
517 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100518 currentMatches <- currentMatches %>%
519 select(kqo@fields) %>%
520 mutate(
Marc Kupietz0447da02025-01-08 20:51:09 +0100521 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100522 matchStart = as.integer(stringr::word(tmp_positions, 1)),
523 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
524 ) %>%
525 select(-tmp_positions)
526
Marc Kupietz62da2b52019-09-12 17:43:34 +0200527 if (!is.list(collectedMatches)) {
528 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200529 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200530 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200531 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200532
Marc Kupietz623d7122025-05-25 12:46:12 +0200533 # Get the actual items per page from the API response
534 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200535
Marc Kupietz623d7122025-05-25 12:46:12 +0200536 # Calculate total pages consistently using fixed maxResultsPerPage
537 # This ensures consistent page counting across the function
538 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
539
540 # Calculate the total pages based on what we've already fetched plus what we'll fetch
541 # This ensures the correct denominator is displayed for subsequent fetchNext calls
542
543 # Calculate the total number of pages for the entire result set
544 # This calculation is kept for reference and for showing in parentheses
Marc Kupietz669114b2025-05-02 22:02:20 +0200545
Marc Kupietzae9b6172025-05-02 15:50:01 +0200546 # Estimate remaining time
547 time_per_page <- NA
548 eta_str <- "N/A"
549 completion_time_str <- "N/A"
Marc Kupietzacbaab02025-05-01 10:56:35 +0200550
Marc Kupietzae9b6172025-05-02 15:50:01 +0200551 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
552 # benchmark looks like "0.123s"
553 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
554 if (!is.na(time_per_page)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200555 # Calculate remaining pages based on what we still need to fetch
556 if (!is.na(maxFetch)) {
557 # Use the same logic as page display calculation - account for offset
558 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
559 total_pages_this_batch <- min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
560 current_page_in_batch <- ceiling(nrow(collectedMatches) / maxResultsPerPage) + 1
561 remaining_pages <- max(0, total_pages_this_batch - current_page_in_batch)
Marc Kupietz623d7122025-05-25 12:46:12 +0200562 } else {
Marc Kupietz021663d2025-06-18 17:49:22 +0200563 # We need to fetch all results - calculate based on actual position
Marc Kupietz623d7122025-05-25 12:46:12 +0200564 if (randomizePageOrder) {
565 if (exists("pages") && length(pages) > 0) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200566 remaining_pages <- max(0, length(pages) - page_index)
Marc Kupietz623d7122025-05-25 12:46:12 +0200567 } else {
568 # Fallback to a reasonable default
569 remaining_pages <- 1
570 }
571 } else {
Marc Kupietz021663d2025-06-18 17:49:22 +0200572 # For sequential order, calculate remaining pages from current offset
573 current_absolute_page <- ceiling((currentOffset + maxResultsPerPage) / maxResultsPerPage)
574 remaining_pages <- max(0, total_pages - current_absolute_page)
Marc Kupietz623d7122025-05-25 12:46:12 +0200575 }
576 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200577
578 estimated_remaining_seconds <- remaining_pages * time_per_page
579 estimated_completion_time <- Sys.time() + estimated_remaining_seconds
580
Marc Kupietze8c8e1a2025-06-19 17:37:59 +0200581 # Format time nicely using centralized function from logging.R
Marc Kupietzae9b6172025-05-02 15:50:01 +0200582
583 eta_str <- format_duration(estimated_remaining_seconds)
584 completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
Marc Kupietzacbaab02025-05-01 10:56:35 +0200585 }
Marc Kupietzacbaab02025-05-01 10:56:35 +0200586 }
587
Marc Kupietz623d7122025-05-25 12:46:12 +0200588 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200589
Marc Kupietz623d7122025-05-25 12:46:12 +0200590 # For global page tracking, calculate the absolute page number
591 actual_display_number <- if (randomizePageOrder) {
592 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
593 } else {
594 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
595 current_offset_page + 1
596 }
597
598 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
599 # based on the current batch being fetched
600
601 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
602 # Simply count from 1 within the current batch
603
604 # The relative page number is simply the current position in this batch
605 if (randomizePageOrder) {
606 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
607 } else {
608 relative_page_number <- display_page_number - (page_count_start - 1)
609 }
610
611 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200612 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200613 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200614 # Account for offset - we can only fetch from the remaining results after offset
615 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
616 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200617 } else {
618 # Otherwise fetch all remaining pages
619 total_pages - page_count_start + 1
620 }
621
622 # The total pages to be shown in this batch
623 batch_total_pages <- pages_in_this_batch
624
625 page_display <- paste0(
626 "Retrieved page ",
627 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
628 "/",
629 sprintf("%d", batch_total_pages)
630 )
631
632 # If randomized, also show which actual page we fetched
633 if (randomizePageOrder) {
634 # Determine the maximum width needed for page numbers (based on total pages)
635 # This ensures consistent alignment
636 max_page_width <- nchar(as.character(total_pages))
637 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200638 page_display <- paste0(
639 page_display,
640 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
641 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200642 }
643 # Always show the absolute page number and total pages (for clarity)
644 else {
645 # Show the absolute page number (out of total possible pages)
646 page_display <- paste0(page_display, sprintf(
647 " (page %d of %d total)",
648 actual_display_number, total_pages
649 ))
650 }
651
652 # Add caching or timing information
653 if (!is.null(res$meta$cached)) {
654 page_display <- paste0(page_display, " [cached]")
655 } else {
656 page_display <- paste0(
657 page_display,
658 " in ",
659 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
660 "s. ETA: ",
661 # Display ETA for both randomized and sequential modes
662 eta_str,
663 # Show completion time for both modes
664 paste0(" (", completion_time_str, ")")
665 )
666 }
667
668 log_info(verbose, paste0(page_display, "\n"))
669
670 # Increment the appropriate counter based on mode
671 if (randomizePageOrder) {
672 page_index <- page_index + 1
673 } else {
674 current_page_number <- current_page_number + 1
675 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200676 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200677 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200678 break
679 }
680 }
Marc Kupietz68170952021-06-30 09:37:21 +0200681 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200682 KorAPQuery(
683 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200684 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200685 fields = kqo@fields,
686 requestUrl = kqo@requestUrl,
687 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200688 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200689 vc = kqo@vc,
690 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200691 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200692 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200693 collectedMatches = collectedMatches
694 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200695})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200696
697#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200698#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200699#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100700#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200701#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200702#' \dontrun{
703#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200704#' q <- KorAPConnection() %>%
705#' corpusQuery("Ameisenplage") %>%
706#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200707#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100708#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200709#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200710#' @aliases fetchAll
711#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200712#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200713setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
714 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200715})
716
717#' Fetches the remaining results of a KorAP query.
718#'
719#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200720#' \dontrun{
721#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200722#' q <- KorAPConnection() %>%
723#' corpusQuery("Ameisenplage") %>%
724#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200725#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100726#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200727#'
728#' @aliases fetchRest
729#' @rdname KorAPQuery-class
730#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200731setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
732 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200733})
734
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200735#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +0200736#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200737#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200738#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200739#' confidence intervals of one ore multiple search terms across one or multiple
740#' virtual corpora.
741#'
742#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +0200743#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200744#' \dontrun{
745#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200746#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +0200747#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100748#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200749#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200750# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +0100751#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200752#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
753#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +0200754#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
755#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200756#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +0200757#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200758#'
759#' @return A tibble, with each row containing the following result columns for query and vc combinations:
760#' - **query**: the query string used for the frequency analysis.
761#' - **totalResults**: absolute frequency of query matches in the vc.
762#' - **vc**: virtual corpus used for the query.
763#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
764#' - **total**: total number of words in vc.
765#' - **f**: relative frequency of query matches in the vc.
766#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
767#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
768
Marc Kupietzd8851222025-05-01 10:57:19 +0200769setMethod(
770 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100771 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200772 (if (as.alternatives) {
773 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100774 group_by(vc) %>%
775 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +0200776 } else {
777 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
778 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
779 }) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200780 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +0200781 }
782)
Marc Kupietz3f575282019-10-04 14:46:04 +0200783
Marc Kupietz38a9d682024-12-06 16:17:09 +0100784#' buildWebUIRequestUrlFromString
785#'
786#' @rdname KorAPQuery-class
787#' @importFrom urltools url_encode
788#' @export
789buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200790 query,
791 vc = "",
792 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100793 if ("KorAPConnection" %in% class(KorAPUrl)) {
794 KorAPUrl <- KorAPUrl@KorAPUrl
795 }
796
797 request <-
798 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +0200799 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100800 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +0200801 ifelse(vc != "",
802 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
803 ""
804 ),
805 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100806 ql
807 )
808 paste0(KorAPUrl, request)
809}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200810
811#' buildWebUIRequestUrl
812#'
813#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +0100814#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200815#' @export
816buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +0200817 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200818 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +0200819 } else {
820 httr2::url_parse(KorAPUrl)$query$q
821 },
Marc Kupietzf9129592025-01-26 19:17:54 +0100822 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200823 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +0100824 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100825 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200826}
827
Marc Kupietzd8851222025-05-01 10:57:19 +0200828#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +0200829#' @rdname KorAPQuery-class
830#' @param x KorAPQuery object
831#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100832#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +0200833#' @export
834format.KorAPQuery <- function(x, ...) {
835 cat("<KorAPQuery>\n")
836 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +0200837 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100838 cat(" Query: ", param$q, "\n")
839 if (!is.null(param$cq) && param$cq != "") {
840 cat(" Virtual corpus: ", param$cq, "\n")
841 }
842 if (!is.null(q@collectedMatches)) {
843 cat("==============================================================================================================", "\n")
844 print(summary(q@collectedMatches))
845 cat("==============================================================================================================", "\n")
846 }
847 cat(" Total results: ", q@totalResults, "\n")
848 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200849}
850
Marc Kupietze95108e2019-09-18 13:23:58 +0200851#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200852#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200853#' @rdname KorAPQuery-class
854#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200855#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200856setMethod("show", "KorAPQuery", function(object) {
857 format(object)
858})