blob: 9b77c39b0c0421ab8d8deeda5636e8d98dbf6a51 [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
50 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
51 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
71setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020072
73maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020074
Marc Kupietz4de53ec2019-10-04 09:12:00 +020075## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010076utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020077
Marc Kupietza8c40f42025-06-24 15:49:52 +020078#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020079#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020080#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020081#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020082#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' @aliases corpusQuery
84#'
85#' @importFrom urltools url_encode
86#' @importFrom purrr pmap
87#' @importFrom dplyr bind_rows
88#'
Marc Kupietz617266d2025-02-27 10:43:07 +010089#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020090#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020091#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020092#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020093#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
94#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
95#' your RKorAPClient application as explained in the
96#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
97#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
98#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +020099#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +0200100#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100101#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200102#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200103#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200104#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200105#' @param context string that specifies the size of the left and the right context returned in `snippet`
106#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
107#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
108#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
109#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
110#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200111#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200112#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
113#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200114#'
115#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200116#' \dontrun{
117#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200118#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietzd8851222025-05-01 10:57:19 +0200119#' KorAPConnection() %>%
120#' corpusQuery("Ameisenplage") %>%
121#' fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100122#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200123#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200124#' \dontrun{
125#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200126#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
127#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200128#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100129#' KorAPConnection(verbose = TRUE) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200130#' corpusQuery(
131#' KorAPUrl =
132#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
133#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200134#' }
135#'
136#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200137#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200138#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd8851222025-05-01 10:57:19 +0200139#' KorAPConnection(verbose = TRUE) %>%
140#' {
141#' . ->> kco
142#' } %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200143#' corpusQuery("Ameisenplage") %>%
144#' fetchAll() %>%
145#' slot("collectedMatches") %>%
146#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200147#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200148#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200149#' summarise(Count = dplyr::n()) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200150#' mutate(Freq = mapply(function(f, y) {
151#' f / corpusStats(kco, paste("pubDate in", y))@tokens
152#' }, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200153#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200154#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
155#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100156#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200157#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200158#'
159#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200160#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200161#'
162#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200163setMethod(
164 "corpusQuery", "KorAPConnection",
165 function(kco,
166 query = if (missing(KorAPUrl)) {
167 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
168 } else {
169 httr2::url_parse(KorAPUrl)$query$q
170 },
171 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
172 KorAPUrl,
173 metadataOnly = TRUE,
174 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
175 fields = c(
176 "corpusSigle",
177 "textSigle",
178 "pubDate",
179 "pubPlace",
180 "availability",
181 "textClass",
182 "snippet",
183 "tokens"
184 ),
185 accessRewriteFatal = TRUE,
186 verbose = kco@verbose,
187 expand = length(vc) != length(query),
188 as.df = FALSE,
189 context = NULL) {
190 if (length(query) > 1 || length(vc) > 1) {
191 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200192
193 # Initialize timing variables for ETA calculation
194 total_queries <- nrow(grid)
195 current_query <- 0
196 start_time <- Sys.time()
197
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200198 results <- purrr::pmap(grid, function(query, vc, ...) {
199 current_query <<- current_query + 1
200
201 # Execute the single query directly (avoiding recursive call)
202 contentFields <- c("snippet", "tokens")
203 query_fields <- fields
204 if (metadataOnly) {
205 query_fields <- query_fields[!query_fields %in% contentFields]
206 }
207 if (!"textSigle" %in% query_fields) {
208 query_fields <- c(query_fields, "textSigle")
209 }
210 request <-
211 paste0(
212 "?q=",
213 url_encode(enc2utf8(query)),
214 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
215 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
216 ifelse(!metadataOnly, "&show-tokens=true", ""),
217 "&ql=", ql
218 )
219 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
220 requestUrl <- paste0(
221 kco@apiUrl,
222 "search",
223 request,
224 "&fields=",
225 paste(query_fields, collapse = ","),
226 if (metadataOnly) "&access-rewrite-disabled=true" else ""
227 )
228
229 # Show individual query progress
230 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
231 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
232 if (is.null(res)) {
233 log_info(verbose, ": API call failed\n")
234 totalResults <- 0
235 } else {
236 totalResults <- as.integer(res$meta$totalResults)
237 log_info(verbose, ": ", totalResults, " hits")
238 if (!is.null(res$meta$cached)) {
239 log_info(verbose, " [cached]")
240 } else if (!is.null(res$meta$benchmark)) {
241 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
242 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
243 formatted_time <- paste0(round(time_value, 2), "s")
244 log_info(verbose, ", took ", formatted_time)
245 } else {
246 log_info(verbose, ", took ", res$meta$benchmark)
247 }
248 }
249 log_info(verbose, "\n")
250 }
251
252 result <- data.frame(
253 query = query,
254 totalResults = totalResults,
255 vc = vc,
256 webUIRequestUrl = webUIRequestUrl,
257 stringsAsFactors = FALSE
258 )
259
260 # Calculate and display ETA information if verbose and we have more than one query
261 if (verbose && total_queries > 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200262 eta_info <- calculate_eta(current_query, total_queries, start_time)
263 if (eta_info != "") {
264 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200265 avg_time_per_query <- elapsed_time / current_query
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200266
267 # Create progress display
268 progress_display <- paste0(
269 "Query ",
270 sprintf(paste0("%", nchar(total_queries), "d"), current_query),
271 "/",
272 sprintf("%d", total_queries),
273 " completed. Avg: ",
274 sprintf("%.1f", avg_time_per_query),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200275 "s/query",
276 eta_info
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200277 )
278
279 log_info(verbose, progress_display, "\n")
280 }
281 }
282
283 return(result)
284 })
285
286 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200287 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200288 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100289 if (metadataOnly) {
290 fields <- fields[!fields %in% contentFields]
291 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100292 if (!"textSigle" %in% fields) {
293 fields <- c(fields, "textSigle")
294 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100295 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200296 paste0(
297 "?q=",
298 url_encode(enc2utf8(query)),
299 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
300 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
301 ifelse(!metadataOnly, "&show-tokens=true", ""),
302 "&ql=", ql
303 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100304 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
305 requestUrl <- paste0(
306 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200307 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100308 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200309 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100310 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200311 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100312 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200313 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
314 sep =
315 ""
316 )
317 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100318 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100319 message("API call failed.")
320 totalResults <- 0
321 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200322 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200323 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200324 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200325 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200326 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz7638ca42025-05-25 13:18:16 +0200327 # Round the benchmark time to 2 decimal places for better readability
328 # If it's a string ending with 's', extract the number, round it, and re-add 's'
329 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
330 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
331 formatted_time <- paste0(round(time_value, 2), "s")
332 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
333 } else {
334 # Fallback if the format is different than expected
335 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
336 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200337 } else {
338 log_info(verbose, "\n")
339 }
Marc Kupietza4675722022-02-23 23:55:15 +0100340 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200341 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100342 data.frame(
343 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100344 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100345 vc = vc,
346 webUIRequestUrl = webUIRequestUrl,
347 stringsAsFactors = FALSE
348 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200349 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100350 KorAPQuery(
351 korapConnection = kco,
352 nextStartIndex = 0,
353 fields = fields,
354 requestUrl = requestUrl,
355 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100356 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100357 vc = vc,
358 apiResponse = res,
359 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100360 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100361 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200362 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100363 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200364 }
365)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200366
Marc Kupietz05a60792024-12-07 16:23:31 +0100367#' @importFrom purrr map
368repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200369 if (is.list(x)) {
370 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100371 paste(.x, collapse = " ")
372 } else {
373 .x
374 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200375 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100376 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200377 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100378}
379
Marc Kupietz62da2b52019-09-12 17:43:34 +0200380#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200381#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200382#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200383#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200384#' @family corpus search functions
385#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200386#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200387#' @param offset start offset for query results to fetch
388#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200389#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200390#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
391#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200392#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100393#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200394#' \dontrun{
395#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200396#' q <- KorAPConnection() %>%
397#' corpusQuery("Ameisenplage") %>%
398#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100399#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100400#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100401#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200402#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200403#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200404#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200405#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200406#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100407#' @importFrom tibble enframe add_column
408#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200409#' @importFrom tidyr unnest unchop pivot_wider
410#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200411#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200412setMethod("fetchNext", "KorAPQuery", function(kqo,
413 offset = kqo@nextStartIndex,
414 maxFetch = maxResultsPerPage,
415 verbose = kqo@korapConnection@verbose,
416 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100417 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200418 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100419
Marc Kupietze95108e2019-09-18 13:23:58 +0200420 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
421 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200422 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200423 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200424 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200425 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200426
Marc Kupietz623d7122025-05-25 12:46:12 +0200427 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200428 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200429 # Calculate how many pages we need to fetch based on maxFetch
430 total_pages_to_fetch <- if (!is.na(maxFetch)) {
431 # Either limited by maxFetch or total results, whichever is smaller
432 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
433 } else {
434 # All pages
435 ceiling(kqo@totalResults / maxResultsPerPage)
436 }
437
438 # Generate randomized page indices (0-based for API)
439 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
440 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200441 }
442
Marc Kupietzd8851222025-05-01 10:57:19 +0200443 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200444 collectedMatches <- data.frame()
445 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200446
447 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
448 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
449 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
450 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
451
452 # For sequential fetches, keep track of which global page we're on
453 # This is important for correctly showing page numbers in subsequent fetchNext calls
454 page_count_start <- current_page_number
455
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200456 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200457 # Determine which page to fetch next
458 if (randomizePageOrder) {
459 # In randomized mode, get the page from our randomized list using the page_index
460 # Make sure we don't exceed the array bounds
461 if (page_index > length(pages)) {
462 break # No more pages to fetch in randomized mode
463 }
464 current_offset_page <- pages[page_index]
465 # For display purposes in randomized mode, show which page out of the total we're fetching
466 display_page_number <- page_index
467 } else {
468 # In sequential mode, use the current_page_number to calculate the offset
469 current_offset_page <- (current_page_number - 1)
470 display_page_number <- current_page_number
471 }
472
473 # Calculate the actual offset in tokens
474 currentOffset <- current_offset_page * maxResultsPerPage
475
Marc Kupietzef0e9392025-06-18 12:21:49 +0200476 # Build the query with the appropriate count and offset using httr2
477 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200478
Marc Kupietzef0e9392025-06-18 12:21:49 +0200479 # Parse existing URL to preserve all query parameters
480 parsed_url <- httr2::url_parse(kqo@requestUrl)
481 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200482
Marc Kupietzef0e9392025-06-18 12:21:49 +0200483 # Add/update count and offset parameters
484 existing_query$count <- count_param
485 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200486
Marc Kupietzef0e9392025-06-18 12:21:49 +0200487 # Rebuild the URL with all parameters
488 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200489 res <- apiCall(kqo@korapConnection, query)
490 if (length(res$matches) == 0) {
491 break
492 }
493
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200494 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100495 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100496 currentMatches <- res$matches$fields %>%
497 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
498 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200499 tidyr::unnest(cols = value) %>%
500 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200501 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200502 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200503 currentMatches$snippet <- res$matches$snippet
504 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100505 if ("tokens" %in% colnames(res$matches)) {
506 currentMatches$tokens <- res$matches$tokens
507 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200508 } else {
509 currentMatches <- res$matches
510 }
511
Marc Kupietze95108e2019-09-18 13:23:58 +0200512 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200513 if (!field %in% colnames(currentMatches)) {
514 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200515 }
516 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100517 currentMatches <- currentMatches %>%
518 select(kqo@fields) %>%
519 mutate(
Marc Kupietz0447da02025-01-08 20:51:09 +0100520 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100521 matchStart = as.integer(stringr::word(tmp_positions, 1)),
522 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
523 ) %>%
524 select(-tmp_positions)
525
Marc Kupietz62da2b52019-09-12 17:43:34 +0200526 if (!is.list(collectedMatches)) {
527 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200528 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200529 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200530 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200531
Marc Kupietz623d7122025-05-25 12:46:12 +0200532 # Get the actual items per page from the API response
533 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200534
Marc Kupietz623d7122025-05-25 12:46:12 +0200535 # Calculate total pages consistently using fixed maxResultsPerPage
536 # This ensures consistent page counting across the function
537 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
538
539 # Calculate the total pages based on what we've already fetched plus what we'll fetch
540 # This ensures the correct denominator is displayed for subsequent fetchNext calls
541
542 # Calculate the total number of pages for the entire result set
543 # This calculation is kept for reference and for showing in parentheses
Marc Kupietz669114b2025-05-02 22:02:20 +0200544
Marc Kupietzae9b6172025-05-02 15:50:01 +0200545 # Estimate remaining time
546 time_per_page <- NA
547 eta_str <- "N/A"
548 completion_time_str <- "N/A"
Marc Kupietzacbaab02025-05-01 10:56:35 +0200549
Marc Kupietzae9b6172025-05-02 15:50:01 +0200550 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
551 # benchmark looks like "0.123s"
552 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
553 if (!is.na(time_per_page)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200554 # Calculate remaining pages based on what we still need to fetch
555 if (!is.na(maxFetch)) {
556 # Use the same logic as page display calculation - account for offset
557 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
558 total_pages_this_batch <- min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
559 current_page_in_batch <- ceiling(nrow(collectedMatches) / maxResultsPerPage) + 1
560 remaining_pages <- max(0, total_pages_this_batch - current_page_in_batch)
Marc Kupietz623d7122025-05-25 12:46:12 +0200561 } else {
Marc Kupietz021663d2025-06-18 17:49:22 +0200562 # We need to fetch all results - calculate based on actual position
Marc Kupietz623d7122025-05-25 12:46:12 +0200563 if (randomizePageOrder) {
564 if (exists("pages") && length(pages) > 0) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200565 remaining_pages <- max(0, length(pages) - page_index)
Marc Kupietz623d7122025-05-25 12:46:12 +0200566 } else {
567 # Fallback to a reasonable default
568 remaining_pages <- 1
569 }
570 } else {
Marc Kupietz021663d2025-06-18 17:49:22 +0200571 # For sequential order, calculate remaining pages from current offset
572 current_absolute_page <- ceiling((currentOffset + maxResultsPerPage) / maxResultsPerPage)
573 remaining_pages <- max(0, total_pages - current_absolute_page)
Marc Kupietz623d7122025-05-25 12:46:12 +0200574 }
575 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200576
577 estimated_remaining_seconds <- remaining_pages * time_per_page
578 estimated_completion_time <- Sys.time() + estimated_remaining_seconds
579
Marc Kupietze8c8e1a2025-06-19 17:37:59 +0200580 # Format time nicely using centralized function from logging.R
Marc Kupietzae9b6172025-05-02 15:50:01 +0200581
582 eta_str <- format_duration(estimated_remaining_seconds)
583 completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
Marc Kupietzacbaab02025-05-01 10:56:35 +0200584 }
Marc Kupietzacbaab02025-05-01 10:56:35 +0200585 }
586
Marc Kupietz623d7122025-05-25 12:46:12 +0200587 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200588
Marc Kupietz623d7122025-05-25 12:46:12 +0200589 # For global page tracking, calculate the absolute page number
590 actual_display_number <- if (randomizePageOrder) {
591 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
592 } else {
593 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
594 current_offset_page + 1
595 }
596
597 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
598 # based on the current batch being fetched
599
600 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
601 # Simply count from 1 within the current batch
602
603 # The relative page number is simply the current position in this batch
604 if (randomizePageOrder) {
605 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
606 } else {
607 relative_page_number <- display_page_number - (page_count_start - 1)
608 }
609
610 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200611 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200612 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200613 # Account for offset - we can only fetch from the remaining results after offset
614 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
615 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200616 } else {
617 # Otherwise fetch all remaining pages
618 total_pages - page_count_start + 1
619 }
620
621 # The total pages to be shown in this batch
622 batch_total_pages <- pages_in_this_batch
623
624 page_display <- paste0(
625 "Retrieved page ",
626 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
627 "/",
628 sprintf("%d", batch_total_pages)
629 )
630
631 # If randomized, also show which actual page we fetched
632 if (randomizePageOrder) {
633 # Determine the maximum width needed for page numbers (based on total pages)
634 # This ensures consistent alignment
635 max_page_width <- nchar(as.character(total_pages))
636 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200637 page_display <- paste0(
638 page_display,
639 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
640 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200641 }
642 # Always show the absolute page number and total pages (for clarity)
643 else {
644 # Show the absolute page number (out of total possible pages)
645 page_display <- paste0(page_display, sprintf(
646 " (page %d of %d total)",
647 actual_display_number, total_pages
648 ))
649 }
650
651 # Add caching or timing information
652 if (!is.null(res$meta$cached)) {
653 page_display <- paste0(page_display, " [cached]")
654 } else {
655 page_display <- paste0(
656 page_display,
657 " in ",
658 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
659 "s. ETA: ",
660 # Display ETA for both randomized and sequential modes
661 eta_str,
662 # Show completion time for both modes
663 paste0(" (", completion_time_str, ")")
664 )
665 }
666
667 log_info(verbose, paste0(page_display, "\n"))
668
669 # Increment the appropriate counter based on mode
670 if (randomizePageOrder) {
671 page_index <- page_index + 1
672 } else {
673 current_page_number <- current_page_number + 1
674 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200675 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200676 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200677 break
678 }
679 }
Marc Kupietz68170952021-06-30 09:37:21 +0200680 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200681 KorAPQuery(
682 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200683 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200684 fields = kqo@fields,
685 requestUrl = kqo@requestUrl,
686 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200687 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200688 vc = kqo@vc,
689 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200690 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200691 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200692 collectedMatches = collectedMatches
693 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200694})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200695
696#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200697#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200698#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100699#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200700#' @family corpus search functions
701#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200702#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200703#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200704#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
705#' q <- KorAPConnection() |>
706#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200707#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200708#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200709#'
710#' # Fetch also all KWICs
711#' q <- KorAPConnection() |> auth() |>
712#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
713#' fetchAll()
714#' q@collectedMatches
715#'
716#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
717#' q <- KorAPConnection() |>
718#' corpusQuery("<base/s=t>", # this matches each text once
719#' vc = "pubDate in 1958-03-12",
720#' fields = c("textSigle", "title"),
721#' ) |>
722#' fetchAll()
723#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100724#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200725#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200726#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200727#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200728setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
729 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200730})
731
732#' Fetches the remaining results of a KorAP query.
733#'
734#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200735#' \dontrun{
736#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200737#' q <- KorAPConnection() %>%
738#' corpusQuery("Ameisenplage") %>%
739#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200740#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100741#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200742#'
743#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200744#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200745setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
746 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200747})
748
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200749#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +0200750#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200751#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200752#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200753#' confidence intervals of one ore multiple search terms across one or multiple
754#' virtual corpora.
755#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200756#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +0200757#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +0200758#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200759#' \dontrun{
760#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200761#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +0200762#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100763#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200764#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200765# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +0100766#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200767#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
768#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +0200769#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
770#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200771#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +0200772#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200773#'
774#' @return A tibble, with each row containing the following result columns for query and vc combinations:
775#' - **query**: the query string used for the frequency analysis.
776#' - **totalResults**: absolute frequency of query matches in the vc.
777#' - **vc**: virtual corpus used for the query.
778#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
779#' - **total**: total number of words in vc.
780#' - **f**: relative frequency of query matches in the vc.
781#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
782#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
783
Marc Kupietzd8851222025-05-01 10:57:19 +0200784setMethod(
785 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100786 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200787 (if (as.alternatives) {
788 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100789 group_by(vc) %>%
790 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +0200791 } else {
792 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
793 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
794 }) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200795 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +0200796 }
797)
Marc Kupietz3f575282019-10-04 14:46:04 +0200798
Marc Kupietz38a9d682024-12-06 16:17:09 +0100799#' buildWebUIRequestUrlFromString
800#'
801#' @rdname KorAPQuery-class
802#' @importFrom urltools url_encode
803#' @export
804buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200805 query,
806 vc = "",
807 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100808 if ("KorAPConnection" %in% class(KorAPUrl)) {
809 KorAPUrl <- KorAPUrl@KorAPUrl
810 }
811
812 request <-
813 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +0200814 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100815 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +0200816 ifelse(vc != "",
817 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
818 ""
819 ),
820 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100821 ql
822 )
823 paste0(KorAPUrl, request)
824}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200825
826#' buildWebUIRequestUrl
827#'
828#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +0100829#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200830#' @export
831buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +0200832 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200833 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +0200834 } else {
835 httr2::url_parse(KorAPUrl)$query$q
836 },
Marc Kupietzf9129592025-01-26 19:17:54 +0100837 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200838 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +0100839 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100840 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200841}
842
Marc Kupietzd8851222025-05-01 10:57:19 +0200843#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +0200844#' @rdname KorAPQuery-class
845#' @param x KorAPQuery object
846#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100847#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +0200848#' @export
849format.KorAPQuery <- function(x, ...) {
850 cat("<KorAPQuery>\n")
851 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +0200852 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100853 cat(" Query: ", param$q, "\n")
854 if (!is.null(param$cq) && param$cq != "") {
855 cat(" Virtual corpus: ", param$cq, "\n")
856 }
857 if (!is.null(q@collectedMatches)) {
858 cat("==============================================================================================================", "\n")
859 print(summary(q@collectedMatches))
860 cat("==============================================================================================================", "\n")
861 }
862 cat(" Total results: ", q@totalResults, "\n")
863 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200864}
865
Marc Kupietze95108e2019-09-18 13:23:58 +0200866#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200867#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200868#' @rdname KorAPQuery-class
869#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200870#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200871setMethod("show", "KorAPQuery", function(object) {
872 format(object)
873})