blob: df12f1f880de68df1b2e974a3d962b027272f891 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' This class provides methods to perform different kinds of queries on the KorAP API server.
Marc Kupietz67edcb52021-09-20 21:54:24 +02004#' `KorAPQuery` objects, which are typically created by the [corpusQuery()] method,
Marc Kupietza6e4ee62021-03-05 09:00:15 +01005#' represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02006#'
7#' @include KorAPConnection.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietze95108e2019-09-18 13:23:58 +020027#' Method initialize
28#'
29#' @rdname KorAPQuery-class
30#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020042#'
43#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020044#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020045setMethod(
46 "initialize", "KorAPQuery",
47 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
48 "corpusSigle", "textSigle", "pubDate", "pubPlace",
49 "availability", "textClass", "snippet", "tokens"
50 ),
51 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
52 .Object <- callNextMethod()
53 .Object@korapConnection <- korapConnection
54 .Object@request <- request
55 .Object@vc <- vc
56 .Object@totalResults <- totalResults
57 .Object@nextStartIndex <- nextStartIndex
58 .Object@fields <- fields
59 .Object@requestUrl <- requestUrl
60 .Object@webUIRequestUrl <- webUIRequestUrl
61 .Object@apiResponse <- apiResponse
62 .Object@hasMoreMatches <- hasMoreMatches
63 .Object@collectedMatches <- collectedMatches
64 .Object
65 }
66)
Marc Kupietz632cbd42019-09-06 16:04:51 +020067
Marc Kupietzd8851222025-05-01 10:57:19 +020068setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
69setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
70setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
71setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
72setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020073
74maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020075
Marc Kupietz4de53ec2019-10-04 09:12:00 +020076## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010077utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020078
Marc Kupietzdbd431a2021-08-29 12:17:45 +020079#' Corpus query
80#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020081#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020082#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' @rdname KorAPQuery-class
84#' @aliases corpusQuery
85#'
86#' @importFrom urltools url_encode
87#' @importFrom purrr pmap
88#' @importFrom dplyr bind_rows
89#'
Marc Kupietz617266d2025-02-27 10:43:07 +010090#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020091#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020092#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020093#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020094#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
95#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
96#' your RKorAPClient application as explained in the
97#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
98#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
99#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200100#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +0200101#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100102#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200103#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200104#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200105#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200106#' @param context string that specifies the size of the left and the right context returned in `snippet`
107#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
108#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
109#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
110#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
111#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200112#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200113#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
114#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200115#'
116#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200117#' \dontrun{
118#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200119#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietzd8851222025-05-01 10:57:19 +0200120#' KorAPConnection() %>%
121#' corpusQuery("Ameisenplage") %>%
122#' fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100123#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200124#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200125#' \dontrun{
126#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200127#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
128#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200129#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100130#' KorAPConnection(verbose = TRUE) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200131#' corpusQuery(
132#' KorAPUrl =
133#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
134#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200135#' }
136#'
137#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200138#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200139#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd8851222025-05-01 10:57:19 +0200140#' KorAPConnection(verbose = TRUE) %>%
141#' {
142#' . ->> kco
143#' } %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200144#' corpusQuery("Ameisenplage") %>%
145#' fetchAll() %>%
146#' slot("collectedMatches") %>%
147#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200148#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200149#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200150#' summarise(Count = dplyr::n()) %>%
Marc Kupietzd8851222025-05-01 10:57:19 +0200151#' mutate(Freq = mapply(function(f, y) {
152#' f / corpusStats(kco, paste("pubDate in", y))@tokens
153#' }, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200154#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200155#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
156#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100157#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200158#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200159#'
160#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200161#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200162#'
163#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200164setMethod(
165 "corpusQuery", "KorAPConnection",
166 function(kco,
167 query = if (missing(KorAPUrl)) {
168 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
169 } else {
170 httr2::url_parse(KorAPUrl)$query$q
171 },
172 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
173 KorAPUrl,
174 metadataOnly = TRUE,
175 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
176 fields = c(
177 "corpusSigle",
178 "textSigle",
179 "pubDate",
180 "pubPlace",
181 "availability",
182 "textClass",
183 "snippet",
184 "tokens"
185 ),
186 accessRewriteFatal = TRUE,
187 verbose = kco@verbose,
188 expand = length(vc) != length(query),
189 as.df = FALSE,
190 context = NULL) {
191 if (length(query) > 1 || length(vc) > 1) {
192 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
193 purrr::pmap(grid, function(query, vc, ...) {
194 corpusQuery(kco, query = query, vc = vc, ql = ql, verbose = verbose, as.df = TRUE)
195 }) %>%
196 bind_rows()
197 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200198 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100199 if (metadataOnly) {
200 fields <- fields[!fields %in% contentFields]
201 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100202 if (!"textSigle" %in% fields) {
203 fields <- c(fields, "textSigle")
204 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100205 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200206 paste0(
207 "?q=",
208 url_encode(enc2utf8(query)),
209 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
210 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
211 ifelse(!metadataOnly, "&show-tokens=true", ""),
212 "&ql=", ql
213 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100214 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
215 requestUrl <- paste0(
216 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200217 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100218 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200219 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100220 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200221 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100222 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200223 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
224 sep =
225 ""
226 )
227 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100228 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100229 message("API call failed.")
230 totalResults <- 0
231 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200232 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200233 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200234 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200235 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200236 } else if (!is.null(res$meta$benchmark)) {
237 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
238 } else {
239 log_info(verbose, "\n")
240 }
Marc Kupietza4675722022-02-23 23:55:15 +0100241 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200242 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100243 data.frame(
244 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100245 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100246 vc = vc,
247 webUIRequestUrl = webUIRequestUrl,
248 stringsAsFactors = FALSE
249 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200250 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100251 KorAPQuery(
252 korapConnection = kco,
253 nextStartIndex = 0,
254 fields = fields,
255 requestUrl = requestUrl,
256 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100257 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100258 vc = vc,
259 apiResponse = res,
260 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100261 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100262 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200263 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100264 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200265 }
266)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200267
Marc Kupietz05a60792024-12-07 16:23:31 +0100268#' @importFrom purrr map
269repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200270 if (is.list(x)) {
271 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100272 paste(.x, collapse = " ")
273 } else {
274 .x
275 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200276 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100277 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200278 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100279}
280
Marc Kupietz62da2b52019-09-12 17:43:34 +0200281#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200282#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200283#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200284#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200285#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200286#' @param offset start offset for query results to fetch
287#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200288#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200289#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
290#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200291#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100292#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200293#' \dontrun{
294#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200295#' q <- KorAPConnection() %>%
296#' corpusQuery("Ameisenplage") %>%
297#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100298#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100299#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100300#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200301#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200302#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200303#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200304#' @aliases fetchNext
305#' @rdname KorAPQuery-class
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200306#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100307#' @importFrom tibble enframe add_column
308#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200309#' @importFrom tidyr unnest unchop pivot_wider
310#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200311#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200312setMethod("fetchNext", "KorAPQuery", function(kqo,
313 offset = kqo@nextStartIndex,
314 maxFetch = maxResultsPerPage,
315 verbose = kqo@korapConnection@verbose,
316 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100317 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200318 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100319
Marc Kupietze95108e2019-09-18 13:23:58 +0200320 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
321 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200322 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200323 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz705488d2021-06-30 18:26:36 +0200324 page <- kqo@nextStartIndex / maxResultsPerPage + 1
Marc Kupietze95108e2019-09-18 13:23:58 +0200325 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200326
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200327 if (randomizePageOrder) {
328 pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
329 }
330
Marc Kupietzd8851222025-05-01 10:57:19 +0200331 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200332 collectedMatches <- data.frame()
333 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200334 repeat {
Marc Kupietzd8851222025-05-01 10:57:19 +0200335 page <- nrow(collectedMatches) %/% maxResultsPerPage + 1
336 currentOffset <- ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
337 query <- paste0(kqo@requestUrl, "&count=", min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage), "&offset=", currentOffset, "&cutoff=true")
Marc Kupietz68170952021-06-30 09:37:21 +0200338 res <- apiCall(kqo@korapConnection, query)
339 if (length(res$matches) == 0) {
340 break
341 }
342
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200343 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100344 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100345 currentMatches <- res$matches$fields %>%
346 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
347 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200348 tidyr::unnest(cols = value) %>%
349 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200350 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200351 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200352 currentMatches$snippet <- res$matches$snippet
353 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100354 if ("tokens" %in% colnames(res$matches)) {
355 currentMatches$tokens <- res$matches$tokens
356 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200357 } else {
358 currentMatches <- res$matches
359 }
360
Marc Kupietze95108e2019-09-18 13:23:58 +0200361 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200362 if (!field %in% colnames(currentMatches)) {
363 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200364 }
365 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100366 currentMatches <- currentMatches %>%
367 select(kqo@fields) %>%
368 mutate(
Marc Kupietz0447da02025-01-08 20:51:09 +0100369 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100370 matchStart = as.integer(stringr::word(tmp_positions, 1)),
371 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
372 ) %>%
373 select(-tmp_positions)
374
Marc Kupietz62da2b52019-09-12 17:43:34 +0200375 if (!is.list(collectedMatches)) {
376 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200377 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200378 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200379 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200380
381 # Calculate page numbers regardless of ETA calculation
Marc Kupietzacbaab02025-05-01 10:56:35 +0200382 items_per_page <- res$meta$itemsPerPage
383 total_pages <- ceiling(kqo@totalResults / items_per_page)
Marc Kupietzacbaab02025-05-01 10:56:35 +0200384 current_page_number <- ceiling(nrow(collectedMatches) / items_per_page)
Marc Kupietzacbaab02025-05-01 10:56:35 +0200385
Marc Kupietz669114b2025-05-02 22:02:20 +0200386 # Determine the actual total pages to display, considering maxFetch
387 actual_total_pages <- if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
388 ceiling(maxFetch / items_per_page)
389 } else {
390 total_pages
391 }
392
Marc Kupietzae9b6172025-05-02 15:50:01 +0200393 # Estimate remaining time
394 time_per_page <- NA
395 eta_str <- "N/A"
396 completion_time_str <- "N/A"
Marc Kupietzacbaab02025-05-01 10:56:35 +0200397
Marc Kupietzae9b6172025-05-02 15:50:01 +0200398 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
399 # benchmark looks like "0.123s"
400 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
401 if (!is.na(time_per_page)) {
402 remaining_pages <- total_pages - current_page_number
403
404 estimated_remaining_seconds <- remaining_pages * time_per_page
405 estimated_completion_time <- Sys.time() + estimated_remaining_seconds
406
407 # Format time nicely
408 format_duration <- function(seconds) {
409 if (is.na(seconds) || seconds < 0) {
410 return("N/A")
411 }
412 days <- floor(seconds / (24 * 3600))
413 seconds <- seconds %% (24 * 3600)
414 hours <- floor(seconds / 3600)
415 seconds <- seconds %% 3600
416 minutes <- floor(seconds / 60)
417 seconds <- floor(seconds %% 60)
418 paste0(
419 if (days > 0) paste0(days, "d ") else "",
420 if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
421 if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
422 paste0(sprintf("%02d", seconds), "s")
423 )
424 }
425
426 eta_str <- format_duration(estimated_remaining_seconds)
427 completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
Marc Kupietzacbaab02025-05-01 10:56:35 +0200428 }
Marc Kupietzacbaab02025-05-01 10:56:35 +0200429 }
430
Marc Kupietzacbaab02025-05-01 10:56:35 +0200431 log_info(verbose, paste0(
432 "Retrieved page ",
Marc Kupietz669114b2025-05-02 22:02:20 +0200433 sprintf(paste0("%", nchar(actual_total_pages), "d"), current_page_number),
Marc Kupietzacbaab02025-05-01 10:56:35 +0200434 "/",
435 if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
Marc Kupietz669114b2025-05-02 22:02:20 +0200436 sprintf("%d (%d)", actual_total_pages, total_pages)
Marc Kupietzacbaab02025-05-01 10:56:35 +0200437 } else {
Marc Kupietz669114b2025-05-02 22:02:20 +0200438 sprintf("%d", actual_total_pages)
Marc Kupietzacbaab02025-05-01 10:56:35 +0200439 },
440 if (!is.null(res$meta$cached)) {
441 " [cached]\n"
442 } else {
443 paste0(
444 " in ",
Marc Kupietz669114b2025-05-02 22:02:20 +0200445 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietzae9b6172025-05-02 15:50:01 +0200446 "s. ETA: ", eta_str, " (", completion_time_str, ")",
Marc Kupietzacbaab02025-05-01 10:56:35 +0200447 "\n"
448 )
449 }
450 ))
451
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200452 page <- page + 1
453 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200454 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200455 break
456 }
457 }
Marc Kupietz68170952021-06-30 09:37:21 +0200458 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200459 KorAPQuery(
460 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200461 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200462 fields = kqo@fields,
463 requestUrl = kqo@requestUrl,
464 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200465 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200466 vc = kqo@vc,
467 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200468 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200469 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200470 collectedMatches = collectedMatches
471 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200472})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200473
474#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200475#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200476#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100477#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200478#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200479#' \dontrun{
480#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200481#' q <- KorAPConnection() %>%
482#' corpusQuery("Ameisenplage") %>%
483#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200484#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100485#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200486#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200487#' @aliases fetchAll
488#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200489#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200490setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
491 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200492})
493
494#' Fetches the remaining results of a KorAP query.
495#'
496#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200497#' \dontrun{
498#'
Marc Kupietzd8851222025-05-01 10:57:19 +0200499#' q <- KorAPConnection() %>%
500#' corpusQuery("Ameisenplage") %>%
501#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200502#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100503#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200504#'
505#' @aliases fetchRest
506#' @rdname KorAPQuery-class
507#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200508setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
509 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200510})
511
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200512#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +0200513#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200514#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200515#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200516#' confidence intervals of one ore multiple search terms across one or multiple
517#' virtual corpora.
518#'
519#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +0200520#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200521#' \dontrun{
522#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200523#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +0200524#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100525#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200526#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200527# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +0100528#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200529#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
530#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +0200531#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
532#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200533#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +0200534#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200535#'
536#' @return A tibble, with each row containing the following result columns for query and vc combinations:
537#' - **query**: the query string used for the frequency analysis.
538#' - **totalResults**: absolute frequency of query matches in the vc.
539#' - **vc**: virtual corpus used for the query.
540#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
541#' - **total**: total number of words in vc.
542#' - **f**: relative frequency of query matches in the vc.
543#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
544#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
545
Marc Kupietzd8851222025-05-01 10:57:19 +0200546setMethod(
547 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100548 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200549 (if (as.alternatives) {
550 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100551 group_by(vc) %>%
552 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +0200553 } else {
554 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
555 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
556 }) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200557 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +0200558 }
559)
Marc Kupietz3f575282019-10-04 14:46:04 +0200560
Marc Kupietz38a9d682024-12-06 16:17:09 +0100561#' buildWebUIRequestUrlFromString
562#'
563#' @rdname KorAPQuery-class
564#' @importFrom urltools url_encode
565#' @export
566buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200567 query,
568 vc = "",
569 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100570 if ("KorAPConnection" %in% class(KorAPUrl)) {
571 KorAPUrl <- KorAPUrl@KorAPUrl
572 }
573
574 request <-
575 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +0200576 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100577 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +0200578 ifelse(vc != "",
579 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
580 ""
581 ),
582 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100583 ql
584 )
585 paste0(KorAPUrl, request)
586}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200587
588#' buildWebUIRequestUrl
589#'
590#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +0100591#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200592#' @export
593buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +0200594 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200595 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +0200596 } else {
597 httr2::url_parse(KorAPUrl)$query$q
598 },
Marc Kupietzf9129592025-01-26 19:17:54 +0100599 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200600 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +0100601 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100602 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200603}
604
Marc Kupietzd8851222025-05-01 10:57:19 +0200605#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +0200606#' @rdname KorAPQuery-class
607#' @param x KorAPQuery object
608#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100609#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +0200610#' @export
611format.KorAPQuery <- function(x, ...) {
612 cat("<KorAPQuery>\n")
613 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +0200614 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100615 cat(" Query: ", param$q, "\n")
616 if (!is.null(param$cq) && param$cq != "") {
617 cat(" Virtual corpus: ", param$cq, "\n")
618 }
619 if (!is.null(q@collectedMatches)) {
620 cat("==============================================================================================================", "\n")
621 print(summary(q@collectedMatches))
622 cat("==============================================================================================================", "\n")
623 }
624 cat(" Total results: ", q@totalResults, "\n")
625 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200626}
627
Marc Kupietze95108e2019-09-18 13:23:58 +0200628#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200629#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200630#' @rdname KorAPQuery-class
631#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200632#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200633setMethod("show", "KorAPQuery", function(object) {
634 format(object)
635})