blob: e00e19e56e234bf88c5c6cf55ec3b2323c0be0b6 [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
Marc Kupietze52b2952025-07-17 16:53:02 +020024 "hasMoreMatches",
25 "annotations"
Marc Kupietze95108e2019-09-18 13:23:58 +020026))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020027
Marc Kupietza8c40f42025-06-24 15:49:52 +020028#' Initialize KorAPQuery object
29#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020030#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietze52b2952025-07-17 16:53:02 +020042#' @param annotations list of annotation data for collected matches
Marc Kupietz97a1bca2019-10-04 22:52:09 +020043#'
44#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020045#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020046setMethod(
47 "initialize", "KorAPQuery",
48 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
49 "corpusSigle", "textSigle", "pubDate", "pubPlace",
50 "availability", "textClass", "snippet", "tokens"
51 ),
Marc Kupietze52b2952025-07-17 16:53:02 +020052 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL, annotations = NULL) {
Marc Kupietzd8851222025-05-01 10:57:19 +020053 .Object <- callNextMethod()
54 .Object@korapConnection <- korapConnection
55 .Object@request <- request
56 .Object@vc <- vc
57 .Object@totalResults <- totalResults
58 .Object@nextStartIndex <- nextStartIndex
59 .Object@fields <- fields
60 .Object@requestUrl <- requestUrl
61 .Object@webUIRequestUrl <- webUIRequestUrl
62 .Object@apiResponse <- apiResponse
63 .Object@hasMoreMatches <- hasMoreMatches
64 .Object@collectedMatches <- collectedMatches
Marc Kupietze52b2952025-07-17 16:53:02 +020065 .Object@annotations <- annotations
Marc Kupietzd8851222025-05-01 10:57:19 +020066 .Object
67 }
68)
Marc Kupietz632cbd42019-09-06 16:04:51 +020069
Marc Kupietzd8851222025-05-01 10:57:19 +020070setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
71setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
72setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
73setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietze52b2952025-07-17 16:53:02 +020074setGeneric("fetchAnnotations", function(kqo, ...) standardGeneric("fetchAnnotations"))
Marc Kupietzd8851222025-05-01 10:57:19 +020075setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020076
77maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020078
Marc Kupietz4de53ec2019-10-04 09:12:00 +020079## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010080utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020081
Marc Kupietza8c40f42025-06-24 15:49:52 +020082#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020084#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020085#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020086#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020087#' @aliases corpusQuery
88#'
89#' @importFrom urltools url_encode
90#' @importFrom purrr pmap
Marc Kupietzea34b812025-06-25 15:49:00 +020091#' @importFrom dplyr bind_rows group_by
Marc Kupietzdbd431a2021-08-29 12:17:45 +020092#'
Marc Kupietz617266d2025-02-27 10:43:07 +010093#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020094#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020095#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020096#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020097#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
98#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
99#' your RKorAPClient application as explained in the
100#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
101#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
102#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200103#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200104#' @param fields character vector specifying which metadata fields to retrieve for each match.
105#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
106#' \describe{
107#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
108#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
109#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
110#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
111#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
112#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
113#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
114#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
115#' }
116#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
117#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100118#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200119#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200120#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200121#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200122#' @param context string that specifies the size of the left and the right context returned in `snippet`
123#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
124#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
125#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
126#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
127#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200128#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200129#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
130#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200131#'
132#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200133#' \dontrun{
134#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200135#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200136#' KorAPConnection() |>
137#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200138#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200139#'
140#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200141#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200142#' corpusQuery("Ameisenplage",
143#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
144#' results <- fetchAll(query)
145#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100146#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200147#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200148#' \dontrun{
149#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200150#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
151#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200152#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200153#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200154#' corpusQuery(
155#' KorAPUrl =
156#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
157#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200158#' }
159#'
160#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200161#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200162#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200163#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200164#' {
165#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200166#' } |>
167#' corpusQuery("Ameisenplage") |>
168#' fetchAll() |>
169#' slot("collectedMatches") |>
170#' mutate(year = lubridate::year(pubDate)) |>
171#' dplyr::select(year) |>
172#' group_by(year) |>
173#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200174#' mutate(Freq = mapply(function(f, y) {
175#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200176#' }, Count, year)) |>
177#' dplyr::select(-Count) |>
178#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200179#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100180#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200181#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200182#'
183#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200184#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200185#'
186#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200187setMethod(
188 "corpusQuery", "KorAPConnection",
189 function(kco,
190 query = if (missing(KorAPUrl)) {
191 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
192 } else {
193 httr2::url_parse(KorAPUrl)$query$q
194 },
195 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
196 KorAPUrl,
197 metadataOnly = TRUE,
198 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
199 fields = c(
200 "corpusSigle",
201 "textSigle",
202 "pubDate",
203 "pubPlace",
204 "availability",
205 "textClass",
206 "snippet",
207 "tokens"
208 ),
209 accessRewriteFatal = TRUE,
210 verbose = kco@verbose,
211 expand = length(vc) != length(query),
212 as.df = FALSE,
213 context = NULL) {
214 if (length(query) > 1 || length(vc) > 1) {
215 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200216
217 # Initialize timing variables for ETA calculation
218 total_queries <- nrow(grid)
219 current_query <- 0
220 start_time <- Sys.time()
221
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200222 results <- purrr::pmap(grid, function(query, vc, ...) {
223 current_query <<- current_query + 1
224
225 # Execute the single query directly (avoiding recursive call)
226 contentFields <- c("snippet", "tokens")
227 query_fields <- fields
228 if (metadataOnly) {
229 query_fields <- query_fields[!query_fields %in% contentFields]
230 }
231 if (!"textSigle" %in% query_fields) {
232 query_fields <- c(query_fields, "textSigle")
233 }
234 request <-
235 paste0(
236 "?q=",
237 url_encode(enc2utf8(query)),
238 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
239 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
240 ifelse(!metadataOnly, "&show-tokens=true", ""),
241 "&ql=", ql
242 )
243 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
244 requestUrl <- paste0(
245 kco@apiUrl,
246 "search",
247 request,
248 "&fields=",
249 paste(query_fields, collapse = ","),
250 if (metadataOnly) "&access-rewrite-disabled=true" else ""
251 )
252
253 # Show individual query progress
254 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
255 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
256 if (is.null(res)) {
257 log_info(verbose, ": API call failed\n")
258 totalResults <- 0
259 } else {
260 totalResults <- as.integer(res$meta$totalResults)
261 log_info(verbose, ": ", totalResults, " hits")
262 if (!is.null(res$meta$cached)) {
263 log_info(verbose, " [cached]")
264 } else if (!is.null(res$meta$benchmark)) {
265 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
266 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
267 formatted_time <- paste0(round(time_value, 2), "s")
268 log_info(verbose, ", took ", formatted_time)
269 } else {
270 log_info(verbose, ", took ", res$meta$benchmark)
271 }
272 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200273
274 # Calculate and display ETA information on the same line if verbose and we have more than one query
275 if (verbose && total_queries > 1) {
276 eta_info <- calculate_eta(current_query, total_queries, start_time)
277 if (eta_info != "") {
278 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
279 avg_time_per_query <- elapsed_time / current_query
280
281 # Add ETA info to the same line - remove the leading ". " for cleaner formatting
282 clean_eta_info <- sub("^\\. ", ". ", eta_info)
283 log_info(verbose, clean_eta_info)
284 }
285 }
286
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200287 log_info(verbose, "\n")
288 }
289
290 result <- data.frame(
291 query = query,
292 totalResults = totalResults,
293 vc = vc,
294 webUIRequestUrl = webUIRequestUrl,
295 stringsAsFactors = FALSE
296 )
297
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200298 return(result)
299 })
300
301 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200302 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200303 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100304 if (metadataOnly) {
305 fields <- fields[!fields %in% contentFields]
306 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100307 if (!"textSigle" %in% fields) {
308 fields <- c(fields, "textSigle")
309 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100310 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200311 paste0(
312 "?q=",
313 url_encode(enc2utf8(query)),
314 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
315 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
316 ifelse(!metadataOnly, "&show-tokens=true", ""),
317 "&ql=", ql
318 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100319 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
320 requestUrl <- paste0(
321 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200322 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100323 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200324 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100325 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200326 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100327 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200328 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
329 sep =
330 ""
331 )
332 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100333 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100334 message("API call failed.")
335 totalResults <- 0
336 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200337 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200338 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200339 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200340 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200341 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz7638ca42025-05-25 13:18:16 +0200342 # Round the benchmark time to 2 decimal places for better readability
343 # If it's a string ending with 's', extract the number, round it, and re-add 's'
344 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
345 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
346 formatted_time <- paste0(round(time_value, 2), "s")
347 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
348 } else {
349 # Fallback if the format is different than expected
350 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
351 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200352 } else {
353 log_info(verbose, "\n")
354 }
Marc Kupietza4675722022-02-23 23:55:15 +0100355 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200356 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100357 data.frame(
358 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100359 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100360 vc = vc,
361 webUIRequestUrl = webUIRequestUrl,
362 stringsAsFactors = FALSE
363 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200364 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100365 KorAPQuery(
366 korapConnection = kco,
367 nextStartIndex = 0,
368 fields = fields,
369 requestUrl = requestUrl,
370 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100371 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100372 vc = vc,
373 apiResponse = res,
374 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100375 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100376 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200377 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100378 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200379 }
380)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200381
Marc Kupietz05a60792024-12-07 16:23:31 +0100382#' @importFrom purrr map
383repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200384 if (is.list(x)) {
385 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100386 paste(.x, collapse = " ")
387 } else {
388 .x
389 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200390 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100391 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200392 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100393}
394
Marc Kupietz62da2b52019-09-12 17:43:34 +0200395#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200396#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200397#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200398#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200399#' @family corpus search functions
400#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200401#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200402#' @param offset start offset for query results to fetch
403#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200404#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200405#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
406#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200407#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100408#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200409#' \dontrun{
410#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200411#' q <- KorAPConnection() |>
412#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200413#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100414#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100415#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100416#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200417#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200418#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200419#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200420#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200421#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100422#' @importFrom tibble enframe add_column
423#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200424#' @importFrom tidyr unnest unchop pivot_wider
425#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200426#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200427setMethod("fetchNext", "KorAPQuery", function(kqo,
428 offset = kqo@nextStartIndex,
429 maxFetch = maxResultsPerPage,
430 verbose = kqo@korapConnection@verbose,
431 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100432 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200433 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100434
Marc Kupietze95108e2019-09-18 13:23:58 +0200435 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
436 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200437 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200438 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200439 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200440 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200441
Marc Kupietz24799fd2025-06-25 14:15:36 +0200442 # Track start time for ETA calculation
443 start_time <- Sys.time()
444
Marc Kupietz623d7122025-05-25 12:46:12 +0200445 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200446 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200447 # Calculate how many pages we need to fetch based on maxFetch
448 total_pages_to_fetch <- if (!is.na(maxFetch)) {
449 # Either limited by maxFetch or total results, whichever is smaller
450 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
451 } else {
452 # All pages
453 ceiling(kqo@totalResults / maxResultsPerPage)
454 }
455
456 # Generate randomized page indices (0-based for API)
457 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
458 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200459 }
460
Marc Kupietzd8851222025-05-01 10:57:19 +0200461 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200462 collectedMatches <- data.frame()
463 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200464
465 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
466 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
467 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
468 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
469
470 # For sequential fetches, keep track of which global page we're on
471 # This is important for correctly showing page numbers in subsequent fetchNext calls
472 page_count_start <- current_page_number
473
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200474 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200475 # Determine which page to fetch next
476 if (randomizePageOrder) {
477 # In randomized mode, get the page from our randomized list using the page_index
478 # Make sure we don't exceed the array bounds
479 if (page_index > length(pages)) {
480 break # No more pages to fetch in randomized mode
481 }
482 current_offset_page <- pages[page_index]
483 # For display purposes in randomized mode, show which page out of the total we're fetching
484 display_page_number <- page_index
485 } else {
486 # In sequential mode, use the current_page_number to calculate the offset
487 current_offset_page <- (current_page_number - 1)
488 display_page_number <- current_page_number
489 }
490
491 # Calculate the actual offset in tokens
492 currentOffset <- current_offset_page * maxResultsPerPage
493
Marc Kupietzef0e9392025-06-18 12:21:49 +0200494 # Build the query with the appropriate count and offset using httr2
495 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200496
Marc Kupietzef0e9392025-06-18 12:21:49 +0200497 # Parse existing URL to preserve all query parameters
498 parsed_url <- httr2::url_parse(kqo@requestUrl)
499 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200500
Marc Kupietzef0e9392025-06-18 12:21:49 +0200501 # Add/update count and offset parameters
502 existing_query$count <- count_param
503 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200504
Marc Kupietzef0e9392025-06-18 12:21:49 +0200505 # Rebuild the URL with all parameters
506 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200507 res <- apiCall(kqo@korapConnection, query)
508 if (length(res$matches) == 0) {
509 break
510 }
511
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200512 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100513 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100514 currentMatches <- res$matches$fields %>%
515 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
516 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200517 tidyr::unnest(cols = value) %>%
518 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200519 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200520 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200521 currentMatches$snippet <- res$matches$snippet
522 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100523 if ("tokens" %in% colnames(res$matches)) {
524 currentMatches$tokens <- res$matches$tokens
525 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200526 } else {
527 currentMatches <- res$matches
528 }
529
Marc Kupietze95108e2019-09-18 13:23:58 +0200530 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200531 if (!field %in% colnames(currentMatches)) {
532 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200533 }
534 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100535 currentMatches <- currentMatches %>%
536 select(kqo@fields) %>%
537 mutate(
Marc Kupietzff712a92025-07-18 09:07:23 +0200538 matchID = res$matches$matchID,
Marc Kupietz0447da02025-01-08 20:51:09 +0100539 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100540 matchStart = as.integer(stringr::word(tmp_positions, 1)),
541 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
542 ) %>%
543 select(-tmp_positions)
544
Marc Kupietz62da2b52019-09-12 17:43:34 +0200545 if (!is.list(collectedMatches)) {
546 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200547 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200548 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200549 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200550
Marc Kupietz623d7122025-05-25 12:46:12 +0200551 # Get the actual items per page from the API response
552 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200553
Marc Kupietz623d7122025-05-25 12:46:12 +0200554 # Calculate total pages consistently using fixed maxResultsPerPage
555 # This ensures consistent page counting across the function
556 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
557
Marc Kupietz24799fd2025-06-25 14:15:36 +0200558 # Calculate ETA using the centralized function from logging.R
559 current_page <- if (randomizePageOrder) page_index else display_page_number
560 total_pages_to_fetch <- if (!is.na(maxFetch)) {
561 # Account for offset - we can only fetch from the remaining results after offset
562 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
563 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
564 } else {
565 total_pages
566 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200567
Marc Kupietz24799fd2025-06-25 14:15:36 +0200568 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz365660e2025-06-25 15:09:55 +0200569
Marc Kupietz24799fd2025-06-25 14:15:36 +0200570 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200571 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200572 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200573 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200574 }
575
Marc Kupietz623d7122025-05-25 12:46:12 +0200576 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200577
Marc Kupietz623d7122025-05-25 12:46:12 +0200578 # For global page tracking, calculate the absolute page number
579 actual_display_number <- if (randomizePageOrder) {
580 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
581 } else {
582 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
583 current_offset_page + 1
584 }
585
586 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
587 # based on the current batch being fetched
588
589 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
590 # Simply count from 1 within the current batch
591
592 # The relative page number is simply the current position in this batch
593 if (randomizePageOrder) {
594 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
595 } else {
596 relative_page_number <- display_page_number - (page_count_start - 1)
597 }
598
599 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200600 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200601 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200602 # Account for offset - we can only fetch from the remaining results after offset
603 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
604 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200605 } else {
606 # Otherwise fetch all remaining pages
607 total_pages - page_count_start + 1
608 }
609
610 # The total pages to be shown in this batch
611 batch_total_pages <- pages_in_this_batch
612
613 page_display <- paste0(
614 "Retrieved page ",
615 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
616 "/",
617 sprintf("%d", batch_total_pages)
618 )
619
620 # If randomized, also show which actual page we fetched
621 if (randomizePageOrder) {
622 # Determine the maximum width needed for page numbers (based on total pages)
623 # This ensures consistent alignment
624 max_page_width <- nchar(as.character(total_pages))
625 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200626 page_display <- paste0(
627 page_display,
628 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
629 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200630 }
631 # Always show the absolute page number and total pages (for clarity)
632 else {
633 # Show the absolute page number (out of total possible pages)
634 page_display <- paste0(page_display, sprintf(
635 " (page %d of %d total)",
636 actual_display_number, total_pages
637 ))
638 }
639
640 # Add caching or timing information
641 if (!is.null(res$meta$cached)) {
642 page_display <- paste0(page_display, " [cached]")
643 } else {
644 page_display <- paste0(
645 page_display,
646 " in ",
647 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200648 "s",
649 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200650 )
651 }
652
653 log_info(verbose, paste0(page_display, "\n"))
654
655 # Increment the appropriate counter based on mode
656 if (randomizePageOrder) {
657 page_index <- page_index + 1
658 } else {
659 current_page_number <- current_page_number + 1
660 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200661 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200662 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200663 break
664 }
665 }
Marc Kupietz68170952021-06-30 09:37:21 +0200666 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200667 KorAPQuery(
668 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200669 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200670 fields = kqo@fields,
671 requestUrl = kqo@requestUrl,
672 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200673 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200674 vc = kqo@vc,
675 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200676 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200677 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200678 collectedMatches = collectedMatches
679 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200680})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200681
682#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200683#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200684#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100685#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200686#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200687#' @param kqo object obtained from [corpusQuery()]
688#' @param verbose print progress information if true
689#' @param ... further arguments passed to [fetchNext()]
690#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200691#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200692#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200693#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200694#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
695#' q <- KorAPConnection() |>
696#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200697#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200698#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200699#'
700#' # Fetch also all KWICs
701#' q <- KorAPConnection() |> auth() |>
702#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
703#' fetchAll()
704#' q@collectedMatches
705#'
706#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
707#' q <- KorAPConnection() |>
708#' corpusQuery("<base/s=t>", # this matches each text once
709#' vc = "pubDate in 1958-03-12",
710#' fields = c("textSigle", "title"),
711#' ) |>
712#' fetchAll()
713#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100714#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200715#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200716#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200717#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200718setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
719 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200720})
721
722#' Fetches the remaining results of a KorAP query.
723#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200724#' @param kqo object obtained from [corpusQuery()]
725#' @param verbose print progress information if true
726#' @param ... further arguments passed to [fetchNext()]
727#' @return The updated `kqo` object with remaining results in `@collectedMatches`
728#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200729#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200730#' \dontrun{
731#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200732#' q <- KorAPConnection() |>
733#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200734#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200735#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100736#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200737#'
738#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200739#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200740setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
741 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200742})
743
Marc Kupietze52b2952025-07-17 16:53:02 +0200744#' Fetch annotations for all collected matches
745#'
746#' **`fetchAnnotations`** fetches annotations for all matches in the `@collectedMatches` slot
Marc Kupietzff712a92025-07-18 09:07:23 +0200747#' of a KorAPQuery object and stores them in the `@annotations` slot. The method automatically
748#' uses the `matchID` from collected matches when available for safer and more reliable
749#' annotation retrieval, falling back to constructing URLs from `matchStart` and `matchEnd`
750#' if necessary.
Marc Kupietze52b2952025-07-17 16:53:02 +0200751#'
752#' @family corpus search functions
753#' @aliases fetchAnnotations
754#'
755#' @param kqo object obtained from [corpusQuery()] with collected matches
756#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
757#' @param verbose print progress information if true
758#' @return The updated `kqo` object with annotations in `@annotations` slot
759#'
760#' @examples
761#' \dontrun{
762#'
763#' # Fetch annotations for matches using Tree-Tagger foundry
764#' q <- KorAPConnection() |>
765#' corpusQuery("Ameisenplage") |>
766#' fetchNext(maxFetch = 10) |>
767#' fetchAnnotations()
768#' q@annotations
769#'
770#' # Use a different foundry
771#' q <- KorAPConnection() |>
772#' corpusQuery("Ameisenplage") |>
773#' fetchNext(maxFetch = 10) |>
774#' fetchAnnotations(foundry = "mate")
775#' q@annotations
776#' }
777#'
778#' @export
779setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) {
780 if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
781 warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
782 return(kqo)
783 }
784
785 df <- kqo@collectedMatches
786 kco <- kqo@korapConnection
787 annotations_list <- list()
788
789 if (verbose) {
790 cat("Fetching annotations for", nrow(df), "matches using foundry:", foundry, "\n")
791 }
792
793 for (i in seq_len(nrow(df))) {
794 if (verbose && i %% 10 == 0) {
795 cat("Processing match", i, "of", nrow(df), "\n")
796 }
797
Marc Kupietzff712a92025-07-18 09:07:23 +0200798 # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
799 if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
800 # matchID format: "match-match-A00/JUN/39609-p202-203"
801 # Extract document path and position: A00/JUN/39609-p202-203
802 # Then convert to URL format: A00/JUN/39609/p202-203
803
804 # First extract the document path with position (everything after the last "match-")
805 doc_path_with_pos <- gsub(".*match-([^-]+(?:/[^-]+)*-p\\d+-\\d+).*", "\\1", df$matchID[i])
806 # Then convert the dash before position to slash
807 match_path <- gsub("-p(\\d+-\\d+)", "/p\\1", doc_path_with_pos)
808 req <- paste0(kco@apiUrl, "corpus/", match_path, "?foundry=", foundry)
809 if (verbose) {
810 cat("Using matchID approach for match", i, ": matchID =", df$matchID[i], "\n")
811 cat("Extracted doc path:", doc_path_with_pos, "\n")
812 cat("Final match path:", match_path, "\n")
813 cat("Constructed URL:", req, "\n")
814 }
815 } else {
816 # Fallback to the old method
817 req <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", df$matchStart[i], "-", df$matchEnd[i], "?foundry=", foundry)
818 if (verbose) {
819 cat("Using fallback approach for match", i, ": textSigle =", df$textSigle[i], "\n")
820 cat("Constructed URL:", req, "\n")
821 }
822 }
Marc Kupietze52b2952025-07-17 16:53:02 +0200823
824 tryCatch({
825 res <- apiCall(kco, req)
826 if (!is.null(res)) {
827 annotations_list[[i]] <- res
828 } else {
829 if (verbose) {
830 cat("Warning: No annotations returned for match", i, "\n")
831 }
832 annotations_list[[i]] <- NULL
833 }
834 }, error = function(e) {
835 if (verbose) {
836 cat("Error fetching annotations for match", i, ":", e$message, "\n")
837 }
838 annotations_list[[i]] <- NULL
839 })
840 }
841
842 if (verbose) {
843 successful_annotations <- sum(!sapply(annotations_list, is.null))
844 cat("Successfully fetched annotations for", successful_annotations, "of", nrow(df), "matches\n")
845 }
846
847 kqo@annotations <- annotations_list
848 return(kqo)
849})
850
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200851#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +0200852#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200853#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200854#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200855#' confidence intervals of one ore multiple search terms across one or multiple
856#' virtual corpora.
857#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200858#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +0200859#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +0200860#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200861#' \dontrun{
862#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200863#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +0200864#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100865#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200866#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200867# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +0100868#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200869#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
870#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +0200871#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
872#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200873#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +0200874#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200875#'
876#' @return A tibble, with each row containing the following result columns for query and vc combinations:
877#' - **query**: the query string used for the frequency analysis.
878#' - **totalResults**: absolute frequency of query matches in the vc.
879#' - **vc**: virtual corpus used for the query.
880#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
881#' - **total**: total number of words in vc.
882#' - **f**: relative frequency of query matches in the vc.
883#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
884#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
885
Marc Kupietzd8851222025-05-01 10:57:19 +0200886setMethod(
887 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100888 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200889 (if (as.alternatives) {
890 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietzea34b812025-06-25 15:49:00 +0200891 group_by(vc) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100892 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +0200893 } else {
894 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
895 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietzea34b812025-06-25 15:49:00 +0200896 }) |>
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200897 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +0200898 }
899)
Marc Kupietz3f575282019-10-04 14:46:04 +0200900
Marc Kupietz38a9d682024-12-06 16:17:09 +0100901#' buildWebUIRequestUrlFromString
902#'
903#' @rdname KorAPQuery-class
904#' @importFrom urltools url_encode
905#' @export
906buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200907 query,
908 vc = "",
909 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100910 if ("KorAPConnection" %in% class(KorAPUrl)) {
911 KorAPUrl <- KorAPUrl@KorAPUrl
912 }
913
914 request <-
915 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +0200916 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100917 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +0200918 ifelse(vc != "",
919 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
920 ""
921 ),
922 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +0100923 ql
924 )
925 paste0(KorAPUrl, request)
926}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200927
928#' buildWebUIRequestUrl
929#'
930#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +0100931#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200932#' @export
933buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +0200934 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200935 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +0200936 } else {
937 httr2::url_parse(KorAPUrl)$query$q
938 },
Marc Kupietzf9129592025-01-26 19:17:54 +0100939 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200940 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +0100941 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100942 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200943}
944
Marc Kupietzd8851222025-05-01 10:57:19 +0200945#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +0200946#' @rdname KorAPQuery-class
947#' @param x KorAPQuery object
948#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100949#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +0200950#' @export
951format.KorAPQuery <- function(x, ...) {
952 cat("<KorAPQuery>\n")
953 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +0200954 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100955 cat(" Query: ", param$q, "\n")
956 if (!is.null(param$cq) && param$cq != "") {
957 cat(" Virtual corpus: ", param$cq, "\n")
958 }
959 if (!is.null(q@collectedMatches)) {
960 cat("==============================================================================================================", "\n")
961 print(summary(q@collectedMatches))
962 cat("==============================================================================================================", "\n")
963 }
964 cat(" Total results: ", q@totalResults, "\n")
965 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietze52b2952025-07-17 16:53:02 +0200966 if (!is.null(q@annotations)) {
967 successful_annotations <- sum(!sapply(q@annotations, is.null))
968 cat(" Annotations: ", successful_annotations, " of ", length(q@annotations), " matches\n")
969 }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200970}
971
Marc Kupietze95108e2019-09-18 13:23:58 +0200972#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200973#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200974#' @rdname KorAPQuery-class
975#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200976#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200977setMethod("show", "KorAPQuery", function(object) {
978 format(object)
979})