blob: a8372e34c3ec7ab6c69f14a6cd4afe6b3358b7fa [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
Marc Kupietza29f3d42025-07-18 10:14:43 +020024 "hasMoreMatches"
Marc Kupietze95108e2019-09-18 13:23:58 +020025))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
Marc Kupietza29f3d42025-07-18 10:14:43 +020050 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
Marc Kupietzd8851222025-05-01 10:57:19 +020051 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietz0af75932025-09-09 18:14:16 +020071setGeneric(
72 "fetchAnnotations",
73 function(kqo,
74 foundry = "tt",
75 overwrite = FALSE,
76 verbose = kqo@korapConnection@verbose) standardGeneric("fetchAnnotations")
77)
Marc Kupietzd8851222025-05-01 10:57:19 +020078setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020079
80maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020081
Marc Kupietz4de53ec2019-10-04 09:12:00 +020082## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010083utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020084
Marc Kupietza8c40f42025-06-24 15:49:52 +020085#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020087#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020088#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020089#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020090#' @aliases corpusQuery
91#'
92#' @importFrom urltools url_encode
93#' @importFrom purrr pmap
Marc Kupietzea34b812025-06-25 15:49:00 +020094#' @importFrom dplyr bind_rows group_by
Marc Kupietzdbd431a2021-08-29 12:17:45 +020095#'
Marc Kupietz617266d2025-02-27 10:43:07 +010096#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020097#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020098#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020099#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +0200100#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
101#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
102#' your RKorAPClient application as explained in the
103#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
104#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
105#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200106#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200107#' @param fields character vector specifying which metadata fields to retrieve for each match.
108#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
109#' \describe{
110#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
111#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
112#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
113#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
114#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
115#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
116#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
117#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
118#' }
119#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
120#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100121#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200122#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200123#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200124#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200125#' @param context string that specifies the size of the left and the right context returned in `snippet`
126#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
127#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
128#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
129#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
130#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200131#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200132#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
133#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200134#'
135#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200136#' \dontrun{
137#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200138#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200139#' KorAPConnection() |>
140#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200141#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200142#'
143#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200144#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200145#' corpusQuery("Ameisenplage",
146#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
147#' results <- fetchAll(query)
148#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100149#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200150#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200151#' \dontrun{
152#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200153#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
154#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200155#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200156#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200157#' corpusQuery(
158#' KorAPUrl =
159#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
160#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200161#' }
162#'
163#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200164#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200165#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200166#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200167#' {
168#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200169#' } |>
170#' corpusQuery("Ameisenplage") |>
171#' fetchAll() |>
172#' slot("collectedMatches") |>
173#' mutate(year = lubridate::year(pubDate)) |>
174#' dplyr::select(year) |>
175#' group_by(year) |>
176#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200177#' mutate(Freq = mapply(function(f, y) {
178#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200179#' }, Count, year)) |>
180#' dplyr::select(-Count) |>
181#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200182#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100183#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200184#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200185#'
186#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200187#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200188#'
189#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200190setMethod(
191 "corpusQuery", "KorAPConnection",
192 function(kco,
193 query = if (missing(KorAPUrl)) {
194 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
195 } else {
196 httr2::url_parse(KorAPUrl)$query$q
197 },
198 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
199 KorAPUrl,
200 metadataOnly = TRUE,
201 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
202 fields = c(
203 "corpusSigle",
204 "textSigle",
205 "pubDate",
206 "pubPlace",
207 "availability",
208 "textClass",
209 "snippet",
210 "tokens"
211 ),
212 accessRewriteFatal = TRUE,
213 verbose = kco@verbose,
214 expand = length(vc) != length(query),
215 as.df = FALSE,
216 context = NULL) {
217 if (length(query) > 1 || length(vc) > 1) {
218 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200219
220 # Initialize timing variables for ETA calculation
221 total_queries <- nrow(grid)
222 current_query <- 0
223 start_time <- Sys.time()
224
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200225 results <- purrr::pmap(grid, function(query, vc, ...) {
226 current_query <<- current_query + 1
227
228 # Execute the single query directly (avoiding recursive call)
229 contentFields <- c("snippet", "tokens")
230 query_fields <- fields
231 if (metadataOnly) {
232 query_fields <- query_fields[!query_fields %in% contentFields]
233 }
234 if (!"textSigle" %in% query_fields) {
235 query_fields <- c(query_fields, "textSigle")
236 }
237 request <-
238 paste0(
239 "?q=",
240 url_encode(enc2utf8(query)),
241 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
242 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
243 ifelse(!metadataOnly, "&show-tokens=true", ""),
244 "&ql=", ql
245 )
246 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
247 requestUrl <- paste0(
248 kco@apiUrl,
249 "search",
250 request,
251 "&fields=",
252 paste(query_fields, collapse = ","),
253 if (metadataOnly) "&access-rewrite-disabled=true" else ""
254 )
255
256 # Show individual query progress
257 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
258 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
259 if (is.null(res)) {
260 log_info(verbose, ": API call failed\n")
261 totalResults <- 0
262 } else {
263 totalResults <- as.integer(res$meta$totalResults)
264 log_info(verbose, ": ", totalResults, " hits")
265 if (!is.null(res$meta$cached)) {
266 log_info(verbose, " [cached]")
267 } else if (!is.null(res$meta$benchmark)) {
268 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
269 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
270 formatted_time <- paste0(round(time_value, 2), "s")
271 log_info(verbose, ", took ", formatted_time)
272 } else {
273 log_info(verbose, ", took ", res$meta$benchmark)
274 }
275 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200276
277 # Calculate and display ETA information on the same line if verbose and we have more than one query
278 if (verbose && total_queries > 1) {
279 eta_info <- calculate_eta(current_query, total_queries, start_time)
280 if (eta_info != "") {
281 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
282 avg_time_per_query <- elapsed_time / current_query
283
284 # Add ETA info to the same line - remove the leading ". " for cleaner formatting
285 clean_eta_info <- sub("^\\. ", ". ", eta_info)
286 log_info(verbose, clean_eta_info)
287 }
288 }
289
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200290 log_info(verbose, "\n")
291 }
292
293 result <- data.frame(
294 query = query,
295 totalResults = totalResults,
296 vc = vc,
297 webUIRequestUrl = webUIRequestUrl,
298 stringsAsFactors = FALSE
299 )
300
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200301 return(result)
302 })
303
304 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200305 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200306 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100307 if (metadataOnly) {
308 fields <- fields[!fields %in% contentFields]
309 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100310 if (!"textSigle" %in% fields) {
311 fields <- c(fields, "textSigle")
312 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100313 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200314 paste0(
315 "?q=",
316 url_encode(enc2utf8(query)),
317 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
318 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
319 ifelse(!metadataOnly, "&show-tokens=true", ""),
320 "&ql=", ql
321 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100322 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
323 requestUrl <- paste0(
324 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200325 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100326 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200327 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100328 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200329 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100330 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200331 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
332 sep =
333 ""
334 )
335 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100336 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100337 message("API call failed.")
338 totalResults <- 0
339 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200340 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200341 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200342 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200343 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200344 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200345 # Round the benchmark time to 2 decimal places for better readability.
346 # Be robust to locales using comma as decimal separator (e.g., "0,12s").
Marc Kupietz7638ca42025-05-25 13:18:16 +0200347 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200348 bench_str <- sub("s$", "", res$meta$benchmark)
349 bench_num <- suppressWarnings(as.numeric(gsub(",", ".", bench_str)))
350 if (!is.na(bench_num)) {
351 formatted_time <- paste0(round(bench_num, 2), "s")
352 } else {
353 formatted_time <- res$meta$benchmark
354 }
Marc Kupietz7638ca42025-05-25 13:18:16 +0200355 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
356 } else {
357 # Fallback if the format is different than expected
358 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
359 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200360 } else {
361 log_info(verbose, "\n")
362 }
Marc Kupietza4675722022-02-23 23:55:15 +0100363 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200364 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100365 data.frame(
366 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100367 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100368 vc = vc,
369 webUIRequestUrl = webUIRequestUrl,
370 stringsAsFactors = FALSE
371 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200372 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100373 KorAPQuery(
374 korapConnection = kco,
375 nextStartIndex = 0,
376 fields = fields,
377 requestUrl = requestUrl,
378 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100379 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100380 vc = vc,
381 apiResponse = res,
382 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100383 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100384 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200385 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100386 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200387 }
388)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200389
Marc Kupietz05a60792024-12-07 16:23:31 +0100390#' @importFrom purrr map
391repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200392 if (is.list(x)) {
393 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100394 paste(.x, collapse = " ")
395 } else {
396 .x
397 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200398 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100399 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200400 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100401}
402
Marc Kupietz62da2b52019-09-12 17:43:34 +0200403#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200404#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200405#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200406#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200407#' @family corpus search functions
408#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200409#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200410#' @param offset start offset for query results to fetch
411#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200412#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200413#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
414#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200415#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100416#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200417#' \dontrun{
418#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200419#' q <- KorAPConnection() |>
420#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200421#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100422#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100423#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100424#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200425#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200426#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200427#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200428#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200429#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100430#' @importFrom tibble enframe add_column
431#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200432#' @importFrom tidyr unnest unchop pivot_wider
433#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200434#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200435setMethod("fetchNext", "KorAPQuery", function(kqo,
436 offset = kqo@nextStartIndex,
437 maxFetch = maxResultsPerPage,
438 verbose = kqo@korapConnection@verbose,
439 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100440 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200441 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100442
Marc Kupietze95108e2019-09-18 13:23:58 +0200443 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
444 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200445 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200446 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200447 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200448 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200449
Marc Kupietz24799fd2025-06-25 14:15:36 +0200450 # Track start time for ETA calculation
451 start_time <- Sys.time()
452
Marc Kupietz623d7122025-05-25 12:46:12 +0200453 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200454 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200455 # Calculate how many pages we need to fetch based on maxFetch
456 total_pages_to_fetch <- if (!is.na(maxFetch)) {
457 # Either limited by maxFetch or total results, whichever is smaller
458 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
459 } else {
460 # All pages
461 ceiling(kqo@totalResults / maxResultsPerPage)
462 }
463
464 # Generate randomized page indices (0-based for API)
465 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
466 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200467 }
468
Marc Kupietzd8851222025-05-01 10:57:19 +0200469 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200470 collectedMatches <- data.frame()
471 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200472
473 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
474 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
475 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
476 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
477
478 # For sequential fetches, keep track of which global page we're on
479 # This is important for correctly showing page numbers in subsequent fetchNext calls
480 page_count_start <- current_page_number
481
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200482 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200483 # Determine which page to fetch next
484 if (randomizePageOrder) {
485 # In randomized mode, get the page from our randomized list using the page_index
486 # Make sure we don't exceed the array bounds
487 if (page_index > length(pages)) {
488 break # No more pages to fetch in randomized mode
489 }
490 current_offset_page <- pages[page_index]
491 # For display purposes in randomized mode, show which page out of the total we're fetching
492 display_page_number <- page_index
493 } else {
494 # In sequential mode, use the current_page_number to calculate the offset
495 current_offset_page <- (current_page_number - 1)
496 display_page_number <- current_page_number
497 }
498
499 # Calculate the actual offset in tokens
500 currentOffset <- current_offset_page * maxResultsPerPage
501
Marc Kupietzef0e9392025-06-18 12:21:49 +0200502 # Build the query with the appropriate count and offset using httr2
503 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200504
Marc Kupietzef0e9392025-06-18 12:21:49 +0200505 # Parse existing URL to preserve all query parameters
506 parsed_url <- httr2::url_parse(kqo@requestUrl)
507 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200508
Marc Kupietzef0e9392025-06-18 12:21:49 +0200509 # Add/update count and offset parameters
510 existing_query$count <- count_param
511 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200512
Marc Kupietzef0e9392025-06-18 12:21:49 +0200513 # Rebuild the URL with all parameters
514 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200515 res <- apiCall(kqo@korapConnection, query)
516 if (length(res$matches) == 0) {
517 break
518 }
519
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200520 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100521 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100522 currentMatches <- res$matches$fields %>%
523 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
524 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200525 tidyr::unnest(cols = value) %>%
526 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200527 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200528 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200529 currentMatches$snippet <- res$matches$snippet
530 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100531 if ("tokens" %in% colnames(res$matches)) {
532 currentMatches$tokens <- res$matches$tokens
533 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200534 } else {
535 currentMatches <- res$matches
536 }
537
Marc Kupietze95108e2019-09-18 13:23:58 +0200538 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200539 if (!field %in% colnames(currentMatches)) {
540 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200541 }
542 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100543 currentMatches <- currentMatches %>%
544 select(kqo@fields) %>%
545 mutate(
Marc Kupietzff712a92025-07-18 09:07:23 +0200546 matchID = res$matches$matchID,
Marc Kupietz0447da02025-01-08 20:51:09 +0100547 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100548 matchStart = as.integer(stringr::word(tmp_positions, 1)),
549 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
550 ) %>%
551 select(-tmp_positions)
552
Marc Kupietz62da2b52019-09-12 17:43:34 +0200553 if (!is.list(collectedMatches)) {
554 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200555 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200556 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200557 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200558
Marc Kupietz623d7122025-05-25 12:46:12 +0200559 # Get the actual items per page from the API response
560 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200561
Marc Kupietz623d7122025-05-25 12:46:12 +0200562 # Calculate total pages consistently using fixed maxResultsPerPage
563 # This ensures consistent page counting across the function
564 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
565
Marc Kupietz24799fd2025-06-25 14:15:36 +0200566 # Calculate ETA using the centralized function from logging.R
567 current_page <- if (randomizePageOrder) page_index else display_page_number
568 total_pages_to_fetch <- if (!is.na(maxFetch)) {
569 # Account for offset - we can only fetch from the remaining results after offset
570 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
571 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
572 } else {
573 total_pages
574 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200575
Marc Kupietz24799fd2025-06-25 14:15:36 +0200576 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz365660e2025-06-25 15:09:55 +0200577
Marc Kupietz24799fd2025-06-25 14:15:36 +0200578 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200579 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200580 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200581 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200582 }
583
Marc Kupietz623d7122025-05-25 12:46:12 +0200584 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200585
Marc Kupietz623d7122025-05-25 12:46:12 +0200586 # For global page tracking, calculate the absolute page number
587 actual_display_number <- if (randomizePageOrder) {
588 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
589 } else {
590 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
591 current_offset_page + 1
592 }
593
594 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
595 # based on the current batch being fetched
596
597 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
598 # Simply count from 1 within the current batch
599
600 # The relative page number is simply the current position in this batch
601 if (randomizePageOrder) {
602 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
603 } else {
604 relative_page_number <- display_page_number - (page_count_start - 1)
605 }
606
607 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200608 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200609 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200610 # Account for offset - we can only fetch from the remaining results after offset
611 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
612 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200613 } else {
614 # Otherwise fetch all remaining pages
615 total_pages - page_count_start + 1
616 }
617
618 # The total pages to be shown in this batch
619 batch_total_pages <- pages_in_this_batch
620
621 page_display <- paste0(
622 "Retrieved page ",
623 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
624 "/",
625 sprintf("%d", batch_total_pages)
626 )
627
628 # If randomized, also show which actual page we fetched
629 if (randomizePageOrder) {
630 # Determine the maximum width needed for page numbers (based on total pages)
631 # This ensures consistent alignment
632 max_page_width <- nchar(as.character(total_pages))
633 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200634 page_display <- paste0(
635 page_display,
636 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
637 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200638 }
639 # Always show the absolute page number and total pages (for clarity)
640 else {
641 # Show the absolute page number (out of total possible pages)
642 page_display <- paste0(page_display, sprintf(
643 " (page %d of %d total)",
644 actual_display_number, total_pages
645 ))
646 }
647
648 # Add caching or timing information
649 if (!is.null(res$meta$cached)) {
650 page_display <- paste0(page_display, " [cached]")
651 } else {
652 page_display <- paste0(
653 page_display,
654 " in ",
655 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200656 "s",
657 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200658 )
659 }
660
661 log_info(verbose, paste0(page_display, "\n"))
662
663 # Increment the appropriate counter based on mode
664 if (randomizePageOrder) {
665 page_index <- page_index + 1
666 } else {
667 current_page_number <- current_page_number + 1
668 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200669 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200670 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200671 break
672 }
673 }
Marc Kupietz68170952021-06-30 09:37:21 +0200674 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200675 KorAPQuery(
676 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200677 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200678 fields = kqo@fields,
679 requestUrl = kqo@requestUrl,
680 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200681 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200682 vc = kqo@vc,
683 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200684 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200685 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200686 collectedMatches = collectedMatches
687 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200688})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200689
690#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200691#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200692#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100693#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200694#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200695#' @param kqo object obtained from [corpusQuery()]
696#' @param verbose print progress information if true
697#' @param ... further arguments passed to [fetchNext()]
698#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200699#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200700#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200701#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200702#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
703#' q <- KorAPConnection() |>
704#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200705#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200706#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200707#'
708#' # Fetch also all KWICs
709#' q <- KorAPConnection() |> auth() |>
710#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
711#' fetchAll()
712#' q@collectedMatches
713#'
714#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
715#' q <- KorAPConnection() |>
716#' corpusQuery("<base/s=t>", # this matches each text once
717#' vc = "pubDate in 1958-03-12",
718#' fields = c("textSigle", "title"),
719#' ) |>
720#' fetchAll()
721#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100722#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200723#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200724#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200725#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200726setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
727 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200728})
729
730#' Fetches the remaining results of a KorAP query.
731#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200732#' @param kqo object obtained from [corpusQuery()]
733#' @param verbose print progress information if true
734#' @param ... further arguments passed to [fetchNext()]
735#' @return The updated `kqo` object with remaining results in `@collectedMatches`
736#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200737#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200738#' \dontrun{
739#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200740#' q <- KorAPConnection() |>
741#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200742#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200743#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100744#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200745#'
746#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200747#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200748setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
749 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200750})
751
Marc Kupietzbdedd022025-10-09 14:14:15 +0200752# Helper to collapse multiple annotation values while preserving order
753collapse_features <- function(values) {
754 if (length(values) == 0) {
755 return(NA_character_)
756 }
757 unique_values <- values[!duplicated(values)]
758 paste(unique_values, collapse = "|")
759}
760
761# Extract token-level annotations from a DOM node
762collect_token_annotations <- function(parent_node) {
763 if (inherits(parent_node, "xml_missing")) {
764 return(list(
765 node = list(),
766 token = character(0),
767 lemma = character(0),
768 pos = character(0),
769 morph = character(0)
770 ))
771 }
772
773 leaf_nodes <- xml2::xml_find_all(parent_node, ".//span[not(.//span)]")
774
775 if (length(leaf_nodes) == 0) {
776 return(list(
777 node = list(),
778 token = character(0),
779 lemma = character(0),
780 pos = character(0),
781 morph = character(0)
782 ))
783 }
784
785 tokens <- character(0)
786 lemmas <- character(0)
787 pos_tags <- character(0)
788 morph_tags <- character(0)
789 kept_nodes <- list()
790
791 for (idx in seq_along(leaf_nodes)) {
792 leaf <- leaf_nodes[[idx]]
793 token_text <- trimws(xml2::xml_text(leaf))
794 if (identical(token_text, "")) {
795 next
796 }
797
798 kept_nodes[[length(kept_nodes) + 1]] <- leaf
799 tokens <- c(tokens, token_text)
800
801 ancestors <- xml2::xml_find_all(leaf, "ancestor-or-self::span")
802 titles <- xml2::xml_attr(ancestors, "title")
803 titles <- titles[!is.na(titles)]
804
805 feature_pieces <- if (length(titles) > 0) unlist(strsplit(titles, "[[:space:]]+")) else character(0)
806
807 lemma_values <- sub('.*?/l:(.*)$', '\\1', feature_pieces[grepl('/l:', feature_pieces)], perl = TRUE)
808 pos_values <- sub('.*?/p:(.*)$', '\\1', feature_pieces[grepl('/p:', feature_pieces)], perl = TRUE)
809 morph_values <- sub('.*?/m:(.*)$', '\\1', feature_pieces[grepl('/m:', feature_pieces)], perl = TRUE)
810
811 lemmas <- c(lemmas, collapse_features(lemma_values))
812 pos_tags <- c(pos_tags, collapse_features(pos_values))
813 morph_tags <- c(morph_tags, collapse_features(morph_values))
814 }
815
816 list(
817 node = kept_nodes,
818 token = tokens,
819 lemma = lemmas,
820 pos = pos_tags,
821 morph = morph_tags
822 )
823}
824
Marc Kupietza29f3d42025-07-18 10:14:43 +0200825#'
826#' Parse XML annotations into linguistic layers
827#'
828#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
829#' from XML annotation snippets returned by the KorAP API.
830#'
831#' @param xml_snippet XML string containing annotation data
832#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
833#' @keywords internal
834parse_xml_annotations <- function(xml_snippet) {
835 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
836 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
837 }
838
Marc Kupietzbdedd022025-10-09 14:14:15 +0200839 doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
840 if (is.null(doc)) {
841 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
Marc Kupietzcd452182025-10-09 13:28:41 +0200842 }
843
Marc Kupietzbdedd022025-10-09 14:14:15 +0200844 match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
845 if (inherits(match_node, "xml_missing")) {
846 match_node <- xml2::xml_find_first(doc, ".//span")
847 if (inherits(match_node, "xml_missing")) {
848 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200849 }
850 }
851
Marc Kupietzbdedd022025-10-09 14:14:15 +0200852 token_info <- collect_token_annotations(match_node)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200853
Marc Kupietzbdedd022025-10-09 14:14:15 +0200854 list(
855 token = token_info$token,
856 lemma = token_info$lemma,
857 pos = token_info$pos,
858 morph = token_info$morph
859 )
Marc Kupietza29f3d42025-07-18 10:14:43 +0200860}
861
862#'
863#' Parse XML annotations into linguistic layers with left/match/right structure
864#'
865#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
866#' from XML annotation snippets returned by the KorAP API, split into left context,
867#' match, and right context sections like the tokens field.
868#'
869#' @param xml_snippet XML string containing annotation data
870#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
871#' @keywords internal
872parse_xml_annotations_structured <- function(xml_snippet) {
873 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
874 empty_result <- list(left = character(0), match = character(0), right = character(0))
875 return(list(
876 atokens = empty_result,
877 lemma = empty_result,
878 pos = empty_result,
879 morph = empty_result
880 ))
881 }
882
Marc Kupietzbdedd022025-10-09 14:14:15 +0200883 doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
884 if (is.null(doc)) {
885 empty_result <- list(left = character(0), match = character(0), right = character(0))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200886 return(list(
Marc Kupietzbdedd022025-10-09 14:14:15 +0200887 atokens = empty_result,
888 lemma = empty_result,
889 pos = empty_result,
890 morph = empty_result
Marc Kupietza29f3d42025-07-18 10:14:43 +0200891 ))
892 }
893
Marc Kupietzbdedd022025-10-09 14:14:15 +0200894 match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
895 if (inherits(match_node, "xml_missing")) {
896 empty_result <- list(left = character(0), match = character(0), right = character(0))
897 return(list(
898 atokens = empty_result,
899 lemma = empty_result,
900 pos = empty_result,
901 morph = empty_result
902 ))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200903 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200904
Marc Kupietzbdedd022025-10-09 14:14:15 +0200905 token_info <- collect_token_annotations(match_node)
906 tokens <- token_info$token
907 lemmas <- token_info$lemma
908 pos_tags <- token_info$pos
909 morph_tags <- token_info$morph
910 nodes <- token_info$node
Marc Kupietzc643a122025-07-18 18:18:36 +0200911
Marc Kupietzbdedd022025-10-09 14:14:15 +0200912 if (length(tokens) == 0) {
913 empty_result <- list(left = character(0), match = character(0), right = character(0))
914 return(list(
915 atokens = empty_result,
916 lemma = empty_result,
917 pos = empty_result,
918 morph = empty_result
919 ))
920 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200921
Marc Kupietzbdedd022025-10-09 14:14:15 +0200922 mark_flags <- vapply(nodes, function(n) {
923 !inherits(xml2::xml_find_first(n, "ancestor::mark"), "xml_missing")
924 }, logical(1))
Marc Kupietzc643a122025-07-18 18:18:36 +0200925
Marc Kupietzbdedd022025-10-09 14:14:15 +0200926 if (any(mark_flags)) {
927 first_idx <- which(mark_flags)[1]
928 last_idx <- tail(which(mark_flags), 1)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200929 } else {
Marc Kupietzbdedd022025-10-09 14:14:15 +0200930 first_idx <- 1
931 last_idx <- length(tokens)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200932 }
933
Marc Kupietzbdedd022025-10-09 14:14:15 +0200934 sections <- rep("match", length(tokens))
935 if (first_idx > 1) {
936 sections[seq_len(first_idx - 1)] <- "left"
937 }
938 if (last_idx < length(tokens)) {
939 sections[seq(from = last_idx + 1, to = length(tokens))] <- "right"
940 }
Marc Kupietza29f3d42025-07-18 10:14:43 +0200941
Marc Kupietzbdedd022025-10-09 14:14:15 +0200942 subset_by_section <- function(values, section) {
943 idx <- sections == section
944 if (!any(idx)) {
945 return(character(0))
946 }
947 values[idx]
948 }
949
950 atokens <- list(
951 left = subset_by_section(tokens, "left"),
952 match = subset_by_section(tokens, "match"),
953 right = subset_by_section(tokens, "right")
954 )
955
956 lemma <- list(
957 left = subset_by_section(lemmas, "left"),
958 match = subset_by_section(lemmas, "match"),
959 right = subset_by_section(lemmas, "right")
960 )
961
962 pos <- list(
963 left = subset_by_section(pos_tags, "left"),
964 match = subset_by_section(pos_tags, "match"),
965 right = subset_by_section(pos_tags, "right")
966 )
967
968 morph <- list(
969 left = subset_by_section(morph_tags, "left"),
970 match = subset_by_section(morph_tags, "match"),
971 right = subset_by_section(morph_tags, "right")
972 )
973
974 list(
975 atokens = atokens,
976 lemma = lemma,
977 pos = pos,
978 morph = morph
979 )
Marc Kupietza29f3d42025-07-18 10:14:43 +0200980}
981
Marc Kupietze52b2952025-07-17 16:53:02 +0200982#' Fetch annotations for all collected matches
983#'
Marc Kupietz89f796e2025-07-19 09:05:06 +0200984#' `r lifecycle::badge("experimental")`
985#'
986#' **`fetchAnnotations`** fetches annotations (only token annotations, for now)
987#' for all matches in the `@collectedMatches` slot
Marc Kupietzc643a122025-07-18 18:18:36 +0200988#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
Marc Kupietz89f796e2025-07-19 09:05:06 +0200989#' data frame. The method uses the `matchID` from collected matches.
Marc Kupietza29f3d42025-07-18 10:14:43 +0200990#'
991#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
992#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
993#' available for annotation parsing.
994#'
995#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
996#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
997#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
998#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
999#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
1000#' - `annotation_snippet`: original XML snippet from the annotation API
Marc Kupietze52b2952025-07-17 16:53:02 +02001001#'
1002#' @family corpus search functions
Marc Kupietz89f796e2025-07-19 09:05:06 +02001003#' @concept Annotations
Marc Kupietze52b2952025-07-17 16:53:02 +02001004#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001005#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
Marc Kupietze52b2952025-07-17 16:53:02 +02001006#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
Marc Kupietz93787d52025-09-03 13:33:25 +02001007#' @param overwrite logical; if TRUE, re-fetch and replace any existing
1008#' annotation columns. If FALSE (default), only add missing annotation layers
1009#' and preserve already fetched ones (e.g., keep POS/lemma from a previous
1010#' foundry while adding morph from another).
Marc Kupietze52b2952025-07-17 16:53:02 +02001011#' @param verbose print progress information if true
Marc Kupietz0af75932025-09-09 18:14:16 +02001012#' @return The updated `kqo` object with annotation columns
Marc Kupietz89f796e2025-07-19 09:05:06 +02001013#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
1014#' in the `@collectedMatches` slot. Each column is a data frame
1015#' with `left`, `match`, and `right` columns containing list vectors of annotations
1016#' for the left context, matched tokens, and right context, respectively.
1017#' The original XML snippet for each match is also stored in `annotation_snippet`.
Marc Kupietze52b2952025-07-17 16:53:02 +02001018#'
1019#' @examples
1020#' \dontrun{
1021#'
1022#' # Fetch annotations for matches using Tree-Tagger foundry
Marc Kupietza29f3d42025-07-18 10:14:43 +02001023#' # Note: Authorization required for copyright-restricted corpora
Marc Kupietze52b2952025-07-17 16:53:02 +02001024#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001025#' auth() |>
1026#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001027#' fetchNext(maxFetch = 10) |>
1028#' fetchAnnotations()
Marc Kupietze52b2952025-07-17 16:53:02 +02001029#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001030#' # Access linguistic annotations for match i:
Marc Kupietz6aa5a0d2025-09-08 17:51:47 +02001031#' pos_tags <- q@collectedMatches$pos
1032#' # Data frame with left/match/right columns for POS tags
1033#' lemmas <- q@collectedMatches$lemma
1034#' # Data frame with left/match/right columns for lemmas
1035#' morphology <- q@collectedMatches$morph
1036#' # Data frame with left/match/right columns for morphological tags
1037#' atokens <- q@collectedMatches$atokens
1038#' # Data frame with left/match/right columns for annotation token text
Marc Kupietz0af75932025-09-09 18:14:16 +02001039#' # Original XML snippet for match i
1040#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]]
Marc Kupietzc643a122025-07-18 18:18:36 +02001041#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001042#' # Access specific components:
Marc Kupietz0af75932025-09-09 18:14:16 +02001043#' # POS tags for the matched tokens in match i
1044#' match_pos <- q@collectedMatches$pos$match[[i]]
1045#' # Lemmas for the left context in match i
1046#' left_lemmas <- q@collectedMatches$lemma$left[[i]]
1047#' # Token text for the right context in match i
1048#' right_tokens <- q@collectedMatches$atokens$right[[i]]
Marc Kupietza29f3d42025-07-18 10:14:43 +02001049#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001050#' # Use a different foundry (e.g., MarMoT)
Marc Kupietze52b2952025-07-17 16:53:02 +02001051#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001052#' auth() |>
1053#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001054#' fetchNext(maxFetch = 10) |>
Marc Kupietz89f796e2025-07-19 09:05:06 +02001055#' fetchAnnotations(foundry = "marmot")
1056#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
Marc Kupietze52b2952025-07-17 16:53:02 +02001057#' }
Marc Kupietze52b2952025-07-17 16:53:02 +02001058#' @export
Marc Kupietz0af75932025-09-09 18:14:16 +02001059setMethod("fetchAnnotations", "KorAPQuery", function(kqo,
1060 foundry = "tt",
1061 overwrite = FALSE,
1062 verbose = kqo@korapConnection@verbose) {
1063 if (is.null(kqo@collectedMatches) ||
1064 nrow(kqo@collectedMatches) == 0) {
1065 warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
1066 return(kqo)
1067 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001068
Marc Kupietze52b2952025-07-17 16:53:02 +02001069 df <- kqo@collectedMatches
1070 kco <- kqo@korapConnection
Marc Kupietza29f3d42025-07-18 10:14:43 +02001071
Marc Kupietza29f3d42025-07-18 10:14:43 +02001072 # Initialize annotation columns as data frames (like tokens field)
1073 # Create the structure more explicitly to avoid assignment issues
1074 nrows <- nrow(df)
Marc Kupietzc643a122025-07-18 18:18:36 +02001075
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001076 # Pre-compute the empty character vector list to avoid repeated computation
1077 empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
Marc Kupietz0af75932025-09-09 18:14:16 +02001078
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001079 # Helper function to create annotation data frame structure
1080 create_annotation_df <- function(empty_list) {
1081 data.frame(
1082 left = empty_list,
1083 match = empty_list,
1084 right = empty_list,
1085 stringsAsFactors = FALSE
1086 )
1087 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001088
Marc Kupietz93787d52025-09-03 13:33:25 +02001089 # Track which annotation columns already existed to decide overwrite behavior
1090 existing_types <- list(
1091 pos = "pos" %in% colnames(df),
1092 lemma = "lemma" %in% colnames(df),
1093 morph = "morph" %in% colnames(df),
1094 atokens = "atokens" %in% colnames(df),
1095 annotation_snippet = "annotation_snippet" %in% colnames(df)
1096 )
1097
1098 # Initialize annotation columns using the helper function
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001099 annotation_types <- c("pos", "lemma", "morph", "atokens")
1100 for (type in annotation_types) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001101 if (overwrite || !existing_types[[type]]) {
1102 df[[type]] <- create_annotation_df(empty_char_list)
1103 }
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001104 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001105
Marc Kupietz93787d52025-09-03 13:33:25 +02001106 if (overwrite || !existing_types$annotation_snippet) {
feldmuellera02f1932025-09-15 16:38:06 +02001107 df$annotation_snippet <- rep(NA_character_, nrows) # Fixed line
Marc Kupietz93787d52025-09-03 13:33:25 +02001108 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001109
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001110 # Initialize timing for ETA calculation
1111 start_time <- Sys.time()
1112 if (verbose) {
1113 log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
1114 }
1115
Marc Kupietz93787d52025-09-03 13:33:25 +02001116 # Helper to decide if existing annotation row is effectively empty
1117 is_empty_annotation_row <- function(ann_df, row_index) {
1118 if (is.null(ann_df) || nrow(ann_df) < row_index) return(TRUE)
1119 left_val <- ann_df$left[[row_index]]
1120 match_val <- ann_df$match[[row_index]]
1121 right_val <- ann_df$right[[row_index]]
1122 all(
1123 (is.null(left_val) || (length(left_val) == 0) || all(is.na(left_val))),
1124 (is.null(match_val) || (length(match_val) == 0) || all(is.na(match_val))),
1125 (is.null(right_val) || (length(right_val) == 0) || all(is.na(right_val)))
1126 )
1127 }
1128
Marc Kupietze52b2952025-07-17 16:53:02 +02001129 for (i in seq_len(nrow(df))) {
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001130 # ETA logging
1131 if (verbose && i > 1) {
1132 eta_info <- calculate_eta(i, nrows, start_time)
1133 log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
1134 }
Marc Kupietzff712a92025-07-18 09:07:23 +02001135 # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
1136 if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001137 # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
1138 # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
1139 # Extract document path and position, handling both regular and encrypted formats
Marc Kupietzc643a122025-07-18 18:18:36 +02001140
Marc Kupietza29f3d42025-07-18 10:14:43 +02001141 # More flexible regex to extract the document path with position and encryption
1142 # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
1143 # We need to capture the entire path including the encrypted suffix
1144 match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001145
Marc Kupietza29f3d42025-07-18 10:14:43 +02001146 if (match_result > 0) {
1147 # Extract the complete path including encryption (everything after "match-")
1148 doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
1149 # Convert the dash before position to slash, but keep everything after the position
1150 match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
Marc Kupietz25121302025-07-19 08:45:43 +02001151 # Use httr2 to construct URL safely
1152 base_url <- paste0(kco@apiUrl, "corpus/", match_path)
1153 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietza29f3d42025-07-18 10:14:43 +02001154 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001155 # If regex fails, fall back to the old method with httr2
1156 # Format numbers to avoid scientific notation
1157 match_start <- format(df$matchStart[i], scientific = FALSE)
1158 match_end <- format(df$matchEnd[i], scientific = FALSE)
1159 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1160 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001161 }
1162 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001163 # Fallback to the old method with httr2
1164 # Format numbers to avoid scientific notation
1165 match_start <- format(df$matchStart[i], scientific = FALSE)
1166 match_end <- format(df$matchEnd[i], scientific = FALSE)
1167 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1168 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001169 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001170
Marc Kupietze52b2952025-07-17 16:53:02 +02001171 tryCatch({
1172 res <- apiCall(kco, req)
Marc Kupietzc643a122025-07-18 18:18:36 +02001173
Marc Kupietze52b2952025-07-17 16:53:02 +02001174 if (!is.null(res)) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001175 # Store the raw annotation snippet (respect overwrite flag)
1176 if (overwrite || !existing_types$annotation_snippet || is.null(df$annotation_snippet[[i]]) || is.na(df$annotation_snippet[[i]])) {
1177 df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
1178 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001179
1180 # Parse XML annotations if snippet is available
1181 if (is.list(res) && "snippet" %in% names(res)) {
1182 parsed_annotations <- parse_xml_annotations_structured(res$snippet)
1183
1184 # Store the parsed linguistic data in data frame format (like tokens)
1185 # Use individual assignment to avoid data frame mismatch errors
1186 tryCatch({
1187 # Assign POS annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001188 if (overwrite || !existing_types$pos || is_empty_annotation_row(df$pos, i)) {
1189 df$pos$left[i] <- list(parsed_annotations$pos$left)
1190 df$pos$match[i] <- list(parsed_annotations$pos$match)
1191 df$pos$right[i] <- list(parsed_annotations$pos$right)
1192 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001193
Marc Kupietza29f3d42025-07-18 10:14:43 +02001194 # Assign lemma annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001195 if (overwrite || !existing_types$lemma || is_empty_annotation_row(df$lemma, i)) {
1196 df$lemma$left[i] <- list(parsed_annotations$lemma$left)
1197 df$lemma$match[i] <- list(parsed_annotations$lemma$match)
1198 df$lemma$right[i] <- list(parsed_annotations$lemma$right)
1199 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001200
Marc Kupietza29f3d42025-07-18 10:14:43 +02001201 # Assign morphology annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001202 if (overwrite || !existing_types$morph || is_empty_annotation_row(df$morph, i)) {
1203 df$morph$left[i] <- list(parsed_annotations$morph$left)
1204 df$morph$match[i] <- list(parsed_annotations$morph$match)
1205 df$morph$right[i] <- list(parsed_annotations$morph$right)
1206 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001207
Marc Kupietza29f3d42025-07-18 10:14:43 +02001208 # Assign token annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001209 if (overwrite || !existing_types$atokens || is_empty_annotation_row(df$atokens, i)) {
1210 df$atokens$left[i] <- list(parsed_annotations$atokens$left)
1211 df$atokens$match[i] <- list(parsed_annotations$atokens$match)
1212 df$atokens$right[i] <- list(parsed_annotations$atokens$right)
1213 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001214 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001215 # Set empty character vectors on assignment error using list assignment
Marc Kupietz93787d52025-09-03 13:33:25 +02001216 if (overwrite || !existing_types$pos) {
1217 df$pos$left[i] <<- list(character(0))
1218 df$pos$match[i] <<- list(character(0))
1219 df$pos$right[i] <<- list(character(0))
1220 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001221
Marc Kupietz93787d52025-09-03 13:33:25 +02001222 if (overwrite || !existing_types$lemma) {
1223 df$lemma$left[i] <<- list(character(0))
1224 df$lemma$match[i] <<- list(character(0))
1225 df$lemma$right[i] <<- list(character(0))
1226 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001227
Marc Kupietz93787d52025-09-03 13:33:25 +02001228 if (overwrite || !existing_types$morph) {
1229 df$morph$left[i] <<- list(character(0))
1230 df$morph$match[i] <<- list(character(0))
1231 df$morph$right[i] <<- list(character(0))
1232 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001233
Marc Kupietz93787d52025-09-03 13:33:25 +02001234 if (overwrite || !existing_types$atokens) {
1235 df$atokens$left[i] <<- list(character(0))
1236 df$atokens$match[i] <<- list(character(0))
1237 df$atokens$right[i] <<- list(character(0))
1238 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001239 })
Marc Kupietza29f3d42025-07-18 10:14:43 +02001240 } else {
1241 # No snippet available, store empty vectors
Marc Kupietz93787d52025-09-03 13:33:25 +02001242 if (overwrite || !existing_types$pos) {
1243 df$pos$left[i] <- list(character(0))
1244 df$pos$match[i] <- list(character(0))
1245 df$pos$right[i] <- list(character(0))
1246 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001247
Marc Kupietz93787d52025-09-03 13:33:25 +02001248 if (overwrite || !existing_types$lemma) {
1249 df$lemma$left[i] <- list(character(0))
1250 df$lemma$match[i] <- list(character(0))
1251 df$lemma$right[i] <- list(character(0))
1252 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001253
Marc Kupietz93787d52025-09-03 13:33:25 +02001254 if (overwrite || !existing_types$morph) {
1255 df$morph$left[i] <- list(character(0))
1256 df$morph$match[i] <- list(character(0))
1257 df$morph$right[i] <- list(character(0))
1258 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001259
Marc Kupietz93787d52025-09-03 13:33:25 +02001260 if (overwrite || !existing_types$atokens) {
1261 df$atokens$left[i] <- list(character(0))
1262 df$atokens$match[i] <- list(character(0))
1263 df$atokens$right[i] <- list(character(0))
1264 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001265 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001266 } else {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001267 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001268 if (overwrite || !existing_types$pos) {
1269 df$pos$left[i] <- list(NA)
1270 df$pos$match[i] <- list(NA)
1271 df$pos$right[i] <- list(NA)
1272 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001273
Marc Kupietz93787d52025-09-03 13:33:25 +02001274 if (overwrite || !existing_types$lemma) {
1275 df$lemma$left[i] <- list(NA)
1276 df$lemma$match[i] <- list(NA)
1277 df$lemma$right[i] <- list(NA)
1278 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001279
Marc Kupietz93787d52025-09-03 13:33:25 +02001280 if (overwrite || !existing_types$morph) {
1281 df$morph$left[i] <- list(NA)
1282 df$morph$match[i] <- list(NA)
1283 df$morph$right[i] <- list(NA)
1284 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001285
Marc Kupietz93787d52025-09-03 13:33:25 +02001286 if (overwrite || !existing_types$atokens) {
1287 df$atokens$left[i] <- list(NA)
1288 df$atokens$match[i] <- list(NA)
1289 df$atokens$right[i] <- list(NA)
1290 }
1291 if (overwrite || !existing_types$annotation_snippet) {
1292 df$annotation_snippet[[i]] <- NA
1293 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001294 }
1295 }, error = function(e) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001296 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001297 if (overwrite || !existing_types$pos) {
1298 df$pos$left[i] <- list(NA)
1299 df$pos$match[i] <- list(NA)
1300 df$pos$right[i] <- list(NA)
1301 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001302
Marc Kupietz93787d52025-09-03 13:33:25 +02001303 if (overwrite || !existing_types$lemma) {
1304 df$lemma$left[i] <- list(NA)
1305 df$lemma$match[i] <- list(NA)
1306 df$lemma$right[i] <- list(NA)
1307 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001308
Marc Kupietz93787d52025-09-03 13:33:25 +02001309 if (overwrite || !existing_types$morph) {
1310 df$morph$left[i] <- list(NA)
1311 df$morph$match[i] <- list(NA)
1312 df$morph$right[i] <- list(NA)
1313 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001314
Marc Kupietz93787d52025-09-03 13:33:25 +02001315 if (overwrite || !existing_types$atokens) {
1316 df$atokens$left[i] <- list(NA)
1317 df$atokens$match[i] <- list(NA)
1318 df$atokens$right[i] <- list(NA)
1319 }
1320 if (overwrite || !existing_types$annotation_snippet) {
1321 df$annotation_snippet[[i]] <- NA
1322 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001323 })
1324 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001325
Marc Kupietza29f3d42025-07-18 10:14:43 +02001326 # Validate data frame structure before assignment
1327 if (nrow(df) != nrow(kqo@collectedMatches)) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001328 }
1329
1330 # Update the collectedMatches with annotation data
1331 tryCatch({
1332 kqo@collectedMatches <- df
1333 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001334 # Try a safer approach: add columns individually
1335 tryCatch({
1336 kqo@collectedMatches$pos <- df$pos
Marc Kupietzc643a122025-07-18 18:18:36 +02001337 kqo@collectedMatches$lemma <- df$lemma
Marc Kupietza29f3d42025-07-18 10:14:43 +02001338 kqo@collectedMatches$morph <- df$morph
1339 kqo@collectedMatches$atokens <- df$atokens
1340 kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
1341 }, error = function(col_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001342 warning("Failed to add annotation data to collectedMatches")
1343 })
1344 })
1345
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001346 if (verbose) {
1347 elapsed_time <- Sys.time() - start_time
1348 log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
1349 }
1350
Marc Kupietze52b2952025-07-17 16:53:02 +02001351 return(kqo)
1352})
1353
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001354#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +02001355#'
Marc Kupietz67edcb52021-09-20 21:54:24 +02001356#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001357#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +02001358#' confidence intervals of one ore multiple search terms across one or multiple
1359#' virtual corpora.
1360#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02001361#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +02001362#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +02001363#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +02001364#' \dontrun{
1365#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001366#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +02001367#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +01001368#' }
Marc Kupietz3f575282019-10-04 14:46:04 +02001369#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001370# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +01001371#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001372#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
1373#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +02001374#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
1375#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001376#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +02001377#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001378#'
1379#' @return A tibble, with each row containing the following result columns for query and vc combinations:
1380#' - **query**: the query string used for the frequency analysis.
1381#' - **totalResults**: absolute frequency of query matches in the vc.
1382#' - **vc**: virtual corpus used for the query.
1383#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
1384#' - **total**: total number of words in vc.
1385#' - **f**: relative frequency of query matches in the vc.
1386#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
1387#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
1388
Marc Kupietzd8851222025-05-01 10:57:19 +02001389setMethod(
1390 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +01001391 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +02001392 (if (as.alternatives) {
1393 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietzea34b812025-06-25 15:49:00 +02001394 group_by(vc) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +01001395 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +02001396 } else {
1397 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
1398 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietzea34b812025-06-25 15:49:00 +02001399 }) |>
Marc Kupietz0c29cea2019-10-09 08:44:36 +02001400 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +02001401 }
1402)
Marc Kupietz3f575282019-10-04 14:46:04 +02001403
Marc Kupietz38a9d682024-12-06 16:17:09 +01001404#' buildWebUIRequestUrlFromString
1405#'
1406#' @rdname KorAPQuery-class
1407#' @importFrom urltools url_encode
1408#' @export
1409buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +02001410 query,
1411 vc = "",
1412 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001413 if ("KorAPConnection" %in% class(KorAPUrl)) {
1414 KorAPUrl <- KorAPUrl@KorAPUrl
1415 }
1416
1417 request <-
1418 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +02001419 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001420 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +02001421 ifelse(vc != "",
1422 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
1423 ""
1424 ),
1425 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001426 ql
1427 )
1428 paste0(KorAPUrl, request)
1429}
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001430
1431#' buildWebUIRequestUrl
1432#'
1433#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +01001434#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001435#' @export
1436buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +02001437 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001438 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +02001439 } else {
1440 httr2::url_parse(KorAPUrl)$query$q
1441 },
Marc Kupietzf9129592025-01-26 19:17:54 +01001442 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001443 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +01001444 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001445 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001446}
1447
Marc Kupietzd8851222025-05-01 10:57:19 +02001448#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +02001449#' @rdname KorAPQuery-class
1450#' @param x KorAPQuery object
1451#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001452#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +02001453#' @export
1454format.KorAPQuery <- function(x, ...) {
1455 cat("<KorAPQuery>\n")
1456 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +02001457 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001458 cat(" Query: ", param$q, "\n")
1459 if (!is.null(param$cq) && param$cq != "") {
1460 cat(" Virtual corpus: ", param$cq, "\n")
1461 }
1462 if (!is.null(q@collectedMatches)) {
1463 cat("==============================================================================================================", "\n")
1464 print(summary(q@collectedMatches))
1465 cat("==============================================================================================================", "\n")
1466 }
1467 cat(" Total results: ", q@totalResults, "\n")
1468 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietza29f3d42025-07-18 10:14:43 +02001469 if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
1470 successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
1471 parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
1472 cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
1473 if (parsed_annotations > 0) {
1474 cat(" (", parsed_annotations, " with parsed linguistic data)")
1475 }
1476 cat("\n")
Marc Kupietze52b2952025-07-17 16:53:02 +02001477 }
Marc Kupietz62da2b52019-09-12 17:43:34 +02001478}
1479
Marc Kupietze95108e2019-09-18 13:23:58 +02001480#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +02001481#'
Marc Kupietze95108e2019-09-18 13:23:58 +02001482#' @rdname KorAPQuery-class
1483#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +02001484#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +02001485setMethod("show", "KorAPQuery", function(object) {
1486 format(object)
Marc Kupietzc643a122025-07-18 18:18:36 +02001487 invisible(object)
Marc Kupietze95108e2019-09-18 13:23:58 +02001488})