blob: 1afb122966953aa9707d5c67b83bcc786dd2c7ac [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
Marc Kupietza29f3d42025-07-18 10:14:43 +020024 "hasMoreMatches"
Marc Kupietze95108e2019-09-18 13:23:58 +020025))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
Marc Kupietza29f3d42025-07-18 10:14:43 +020050 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
Marc Kupietzd8851222025-05-01 10:57:19 +020051 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietz0af75932025-09-09 18:14:16 +020071setGeneric(
72 "fetchAnnotations",
73 function(kqo,
74 foundry = "tt",
75 overwrite = FALSE,
76 verbose = kqo@korapConnection@verbose) standardGeneric("fetchAnnotations")
77)
Marc Kupietzd8851222025-05-01 10:57:19 +020078setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020079
80maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020081
Marc Kupietz4de53ec2019-10-04 09:12:00 +020082## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010083utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020084
Marc Kupietza8c40f42025-06-24 15:49:52 +020085#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020087#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020088#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020089#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020090#' @aliases corpusQuery
91#'
92#' @importFrom urltools url_encode
93#' @importFrom purrr pmap
Marc Kupietzea34b812025-06-25 15:49:00 +020094#' @importFrom dplyr bind_rows group_by
Marc Kupietzdbd431a2021-08-29 12:17:45 +020095#'
Marc Kupietz617266d2025-02-27 10:43:07 +010096#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020097#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020098#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020099#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +0200100#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
101#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
102#' your RKorAPClient application as explained in the
103#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
104#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
105#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200106#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200107#' @param fields character vector specifying which metadata fields to retrieve for each match.
108#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
109#' \describe{
110#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
111#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
112#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
113#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
114#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
115#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
116#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
117#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
118#' }
119#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
120#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100121#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200122#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200123#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200124#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200125#' @param context string that specifies the size of the left and the right context returned in `snippet`
126#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
127#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
128#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
129#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
130#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200131#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200132#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
133#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200134#'
135#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200136#' \dontrun{
137#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200138#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200139#' KorAPConnection() |>
140#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200141#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200142#'
143#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200144#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200145#' corpusQuery("Ameisenplage",
146#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
147#' results <- fetchAll(query)
148#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100149#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200150#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200151#' \dontrun{
152#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200153#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
154#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200155#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200156#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200157#' corpusQuery(
158#' KorAPUrl =
159#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
160#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200161#' }
162#'
163#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200164#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200165#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200166#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200167#' {
168#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200169#' } |>
170#' corpusQuery("Ameisenplage") |>
171#' fetchAll() |>
172#' slot("collectedMatches") |>
173#' mutate(year = lubridate::year(pubDate)) |>
174#' dplyr::select(year) |>
175#' group_by(year) |>
176#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200177#' mutate(Freq = mapply(function(f, y) {
178#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200179#' }, Count, year)) |>
180#' dplyr::select(-Count) |>
181#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200182#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100183#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200184#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200185#'
186#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200187#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200188#'
189#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200190setMethod(
191 "corpusQuery", "KorAPConnection",
192 function(kco,
193 query = if (missing(KorAPUrl)) {
194 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
195 } else {
196 httr2::url_parse(KorAPUrl)$query$q
197 },
198 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
199 KorAPUrl,
200 metadataOnly = TRUE,
201 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
202 fields = c(
203 "corpusSigle",
204 "textSigle",
205 "pubDate",
206 "pubPlace",
207 "availability",
208 "textClass",
209 "snippet",
210 "tokens"
211 ),
212 accessRewriteFatal = TRUE,
213 verbose = kco@verbose,
214 expand = length(vc) != length(query),
215 as.df = FALSE,
216 context = NULL) {
217 if (length(query) > 1 || length(vc) > 1) {
218 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200219
220 # Initialize timing variables for ETA calculation
221 total_queries <- nrow(grid)
222 current_query <- 0
223 start_time <- Sys.time()
224
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200225 results <- purrr::pmap(grid, function(query, vc, ...) {
226 current_query <<- current_query + 1
227
228 # Execute the single query directly (avoiding recursive call)
229 contentFields <- c("snippet", "tokens")
230 query_fields <- fields
231 if (metadataOnly) {
232 query_fields <- query_fields[!query_fields %in% contentFields]
233 }
234 if (!"textSigle" %in% query_fields) {
235 query_fields <- c(query_fields, "textSigle")
236 }
237 request <-
238 paste0(
239 "?q=",
240 url_encode(enc2utf8(query)),
241 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
242 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
243 ifelse(!metadataOnly, "&show-tokens=true", ""),
244 "&ql=", ql
245 )
246 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
247 requestUrl <- paste0(
248 kco@apiUrl,
249 "search",
250 request,
251 "&fields=",
252 paste(query_fields, collapse = ","),
253 if (metadataOnly) "&access-rewrite-disabled=true" else ""
254 )
255
256 # Show individual query progress
257 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
258 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
259 if (is.null(res)) {
260 log_info(verbose, ": API call failed\n")
261 totalResults <- 0
262 } else {
Marc Kupietz336c85d2025-07-24 13:52:03 +0200263 # Check for query rewrites and warn the user
264 warnOnRewrites(res)
265
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200266 totalResults <- as.integer(res$meta$totalResults)
267 log_info(verbose, ": ", totalResults, " hits")
268 if (!is.null(res$meta$cached)) {
269 log_info(verbose, " [cached]")
270 } else if (!is.null(res$meta$benchmark)) {
271 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
272 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
273 formatted_time <- paste0(round(time_value, 2), "s")
274 log_info(verbose, ", took ", formatted_time)
275 } else {
276 log_info(verbose, ", took ", res$meta$benchmark)
277 }
278 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200279
280 # Calculate and display ETA information on the same line if verbose and we have more than one query
281 if (verbose && total_queries > 1) {
282 eta_info <- calculate_eta(current_query, total_queries, start_time)
283 if (eta_info != "") {
284 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
285 avg_time_per_query <- elapsed_time / current_query
286
287 # Add ETA info to the same line - remove the leading ". " for cleaner formatting
288 clean_eta_info <- sub("^\\. ", ". ", eta_info)
289 log_info(verbose, clean_eta_info)
290 }
291 }
292
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200293 log_info(verbose, "\n")
294 }
295
296 result <- data.frame(
297 query = query,
298 totalResults = totalResults,
299 vc = vc,
300 webUIRequestUrl = webUIRequestUrl,
301 stringsAsFactors = FALSE
302 )
303
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200304 return(result)
305 })
306
307 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200308 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200309 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100310 if (metadataOnly) {
311 fields <- fields[!fields %in% contentFields]
312 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100313 if (!"textSigle" %in% fields) {
314 fields <- c(fields, "textSigle")
315 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100316 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200317 paste0(
318 "?q=",
319 url_encode(enc2utf8(query)),
320 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
321 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
322 ifelse(!metadataOnly, "&show-tokens=true", ""),
323 "&ql=", ql
324 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100325 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
326 requestUrl <- paste0(
327 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200328 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100329 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200330 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100331 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200332 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100333 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200334 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
335 sep =
336 ""
337 )
338 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100339 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100340 message("API call failed.")
341 totalResults <- 0
342 } else {
Marc Kupietz336c85d2025-07-24 13:52:03 +0200343 # Check for query rewrites and warn the user
344 warnOnRewrites(res)
345
Marc Kupietzd8851222025-05-01 10:57:19 +0200346 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200347 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200348 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200349 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200350 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200351 # Round the benchmark time to 2 decimal places for better readability.
352 # Be robust to locales using comma as decimal separator (e.g., "0,12s").
Marc Kupietz7638ca42025-05-25 13:18:16 +0200353 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200354 bench_str <- sub("s$", "", res$meta$benchmark)
355 bench_num <- suppressWarnings(as.numeric(gsub(",", ".", bench_str)))
356 if (!is.na(bench_num)) {
357 formatted_time <- paste0(round(bench_num, 2), "s")
358 } else {
359 formatted_time <- res$meta$benchmark
360 }
Marc Kupietz7638ca42025-05-25 13:18:16 +0200361 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
362 } else {
363 # Fallback if the format is different than expected
364 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
365 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200366 } else {
367 log_info(verbose, "\n")
368 }
Marc Kupietza4675722022-02-23 23:55:15 +0100369 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200370 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100371 data.frame(
372 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100373 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100374 vc = vc,
375 webUIRequestUrl = webUIRequestUrl,
376 stringsAsFactors = FALSE
377 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200378 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100379 KorAPQuery(
380 korapConnection = kco,
381 nextStartIndex = 0,
382 fields = fields,
383 requestUrl = requestUrl,
384 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100385 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100386 vc = vc,
387 apiResponse = res,
388 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100389 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100390 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200391 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100392 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200393 }
394)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200395
Marc Kupietz336c85d2025-07-24 13:52:03 +0200396# Helper function to check if a query rewrite warning should be shown
397warnOnRewrites <- function(res) {
398 if (!is.null(res$collection$rewrites)) {
399 comment <- res$collection$rewrites$`_comment`
400 # Only show warning if it's not just the standard policy message
401 if (!is.null(comment) && comment != "All corpus access policy has been added.") {
402 warning(res$collection$rewrites$editor, " had to rewrite your query: ", comment)
403 }
404 }
405}
406
Marc Kupietz05a60792024-12-07 16:23:31 +0100407#' @importFrom purrr map
408repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200409 if (is.list(x)) {
410 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100411 paste(.x, collapse = " ")
412 } else {
413 .x
414 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200415 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100416 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200417 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100418}
419
Marc Kupietz62da2b52019-09-12 17:43:34 +0200420#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200421#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200422#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200423#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200424#' @family corpus search functions
425#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200426#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200427#' @param offset start offset for query results to fetch
428#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200429#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200430#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
431#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200432#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100433#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200434#' \dontrun{
435#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200436#' q <- KorAPConnection() |>
437#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200438#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100439#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100440#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100441#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200442#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200443#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200444#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200445#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200446#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100447#' @importFrom tibble enframe add_column
448#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200449#' @importFrom tidyr unnest unchop pivot_wider
450#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200451#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200452setMethod("fetchNext", "KorAPQuery", function(kqo,
453 offset = kqo@nextStartIndex,
454 maxFetch = maxResultsPerPage,
455 verbose = kqo@korapConnection@verbose,
456 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100457 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200458 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100459
Marc Kupietze95108e2019-09-18 13:23:58 +0200460 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
461 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200462 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200463 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200464 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200465 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200466
Marc Kupietz24799fd2025-06-25 14:15:36 +0200467 # Track start time for ETA calculation
468 start_time <- Sys.time()
469
Marc Kupietz623d7122025-05-25 12:46:12 +0200470 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200471 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200472 # Calculate how many pages we need to fetch based on maxFetch
473 total_pages_to_fetch <- if (!is.na(maxFetch)) {
474 # Either limited by maxFetch or total results, whichever is smaller
475 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
476 } else {
477 # All pages
478 ceiling(kqo@totalResults / maxResultsPerPage)
479 }
480
481 # Generate randomized page indices (0-based for API)
482 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
483 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200484 }
485
Marc Kupietzd8851222025-05-01 10:57:19 +0200486 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200487 collectedMatches <- data.frame()
488 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200489
490 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
491 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
492 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
493 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
494
495 # For sequential fetches, keep track of which global page we're on
496 # This is important for correctly showing page numbers in subsequent fetchNext calls
497 page_count_start <- current_page_number
498
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200499 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200500 # Determine which page to fetch next
501 if (randomizePageOrder) {
502 # In randomized mode, get the page from our randomized list using the page_index
503 # Make sure we don't exceed the array bounds
504 if (page_index > length(pages)) {
505 break # No more pages to fetch in randomized mode
506 }
507 current_offset_page <- pages[page_index]
508 # For display purposes in randomized mode, show which page out of the total we're fetching
509 display_page_number <- page_index
510 } else {
511 # In sequential mode, use the current_page_number to calculate the offset
512 current_offset_page <- (current_page_number - 1)
513 display_page_number <- current_page_number
514 }
515
516 # Calculate the actual offset in tokens
517 currentOffset <- current_offset_page * maxResultsPerPage
518
Marc Kupietzef0e9392025-06-18 12:21:49 +0200519 # Build the query with the appropriate count and offset using httr2
520 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200521
Marc Kupietzef0e9392025-06-18 12:21:49 +0200522 # Parse existing URL to preserve all query parameters
523 parsed_url <- httr2::url_parse(kqo@requestUrl)
524 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200525
Marc Kupietzef0e9392025-06-18 12:21:49 +0200526 # Add/update count and offset parameters
527 existing_query$count <- count_param
528 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200529
Marc Kupietzef0e9392025-06-18 12:21:49 +0200530 # Rebuild the URL with all parameters
531 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz336c85d2025-07-24 13:52:03 +0200532
Marc Kupietz68170952021-06-30 09:37:21 +0200533 res <- apiCall(kqo@korapConnection, query)
534 if (length(res$matches) == 0) {
535 break
536 }
537
Marc Kupietz336c85d2025-07-24 13:52:03 +0200538 # Check for query rewrites and warn the user
539 warnOnRewrites(res)
540
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200541 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100542 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100543 currentMatches <- res$matches$fields %>%
544 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
545 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200546 tidyr::unnest(cols = value) %>%
547 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200548 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200549 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200550 currentMatches$snippet <- res$matches$snippet
551 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100552 if ("tokens" %in% colnames(res$matches)) {
553 currentMatches$tokens <- res$matches$tokens
554 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200555 } else {
556 currentMatches <- res$matches
557 }
558
Marc Kupietze95108e2019-09-18 13:23:58 +0200559 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200560 if (!field %in% colnames(currentMatches)) {
561 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200562 }
563 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100564 currentMatches <- currentMatches %>%
565 select(kqo@fields) %>%
566 mutate(
Marc Kupietzff712a92025-07-18 09:07:23 +0200567 matchID = res$matches$matchID,
Marc Kupietz0447da02025-01-08 20:51:09 +0100568 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100569 matchStart = as.integer(stringr::word(tmp_positions, 1)),
570 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
571 ) %>%
572 select(-tmp_positions)
573
Marc Kupietz62da2b52019-09-12 17:43:34 +0200574 if (!is.list(collectedMatches)) {
575 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200576 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200577 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200578 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200579
Marc Kupietz336c85d2025-07-24 13:52:03 +0200580
Marc Kupietz623d7122025-05-25 12:46:12 +0200581 # Get the actual items per page from the API response
582 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200583
Marc Kupietz623d7122025-05-25 12:46:12 +0200584 # Calculate total pages consistently using fixed maxResultsPerPage
585 # This ensures consistent page counting across the function
586 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
587
Marc Kupietz24799fd2025-06-25 14:15:36 +0200588 # Calculate ETA using the centralized function from logging.R
589 current_page <- if (randomizePageOrder) page_index else display_page_number
590 total_pages_to_fetch <- if (!is.na(maxFetch)) {
591 # Account for offset - we can only fetch from the remaining results after offset
592 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
593 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
594 } else {
595 total_pages
596 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200597
Marc Kupietz24799fd2025-06-25 14:15:36 +0200598 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz365660e2025-06-25 15:09:55 +0200599
Marc Kupietz24799fd2025-06-25 14:15:36 +0200600 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200601 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200602 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200603 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200604 }
605
Marc Kupietz623d7122025-05-25 12:46:12 +0200606 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200607
Marc Kupietz623d7122025-05-25 12:46:12 +0200608 # For global page tracking, calculate the absolute page number
609 actual_display_number <- if (randomizePageOrder) {
610 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
611 } else {
612 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
613 current_offset_page + 1
614 }
615
616 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
617 # based on the current batch being fetched
618
619 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
620 # Simply count from 1 within the current batch
621
622 # The relative page number is simply the current position in this batch
623 if (randomizePageOrder) {
624 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
625 } else {
626 relative_page_number <- display_page_number - (page_count_start - 1)
627 }
628
629 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200630 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200631 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200632 # Account for offset - we can only fetch from the remaining results after offset
633 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
634 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200635 } else {
636 # Otherwise fetch all remaining pages
637 total_pages - page_count_start + 1
638 }
639
640 # The total pages to be shown in this batch
641 batch_total_pages <- pages_in_this_batch
642
643 page_display <- paste0(
644 "Retrieved page ",
645 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
646 "/",
647 sprintf("%d", batch_total_pages)
648 )
649
650 # If randomized, also show which actual page we fetched
651 if (randomizePageOrder) {
652 # Determine the maximum width needed for page numbers (based on total pages)
653 # This ensures consistent alignment
654 max_page_width <- nchar(as.character(total_pages))
655 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200656 page_display <- paste0(
657 page_display,
658 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
659 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200660 }
661 # Always show the absolute page number and total pages (for clarity)
662 else {
663 # Show the absolute page number (out of total possible pages)
664 page_display <- paste0(page_display, sprintf(
665 " (page %d of %d total)",
666 actual_display_number, total_pages
667 ))
668 }
669
670 # Add caching or timing information
671 if (!is.null(res$meta$cached)) {
672 page_display <- paste0(page_display, " [cached]")
673 } else {
674 page_display <- paste0(
675 page_display,
676 " in ",
677 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200678 "s",
679 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200680 )
681 }
682
683 log_info(verbose, paste0(page_display, "\n"))
684
685 # Increment the appropriate counter based on mode
686 if (randomizePageOrder) {
687 page_index <- page_index + 1
688 } else {
689 current_page_number <- current_page_number + 1
690 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200691 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200692 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200693 break
694 }
695 }
Marc Kupietz68170952021-06-30 09:37:21 +0200696 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200697 KorAPQuery(
698 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200699 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200700 fields = kqo@fields,
701 requestUrl = kqo@requestUrl,
702 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200703 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200704 vc = kqo@vc,
705 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200706 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200707 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200708 collectedMatches = collectedMatches
709 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200710})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200711
712#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200713#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200714#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100715#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200716#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200717#' @param kqo object obtained from [corpusQuery()]
718#' @param verbose print progress information if true
719#' @param ... further arguments passed to [fetchNext()]
720#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200721#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200722#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200723#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200724#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
725#' q <- KorAPConnection() |>
726#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200727#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200728#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200729#'
730#' # Fetch also all KWICs
731#' q <- KorAPConnection() |> auth() |>
732#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
733#' fetchAll()
734#' q@collectedMatches
735#'
736#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
737#' q <- KorAPConnection() |>
738#' corpusQuery("<base/s=t>", # this matches each text once
739#' vc = "pubDate in 1958-03-12",
740#' fields = c("textSigle", "title"),
741#' ) |>
742#' fetchAll()
743#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100744#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200745#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200746#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200747#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200748setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
749 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200750})
751
752#' Fetches the remaining results of a KorAP query.
753#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200754#' @param kqo object obtained from [corpusQuery()]
755#' @param verbose print progress information if true
756#' @param ... further arguments passed to [fetchNext()]
757#' @return The updated `kqo` object with remaining results in `@collectedMatches`
758#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200759#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200760#' \dontrun{
761#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200762#' q <- KorAPConnection() |>
763#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200764#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200765#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100766#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200767#'
768#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200769#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200770setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
771 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200772})
773
Marc Kupietza29f3d42025-07-18 10:14:43 +0200774#'
775#' Parse XML annotations into linguistic layers
776#'
777#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
778#' from XML annotation snippets returned by the KorAP API.
779#'
780#' @param xml_snippet XML string containing annotation data
781#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
782#' @keywords internal
783parse_xml_annotations <- function(xml_snippet) {
784 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
785 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
786 }
787
Marc Kupietzcd452182025-10-09 13:28:41 +0200788 extract_morph_via_xml <- function(fragment) {
789 snippet <- paste0("<root>", fragment, "</root>")
790 doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
791 if (is.null(doc)) return(NULL)
792
793 nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
794 if (length(nodes) == 0) return(list(tokens = character(0), morph = character(0)))
795
796 tokens_xml <- character(0)
797 morph_vals <- character(0)
798
799 for (node in nodes) {
800 token_text <- trimws(xml2::xml_text(node))
801 if (identical(token_text, "")) next
802
803 tokens_xml <- c(tokens_xml, token_text)
804
805 ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
806 titles <- xml2::xml_attr(ancestors, "title")
807 titles <- titles[!is.na(titles)]
808
809 feature_tokens <- character(0)
810 if (length(titles) > 0) {
811 bits <- unlist(strsplit(titles, "[[:space:]]+"))
812 bits <- bits[grepl('/m:', bits)]
813 if (length(bits) > 0) {
814 feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
815 feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
816 }
817 }
818
819 if (length(feature_tokens) == 0) {
820 morph_vals <- c(morph_vals, NA_character_)
821 } else {
822 morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
823 }
824 }
825
826 list(tokens = tokens_xml, morph = morph_vals)
827 }
828
Marc Kupietza29f3d42025-07-18 10:14:43 +0200829 # Extract content within <span class="match">...</span> using a more robust approach
830 if (grepl('<span class="match">', xml_snippet)) {
831 # Find the start of match span
832 start_pos <- regexpr('<span class="match">', xml_snippet)
833 if (start_pos > 0) {
834 # Find the end by counting nested spans
835 content_start <- start_pos + attr(start_pos, "match.length")
836 remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
837
838 # Simple approach: extract everything until we hit context-right or end
839 if (grepl('<span class="context-right">', remaining)) {
840 content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining)
841 } else {
842 # Find the closing </span> that matches our opening span
843 # For now, use a simpler approach - take everything until the last </span> sequence
844 content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining)
845 }
846 } else {
847 content_to_parse <- xml_snippet
848 }
849 } else {
850 content_to_parse <- xml_snippet
851 }
852
853 # Initialize result vectors
854 tokens <- character(0)
855 lemmas <- character(0)
856 pos_tags <- character(0)
857 morph_tags <- character(0)
858
859 # Split the content by </span> and process each meaningful part
860 parts <- unlist(strsplit(content_to_parse, '</span>'))
861
862 for (part in parts) {
863 part <- trimws(part)
864 if (nchar(part) == 0) next
865
866 # Look for parts that have title attributes and end with text
867 if (grepl('<span[^>]*title=', part)) {
868 # Extract the text content (everything after the last >)
869 text_content <- gsub('.*>([^<]*)$', '\\1', part)
870 text_content <- trimws(text_content)
871
872 if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
873 tokens <- c(tokens, text_content)
874
875 # Extract all title attributes from this part
876 title_pattern <- 'title="([^"]*)"'
877 title_matches <- gregexpr(title_pattern, part)
878
879 lemma <- NA
880 pos_tag <- NA
Marc Kupietzcd452182025-10-09 13:28:41 +0200881 morph_features <- character(0)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200882
883 if (title_matches[[1]][1] != -1) {
884 all_titles <- regmatches(part, title_matches)[[1]]
885 for (title_match in all_titles) {
886 title_content <- gsub(title_pattern, '\\1', title_match)
Marc Kupietzc643a122025-07-18 18:18:36 +0200887
Marc Kupietza29f3d42025-07-18 10:14:43 +0200888 # Split by spaces and process each annotation
889 annotations <- unlist(strsplit(title_content, "\\s+"))
890 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200891 if (grepl('^[^/]+/l:', annotation)) {
892 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
893 } else if (grepl('^[^/]+/p:', annotation)) {
894 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
895 } else if (grepl('^[^/]+/m:', annotation)) {
Marc Kupietzcd452182025-10-09 13:28:41 +0200896 morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200897 }
898 }
899 }
900 }
901
902 lemmas <- c(lemmas, lemma)
903 pos_tags <- c(pos_tags, pos_tag)
Marc Kupietzcd452182025-10-09 13:28:41 +0200904 morph_tag <- if (length(morph_features) > 0) {
905 paste(unique(morph_features), collapse = "|")
906 } else {
907 NA
908 }
Marc Kupietza29f3d42025-07-18 10:14:43 +0200909 morph_tags <- c(morph_tags, morph_tag)
910 }
911 }
912 }
913
914 # If no tokens found with the splitting approach, try a different method
915 if (length(tokens) == 0) {
916 # Look for the innermost spans that contain actual text
917 innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
918 innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
919
920 if (innermost_matches[[1]][1] != -1) {
921 matches <- regmatches(content_to_parse, innermost_matches)[[1]]
922
923 for (match in matches) {
924 title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
925 text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
926 text <- trimws(text)
927
928 if (nchar(text) > 0) {
929 tokens <- c(tokens, text)
930
931 # Parse space-separated annotations in title
932 lemma <- NA
933 pos_tag <- NA
Marc Kupietzcd452182025-10-09 13:28:41 +0200934 morph_features <- character(0)
Marc Kupietzc643a122025-07-18 18:18:36 +0200935
Marc Kupietza29f3d42025-07-18 10:14:43 +0200936 annotations <- unlist(strsplit(title, "\\s+"))
937 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200938 if (grepl('^[^/]+/l:', annotation)) {
939 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
940 } else if (grepl('^[^/]+/p:', annotation)) {
941 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
942 } else if (grepl('^[^/]+/m:', annotation)) {
Marc Kupietzcd452182025-10-09 13:28:41 +0200943 morph_features <- c(morph_features, gsub('^[^/]+/m:(.*)$', '\\1', annotation))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200944 }
945 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200946
Marc Kupietza29f3d42025-07-18 10:14:43 +0200947 lemmas <- c(lemmas, lemma)
948 pos_tags <- c(pos_tags, pos_tag)
Marc Kupietzcd452182025-10-09 13:28:41 +0200949 morph_tag <- if (length(morph_features) > 0) {
950 paste(unique(morph_features), collapse = "|")
951 } else {
952 NA
953 }
Marc Kupietza29f3d42025-07-18 10:14:43 +0200954 morph_tags <- c(morph_tags, morph_tag)
955 }
956 }
957 }
958 }
959
Marc Kupietzcd452182025-10-09 13:28:41 +0200960 xml_morph <- extract_morph_via_xml(xml_snippet)
961 if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
962 morph_tags <- xml_morph$morph
963 }
964
Marc Kupietza29f3d42025-07-18 10:14:43 +0200965 # Ensure all vectors have the same length
966 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
967 if (max_length > 0) {
968 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
969 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
970 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
971 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
972 }
973
974 return(list(
975 token = tokens,
976 lemma = lemmas,
977 pos = pos_tags,
978 morph = morph_tags
979 ))
980}
981
982#'
983#' Parse XML annotations into linguistic layers with left/match/right structure
984#'
985#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
986#' from XML annotation snippets returned by the KorAP API, split into left context,
987#' match, and right context sections like the tokens field.
988#'
989#' @param xml_snippet XML string containing annotation data
990#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
991#' @keywords internal
992parse_xml_annotations_structured <- function(xml_snippet) {
993 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
994 empty_result <- list(left = character(0), match = character(0), right = character(0))
995 return(list(
996 atokens = empty_result,
997 lemma = empty_result,
998 pos = empty_result,
999 morph = empty_result
1000 ))
1001 }
1002
Marc Kupietzcd452182025-10-09 13:28:41 +02001003 extract_morphological_features_via_xml <- function(section_content) {
1004 snippet <- paste0("<root>", section_content, "</root>")
1005 doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
1006 if (is.null(doc)) return(NULL)
1007
1008 nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
1009 if (length(nodes) == 0) {
1010 return(list(tokens = character(0), morph = character(0)))
1011 }
1012
1013 tokens_xml <- character(0)
1014 morph_vals <- character(0)
1015
1016 for (node in nodes) {
1017 token_text <- trimws(xml2::xml_text(node))
1018 if (identical(token_text, "")) next
1019
1020 tokens_xml <- c(tokens_xml, token_text)
1021
1022 ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
1023 titles <- xml2::xml_attr(ancestors, "title")
1024 titles <- titles[!is.na(titles)]
1025
1026 feature_tokens <- character(0)
1027 if (length(titles) > 0) {
1028 bits <- unlist(strsplit(titles, "[[:space:]]+"))
1029 bits <- bits[grepl('/m:', bits)]
1030 if (length(bits) > 0) {
1031 feature_tokens <- sub('.*?/m:(.*)$', '\\1', bits, perl = TRUE)
1032 feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
1033 }
1034 }
1035
1036 if (length(feature_tokens) == 0) {
1037 morph_vals <- c(morph_vals, NA_character_)
1038 } else {
1039 morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "|"))
1040 }
1041 }
1042
1043 list(tokens = tokens_xml, morph = morph_vals)
1044 }
1045
Marc Kupietza29f3d42025-07-18 10:14:43 +02001046 # Helper function to extract annotations from a span section
1047 extract_annotations_from_section <- function(section_content) {
Marc Kupietz560b5912025-09-01 17:36:13 +02001048 # Remove any <mark>...</mark> tags that may interrupt token boundaries
1049 section_no_marks <- gsub('</?mark[^>]*>', '', section_content, perl = TRUE)
1050 # Normalize separators between adjacent top-level spans so splitting is robust.
1051 # Replace any punctuation/entity/space run between one-or-more closing spans and the next opening span
1052 # with a single space, preserving all closing spans.
1053 section_norm <- gsub('((?:</span>)+)[[:space:]]*(?:&[^;]+;|[[:punct:]]|[[:space:]])*[[:space:]]*(<span)', '\\1 \\2', section_no_marks, perl = TRUE)
1054 # Handle both spaced tokens and nested single tokens by scanning innermost spans with direct text
Marc Kupietza29f3d42025-07-18 10:14:43 +02001055 tokens <- character(0)
1056 lemmas <- character(0)
1057 pos_tags <- character(0)
1058 morph_tags <- character(0)
Marc Kupietz89f796e2025-07-19 09:05:06 +02001059
Marc Kupietz560b5912025-09-01 17:36:13 +02001060 pat_token <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
1061 mm <- gregexpr(pat_token, section_norm, perl = TRUE)
1062 if (mm[[1]][1] != -1) {
1063 starts <- mm[[1]]
1064 lens <- attr(mm[[1]], 'match.length')
1065 for (k in seq_along(starts)) {
1066 s <- starts[k]
1067 e <- s + lens[k] - 1
1068 fragment <- substr(section_norm, s, e)
1069 text_content <- sub(pat_token, '\\2', fragment, perl = TRUE)
1070 text_content <- trimws(text_content)
1071 title_content <- sub(pat_token, '\\1', fragment, perl = TRUE)
Marc Kupietz89f796e2025-07-19 09:05:06 +02001072
Marc Kupietz560b5912025-09-01 17:36:13 +02001073 if (nchar(text_content) == 0) next
Marc Kupietz89f796e2025-07-19 09:05:06 +02001074
Marc Kupietz560b5912025-09-01 17:36:13 +02001075 lemma <- NA
1076 pos_tag <- NA
1077 morph_features <- character(0)
Marc Kupietz89f796e2025-07-19 09:05:06 +02001078
Marc Kupietz560b5912025-09-01 17:36:13 +02001079 # parse inner title
1080 ann <- unlist(strsplit(title_content, "[[:space:]]+"))
1081 for (a in ann) {
1082 if (grepl('/l:', a)) {
1083 lemma <- sub('.*?/l:(.*)$', '\\1', a, perl = TRUE)
1084 } else if (grepl('/p:', a)) {
1085 pos_tag <- sub('.*?/p:(.*)$', '\\1', a, perl = TRUE)
1086 } else if (grepl('/m:', a)) {
1087 morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', a, perl = TRUE))
Marc Kupietza29f3d42025-07-18 10:14:43 +02001088 }
1089 }
Marc Kupietz560b5912025-09-01 17:36:13 +02001090
1091 # If lemma missing, look back in nearby context for the nearest title containing l:
1092 if (is.na(lemma) || nchar(lemma) == 0) {
1093 ctx_start <- max(1, s - 500)
1094 context <- substr(section_norm, ctx_start, s - 1)
1095 tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
1096 if (tmm[[1]][1] != -1) {
1097 ctx_titles <- regmatches(context, tmm)[[1]]
1098 for (ti in rev(ctx_titles)) {
1099 cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
1100 if (grepl('/l:', cont)) {
1101 lemma <- sub('.*?/l:([^ ]+).*', '\\1', cont, perl = TRUE)
1102 break
1103 }
1104 }
1105 }
1106 }
1107
1108 # If POS missing, keep NA; morphological features may also appear in outer titles
Marc Kupietzcd452182025-10-09 13:28:41 +02001109 ctx_start <- max(1, s - 500)
1110 context <- substr(section_norm, ctx_start, s - 1)
1111 tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
1112 if (tmm[[1]][1] != -1) {
1113 ctx_titles <- regmatches(context, tmm)[[1]]
1114 collecting <- FALSE
1115 for (ti in rev(ctx_titles)) {
1116 cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
1117 if (grepl('/m:', cont)) {
1118 collecting <- TRUE
1119 mparts <- unlist(strsplit(cont, "[[:space:]]+"))
1120 features <- sub('.*?/m:(.*)$', '\\1', mparts[grepl('/m:', mparts)], perl = TRUE)
1121 if (length(features) > 0) {
1122 new_features <- features[!features %in% morph_features]
1123 morph_features <- c(morph_features, new_features)
Marc Kupietz560b5912025-09-01 17:36:13 +02001124 }
Marc Kupietzcd452182025-10-09 13:28:41 +02001125 } else if (collecting) {
1126 break
Marc Kupietz560b5912025-09-01 17:36:13 +02001127 }
1128 }
1129 }
1130
1131 tokens <- c(tokens, text_content)
1132 lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
1133 pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
Marc Kupietzcd452182025-10-09 13:28:41 +02001134 morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(unique(morph_features), collapse = "|") else NA)
Marc Kupietza29f3d42025-07-18 10:14:43 +02001135 }
1136 }
1137
Marc Kupietzcd452182025-10-09 13:28:41 +02001138 # Optionally replace morphological tags using XML-based extraction if it aligns with tokens
1139 xml_morph <- extract_morphological_features_via_xml(section_content)
1140 if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
1141 morph_tags <- xml_morph$morph
1142 }
1143
Marc Kupietza29f3d42025-07-18 10:14:43 +02001144 # Ensure all vectors have the same length
1145 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
1146 if (max_length > 0) {
1147 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
1148 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
1149 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
1150 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
1151 }
1152
1153 return(list(
1154 tokens = tokens,
1155 lemmas = lemmas,
1156 pos_tags = pos_tags,
1157 morph_tags = morph_tags
1158 ))
1159 }
1160
1161 # Split the XML into three parts: left context, match content, and right context
1162 # The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
Marc Kupietzc643a122025-07-18 18:18:36 +02001163
Marc Kupietza29f3d42025-07-18 10:14:43 +02001164 # First extract the content within the match span using DOTALL modifier
1165 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">'
1166 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001167
Marc Kupietza29f3d42025-07-18 10:14:43 +02001168 if (match_span_match == -1) {
1169 # Try alternative pattern if no context-right
1170 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$'
1171 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
1172 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001173
Marc Kupietza29f3d42025-07-18 10:14:43 +02001174 if (match_span_match > 0) {
1175 match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001176
Marc Kupietza29f3d42025-07-18 10:14:43 +02001177 # Now find the <mark> and </mark> positions within this content
1178 mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
Marc Kupietz560b5912025-09-01 17:36:13 +02001179 # Use the LAST closing </mark> to cover multi-part matches
1180 mark_end_gre <- gregexpr('</mark>', match_span_content, perl = TRUE)
1181 mark_end_positions <- mark_end_gre[[1]]
1182 mark_end <- if (!is.null(mark_end_positions) && length(mark_end_positions) > 0 && mark_end_positions[1] != -1)
1183 mark_end_positions[length(mark_end_positions)] else -1
1184 mark_end_len <- if (mark_end != -1) attr(mark_end_gre[[1]], "match.length")[length(mark_end_positions)] else 0
Marc Kupietzc643a122025-07-18 18:18:36 +02001185
Marc Kupietza29f3d42025-07-18 10:14:43 +02001186 if (mark_start > 0 && mark_end > 0) {
Marc Kupietz560b5912025-09-01 17:36:13 +02001187 # Left context: everything before first <mark>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001188 left_content <- substr(match_span_content, 1, mark_start - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001189
Marc Kupietz560b5912025-09-01 17:36:13 +02001190 # Match content: everything between first <mark> and last </mark>
1191 match_content <- substr(match_span_content, mark_start, mark_end + mark_end_len - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001192
Marc Kupietz560b5912025-09-01 17:36:13 +02001193 # Right context: everything after last </mark>
1194 right_content_start <- mark_end + mark_end_len
Marc Kupietza29f3d42025-07-18 10:14:43 +02001195 right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
1196 } else {
1197 # No mark tags found, treat entire match span as match content
1198 left_content <- ""
1199 match_content <- match_span_content
1200 right_content <- ""
1201 }
1202 } else {
1203 # No match span found, treat entire content as match
1204 left_content <- ""
1205 match_content <- xml_snippet
1206 right_content <- ""
1207 }
1208
1209 # Process each section
1210 left_annotations <- extract_annotations_from_section(left_content)
1211 match_annotations <- extract_annotations_from_section(match_content)
1212 right_annotations <- extract_annotations_from_section(right_content)
1213
1214 return(list(
1215 atokens = list(
1216 left = left_annotations$tokens,
1217 match = match_annotations$tokens,
1218 right = right_annotations$tokens
1219 ),
1220 lemma = list(
1221 left = left_annotations$lemmas,
1222 match = match_annotations$lemmas,
1223 right = right_annotations$lemmas
1224 ),
1225 pos = list(
1226 left = left_annotations$pos_tags,
1227 match = match_annotations$pos_tags,
1228 right = right_annotations$pos_tags
1229 ),
1230 morph = list(
1231 left = left_annotations$morph_tags,
1232 match = match_annotations$morph_tags,
1233 right = right_annotations$morph_tags
1234 )
1235 ))
1236}
1237
Marc Kupietze52b2952025-07-17 16:53:02 +02001238#' Fetch annotations for all collected matches
1239#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001240#' `r lifecycle::badge("experimental")`
1241#'
1242#' **`fetchAnnotations`** fetches annotations (only token annotations, for now)
1243#' for all matches in the `@collectedMatches` slot
Marc Kupietzc643a122025-07-18 18:18:36 +02001244#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
Marc Kupietz89f796e2025-07-19 09:05:06 +02001245#' data frame. The method uses the `matchID` from collected matches.
Marc Kupietza29f3d42025-07-18 10:14:43 +02001246#'
1247#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
1248#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
1249#' available for annotation parsing.
1250#'
1251#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
1252#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
1253#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
1254#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
1255#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
1256#' - `annotation_snippet`: original XML snippet from the annotation API
Marc Kupietze52b2952025-07-17 16:53:02 +02001257#'
1258#' @family corpus search functions
Marc Kupietz89f796e2025-07-19 09:05:06 +02001259#' @concept Annotations
Marc Kupietze52b2952025-07-17 16:53:02 +02001260#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001261#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
Marc Kupietze52b2952025-07-17 16:53:02 +02001262#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
Marc Kupietz93787d52025-09-03 13:33:25 +02001263#' @param overwrite logical; if TRUE, re-fetch and replace any existing
1264#' annotation columns. If FALSE (default), only add missing annotation layers
1265#' and preserve already fetched ones (e.g., keep POS/lemma from a previous
1266#' foundry while adding morph from another).
Marc Kupietze52b2952025-07-17 16:53:02 +02001267#' @param verbose print progress information if true
Marc Kupietz0af75932025-09-09 18:14:16 +02001268#' @return The updated `kqo` object with annotation columns
Marc Kupietz336c85d2025-07-24 13:52:03 +02001269#' @return The updated `kqo` object with annotation columns
Marc Kupietz89f796e2025-07-19 09:05:06 +02001270#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
1271#' in the `@collectedMatches` slot. Each column is a data frame
1272#' with `left`, `match`, and `right` columns containing list vectors of annotations
1273#' for the left context, matched tokens, and right context, respectively.
1274#' The original XML snippet for each match is also stored in `annotation_snippet`.
Marc Kupietze52b2952025-07-17 16:53:02 +02001275#'
1276#' @examples
1277#' \dontrun{
1278#'
1279#' # Fetch annotations for matches using Tree-Tagger foundry
Marc Kupietza29f3d42025-07-18 10:14:43 +02001280#' # Note: Authorization required for copyright-restricted corpora
Marc Kupietze52b2952025-07-17 16:53:02 +02001281#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001282#' auth() |>
1283#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001284#' fetchNext(maxFetch = 10) |>
1285#' fetchAnnotations()
Marc Kupietze52b2952025-07-17 16:53:02 +02001286#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001287#' # Access linguistic annotations for match i:
Marc Kupietz6aa5a0d2025-09-08 17:51:47 +02001288#' pos_tags <- q@collectedMatches$pos
1289#' # Data frame with left/match/right columns for POS tags
1290#' lemmas <- q@collectedMatches$lemma
1291#' # Data frame with left/match/right columns for lemmas
1292#' morphology <- q@collectedMatches$morph
1293#' # Data frame with left/match/right columns for morphological tags
1294#' atokens <- q@collectedMatches$atokens
1295#' # Data frame with left/match/right columns for annotation token text
Marc Kupietz0af75932025-09-09 18:14:16 +02001296#' # Original XML snippet for match i
1297#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]]
Marc Kupietzc643a122025-07-18 18:18:36 +02001298#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001299#' # Access specific components:
Marc Kupietz0af75932025-09-09 18:14:16 +02001300#' # POS tags for the matched tokens in match i
1301#' match_pos <- q@collectedMatches$pos$match[[i]]
1302#' # Lemmas for the left context in match i
1303#' left_lemmas <- q@collectedMatches$lemma$left[[i]]
1304#' # Token text for the right context in match i
1305#' right_tokens <- q@collectedMatches$atokens$right[[i]]
Marc Kupietza29f3d42025-07-18 10:14:43 +02001306#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001307#' # Use a different foundry (e.g., MarMoT)
Marc Kupietze52b2952025-07-17 16:53:02 +02001308#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001309#' auth() |>
1310#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001311#' fetchNext(maxFetch = 10) |>
Marc Kupietz89f796e2025-07-19 09:05:06 +02001312#' fetchAnnotations(foundry = "marmot")
1313#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
Marc Kupietze52b2952025-07-17 16:53:02 +02001314#' }
Marc Kupietze52b2952025-07-17 16:53:02 +02001315#' @export
Marc Kupietz0af75932025-09-09 18:14:16 +02001316setMethod("fetchAnnotations", "KorAPQuery", function(kqo,
1317 foundry = "tt",
1318 overwrite = FALSE,
1319 verbose = kqo@korapConnection@verbose) {
1320 if (is.null(kqo@collectedMatches) ||
1321 nrow(kqo@collectedMatches) == 0) {
1322 warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
1323 return(kqo)
1324 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001325
Marc Kupietze52b2952025-07-17 16:53:02 +02001326 df <- kqo@collectedMatches
1327 kco <- kqo@korapConnection
Marc Kupietza29f3d42025-07-18 10:14:43 +02001328
Marc Kupietza29f3d42025-07-18 10:14:43 +02001329 # Initialize annotation columns as data frames (like tokens field)
1330 # Create the structure more explicitly to avoid assignment issues
1331 nrows <- nrow(df)
Marc Kupietzc643a122025-07-18 18:18:36 +02001332
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001333 # Pre-compute the empty character vector list to avoid repeated computation
1334 empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
Marc Kupietz0af75932025-09-09 18:14:16 +02001335
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001336 # Helper function to create annotation data frame structure
1337 create_annotation_df <- function(empty_list) {
1338 data.frame(
1339 left = empty_list,
1340 match = empty_list,
1341 right = empty_list,
1342 stringsAsFactors = FALSE
1343 )
1344 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001345
Marc Kupietz93787d52025-09-03 13:33:25 +02001346 # Track which annotation columns already existed to decide overwrite behavior
1347 existing_types <- list(
1348 pos = "pos" %in% colnames(df),
1349 lemma = "lemma" %in% colnames(df),
1350 morph = "morph" %in% colnames(df),
1351 atokens = "atokens" %in% colnames(df),
1352 annotation_snippet = "annotation_snippet" %in% colnames(df)
1353 )
1354
1355 # Initialize annotation columns using the helper function
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001356 annotation_types <- c("pos", "lemma", "morph", "atokens")
1357 for (type in annotation_types) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001358 if (overwrite || !existing_types[[type]]) {
1359 df[[type]] <- create_annotation_df(empty_char_list)
1360 }
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001361 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001362
Marc Kupietz93787d52025-09-03 13:33:25 +02001363 if (overwrite || !existing_types$annotation_snippet) {
feldmuellera02f1932025-09-15 16:38:06 +02001364 df$annotation_snippet <- rep(NA_character_, nrows) # Fixed line
Marc Kupietz93787d52025-09-03 13:33:25 +02001365 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001366
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001367 # Initialize timing for ETA calculation
1368 start_time <- Sys.time()
1369 if (verbose) {
1370 log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
1371 }
1372
Marc Kupietz93787d52025-09-03 13:33:25 +02001373 # Helper to decide if existing annotation row is effectively empty
1374 is_empty_annotation_row <- function(ann_df, row_index) {
1375 if (is.null(ann_df) || nrow(ann_df) < row_index) return(TRUE)
1376 left_val <- ann_df$left[[row_index]]
1377 match_val <- ann_df$match[[row_index]]
1378 right_val <- ann_df$right[[row_index]]
1379 all(
1380 (is.null(left_val) || (length(left_val) == 0) || all(is.na(left_val))),
1381 (is.null(match_val) || (length(match_val) == 0) || all(is.na(match_val))),
1382 (is.null(right_val) || (length(right_val) == 0) || all(is.na(right_val)))
1383 )
1384 }
1385
Marc Kupietze52b2952025-07-17 16:53:02 +02001386 for (i in seq_len(nrow(df))) {
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001387 # ETA logging
1388 if (verbose && i > 1) {
1389 eta_info <- calculate_eta(i, nrows, start_time)
1390 log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
1391 }
Marc Kupietzff712a92025-07-18 09:07:23 +02001392 # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
1393 if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001394 # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
1395 # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
1396 # Extract document path and position, handling both regular and encrypted formats
Marc Kupietzc643a122025-07-18 18:18:36 +02001397
Marc Kupietza29f3d42025-07-18 10:14:43 +02001398 # More flexible regex to extract the document path with position and encryption
1399 # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
1400 # We need to capture the entire path including the encrypted suffix
1401 match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001402
Marc Kupietza29f3d42025-07-18 10:14:43 +02001403 if (match_result > 0) {
1404 # Extract the complete path including encryption (everything after "match-")
1405 doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
1406 # Convert the dash before position to slash, but keep everything after the position
1407 match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
Marc Kupietz25121302025-07-19 08:45:43 +02001408 # Use httr2 to construct URL safely
1409 base_url <- paste0(kco@apiUrl, "corpus/", match_path)
1410 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietza29f3d42025-07-18 10:14:43 +02001411 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001412 # If regex fails, fall back to the old method with httr2
1413 # Format numbers to avoid scientific notation
1414 match_start <- format(df$matchStart[i], scientific = FALSE)
1415 match_end <- format(df$matchEnd[i], scientific = FALSE)
1416 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1417 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001418 }
1419 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001420 # Fallback to the old method with httr2
1421 # Format numbers to avoid scientific notation
1422 match_start <- format(df$matchStart[i], scientific = FALSE)
1423 match_end <- format(df$matchEnd[i], scientific = FALSE)
1424 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1425 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001426 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001427
Marc Kupietze52b2952025-07-17 16:53:02 +02001428 tryCatch({
1429 res <- apiCall(kco, req)
Marc Kupietzc643a122025-07-18 18:18:36 +02001430
Marc Kupietze52b2952025-07-17 16:53:02 +02001431 if (!is.null(res)) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001432 # Store the raw annotation snippet (respect overwrite flag)
1433 if (overwrite || !existing_types$annotation_snippet || is.null(df$annotation_snippet[[i]]) || is.na(df$annotation_snippet[[i]])) {
1434 df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
1435 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001436
1437 # Parse XML annotations if snippet is available
1438 if (is.list(res) && "snippet" %in% names(res)) {
1439 parsed_annotations <- parse_xml_annotations_structured(res$snippet)
1440
1441 # Store the parsed linguistic data in data frame format (like tokens)
1442 # Use individual assignment to avoid data frame mismatch errors
1443 tryCatch({
1444 # Assign POS annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001445 if (overwrite || !existing_types$pos || is_empty_annotation_row(df$pos, i)) {
1446 df$pos$left[i] <- list(parsed_annotations$pos$left)
1447 df$pos$match[i] <- list(parsed_annotations$pos$match)
1448 df$pos$right[i] <- list(parsed_annotations$pos$right)
1449 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001450
Marc Kupietza29f3d42025-07-18 10:14:43 +02001451 # Assign lemma annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001452 if (overwrite || !existing_types$lemma || is_empty_annotation_row(df$lemma, i)) {
1453 df$lemma$left[i] <- list(parsed_annotations$lemma$left)
1454 df$lemma$match[i] <- list(parsed_annotations$lemma$match)
1455 df$lemma$right[i] <- list(parsed_annotations$lemma$right)
1456 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001457
Marc Kupietza29f3d42025-07-18 10:14:43 +02001458 # Assign morphology annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001459 if (overwrite || !existing_types$morph || is_empty_annotation_row(df$morph, i)) {
1460 df$morph$left[i] <- list(parsed_annotations$morph$left)
1461 df$morph$match[i] <- list(parsed_annotations$morph$match)
1462 df$morph$right[i] <- list(parsed_annotations$morph$right)
1463 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001464
Marc Kupietza29f3d42025-07-18 10:14:43 +02001465 # Assign token annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001466 if (overwrite || !existing_types$atokens || is_empty_annotation_row(df$atokens, i)) {
1467 df$atokens$left[i] <- list(parsed_annotations$atokens$left)
1468 df$atokens$match[i] <- list(parsed_annotations$atokens$match)
1469 df$atokens$right[i] <- list(parsed_annotations$atokens$right)
1470 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001471 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001472 # Set empty character vectors on assignment error using list assignment
Marc Kupietz93787d52025-09-03 13:33:25 +02001473 if (overwrite || !existing_types$pos) {
1474 df$pos$left[i] <<- list(character(0))
1475 df$pos$match[i] <<- list(character(0))
1476 df$pos$right[i] <<- list(character(0))
1477 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001478
Marc Kupietz93787d52025-09-03 13:33:25 +02001479 if (overwrite || !existing_types$lemma) {
1480 df$lemma$left[i] <<- list(character(0))
1481 df$lemma$match[i] <<- list(character(0))
1482 df$lemma$right[i] <<- list(character(0))
1483 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001484
Marc Kupietz93787d52025-09-03 13:33:25 +02001485 if (overwrite || !existing_types$morph) {
1486 df$morph$left[i] <<- list(character(0))
1487 df$morph$match[i] <<- list(character(0))
1488 df$morph$right[i] <<- list(character(0))
1489 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001490
Marc Kupietz93787d52025-09-03 13:33:25 +02001491 if (overwrite || !existing_types$atokens) {
1492 df$atokens$left[i] <<- list(character(0))
1493 df$atokens$match[i] <<- list(character(0))
1494 df$atokens$right[i] <<- list(character(0))
1495 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001496 })
Marc Kupietza29f3d42025-07-18 10:14:43 +02001497 } else {
1498 # No snippet available, store empty vectors
Marc Kupietz93787d52025-09-03 13:33:25 +02001499 if (overwrite || !existing_types$pos) {
1500 df$pos$left[i] <- list(character(0))
1501 df$pos$match[i] <- list(character(0))
1502 df$pos$right[i] <- list(character(0))
1503 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001504
Marc Kupietz93787d52025-09-03 13:33:25 +02001505 if (overwrite || !existing_types$lemma) {
1506 df$lemma$left[i] <- list(character(0))
1507 df$lemma$match[i] <- list(character(0))
1508 df$lemma$right[i] <- list(character(0))
1509 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001510
Marc Kupietz93787d52025-09-03 13:33:25 +02001511 if (overwrite || !existing_types$morph) {
1512 df$morph$left[i] <- list(character(0))
1513 df$morph$match[i] <- list(character(0))
1514 df$morph$right[i] <- list(character(0))
1515 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001516
Marc Kupietz93787d52025-09-03 13:33:25 +02001517 if (overwrite || !existing_types$atokens) {
1518 df$atokens$left[i] <- list(character(0))
1519 df$atokens$match[i] <- list(character(0))
1520 df$atokens$right[i] <- list(character(0))
1521 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001522 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001523 } else {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001524 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001525 if (overwrite || !existing_types$pos) {
1526 df$pos$left[i] <- list(NA)
1527 df$pos$match[i] <- list(NA)
1528 df$pos$right[i] <- list(NA)
1529 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001530
Marc Kupietz93787d52025-09-03 13:33:25 +02001531 if (overwrite || !existing_types$lemma) {
1532 df$lemma$left[i] <- list(NA)
1533 df$lemma$match[i] <- list(NA)
1534 df$lemma$right[i] <- list(NA)
1535 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001536
Marc Kupietz93787d52025-09-03 13:33:25 +02001537 if (overwrite || !existing_types$morph) {
1538 df$morph$left[i] <- list(NA)
1539 df$morph$match[i] <- list(NA)
1540 df$morph$right[i] <- list(NA)
1541 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001542
Marc Kupietz93787d52025-09-03 13:33:25 +02001543 if (overwrite || !existing_types$atokens) {
1544 df$atokens$left[i] <- list(NA)
1545 df$atokens$match[i] <- list(NA)
1546 df$atokens$right[i] <- list(NA)
1547 }
1548 if (overwrite || !existing_types$annotation_snippet) {
1549 df$annotation_snippet[[i]] <- NA
1550 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001551 }
1552 }, error = function(e) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001553 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001554 if (overwrite || !existing_types$pos) {
1555 df$pos$left[i] <- list(NA)
1556 df$pos$match[i] <- list(NA)
1557 df$pos$right[i] <- list(NA)
1558 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001559
Marc Kupietz93787d52025-09-03 13:33:25 +02001560 if (overwrite || !existing_types$lemma) {
1561 df$lemma$left[i] <- list(NA)
1562 df$lemma$match[i] <- list(NA)
1563 df$lemma$right[i] <- list(NA)
1564 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001565
Marc Kupietz93787d52025-09-03 13:33:25 +02001566 if (overwrite || !existing_types$morph) {
1567 df$morph$left[i] <- list(NA)
1568 df$morph$match[i] <- list(NA)
1569 df$morph$right[i] <- list(NA)
1570 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001571
Marc Kupietz93787d52025-09-03 13:33:25 +02001572 if (overwrite || !existing_types$atokens) {
1573 df$atokens$left[i] <- list(NA)
1574 df$atokens$match[i] <- list(NA)
1575 df$atokens$right[i] <- list(NA)
1576 }
1577 if (overwrite || !existing_types$annotation_snippet) {
1578 df$annotation_snippet[[i]] <- NA
1579 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001580 })
1581 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001582
Marc Kupietza29f3d42025-07-18 10:14:43 +02001583 # Validate data frame structure before assignment
1584 if (nrow(df) != nrow(kqo@collectedMatches)) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001585 }
1586
1587 # Update the collectedMatches with annotation data
1588 tryCatch({
1589 kqo@collectedMatches <- df
1590 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001591 # Try a safer approach: add columns individually
1592 tryCatch({
1593 kqo@collectedMatches$pos <- df$pos
Marc Kupietzc643a122025-07-18 18:18:36 +02001594 kqo@collectedMatches$lemma <- df$lemma
Marc Kupietza29f3d42025-07-18 10:14:43 +02001595 kqo@collectedMatches$morph <- df$morph
1596 kqo@collectedMatches$atokens <- df$atokens
1597 kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
1598 }, error = function(col_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001599 warning("Failed to add annotation data to collectedMatches")
1600 })
1601 })
1602
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001603 if (verbose) {
1604 elapsed_time <- Sys.time() - start_time
1605 log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
1606 }
1607
Marc Kupietze52b2952025-07-17 16:53:02 +02001608 return(kqo)
1609})
1610
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001611#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +02001612#'
Marc Kupietz67edcb52021-09-20 21:54:24 +02001613#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001614#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +02001615#' confidence intervals of one ore multiple search terms across one or multiple
1616#' virtual corpora.
1617#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02001618#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +02001619#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +02001620#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +02001621#' \dontrun{
1622#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001623#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +02001624#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +01001625#' }
Marc Kupietz3f575282019-10-04 14:46:04 +02001626#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001627# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +01001628#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001629#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
1630#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +02001631#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
1632#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001633#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +02001634#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001635#'
1636#' @return A tibble, with each row containing the following result columns for query and vc combinations:
1637#' - **query**: the query string used for the frequency analysis.
1638#' - **totalResults**: absolute frequency of query matches in the vc.
1639#' - **vc**: virtual corpus used for the query.
1640#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
1641#' - **total**: total number of words in vc.
1642#' - **f**: relative frequency of query matches in the vc.
1643#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
1644#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
1645
Marc Kupietzd8851222025-05-01 10:57:19 +02001646setMethod(
1647 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +01001648 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +02001649 (if (as.alternatives) {
1650 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietzea34b812025-06-25 15:49:00 +02001651 group_by(vc) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +01001652 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +02001653 } else {
1654 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
1655 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietzea34b812025-06-25 15:49:00 +02001656 }) |>
Marc Kupietz0c29cea2019-10-09 08:44:36 +02001657 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +02001658 }
1659)
Marc Kupietz3f575282019-10-04 14:46:04 +02001660
Marc Kupietz38a9d682024-12-06 16:17:09 +01001661#' buildWebUIRequestUrlFromString
1662#'
1663#' @rdname KorAPQuery-class
1664#' @importFrom urltools url_encode
1665#' @export
1666buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +02001667 query,
1668 vc = "",
1669 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001670 if ("KorAPConnection" %in% class(KorAPUrl)) {
1671 KorAPUrl <- KorAPUrl@KorAPUrl
1672 }
1673
1674 request <-
1675 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +02001676 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001677 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +02001678 ifelse(vc != "",
1679 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
1680 ""
1681 ),
1682 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001683 ql
1684 )
1685 paste0(KorAPUrl, request)
1686}
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001687
1688#' buildWebUIRequestUrl
1689#'
1690#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +01001691#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001692#' @export
1693buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +02001694 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001695 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +02001696 } else {
1697 httr2::url_parse(KorAPUrl)$query$q
1698 },
Marc Kupietzf9129592025-01-26 19:17:54 +01001699 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001700 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +01001701 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001702 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001703}
1704
Marc Kupietzd8851222025-05-01 10:57:19 +02001705#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +02001706#' @rdname KorAPQuery-class
1707#' @param x KorAPQuery object
1708#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001709#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +02001710#' @export
1711format.KorAPQuery <- function(x, ...) {
1712 cat("<KorAPQuery>\n")
1713 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +02001714 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001715 cat(" Query: ", param$q, "\n")
1716 if (!is.null(param$cq) && param$cq != "") {
1717 cat(" Virtual corpus: ", param$cq, "\n")
1718 }
1719 if (!is.null(q@collectedMatches)) {
1720 cat("==============================================================================================================", "\n")
1721 print(summary(q@collectedMatches))
1722 cat("==============================================================================================================", "\n")
1723 }
1724 cat(" Total results: ", q@totalResults, "\n")
1725 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietza29f3d42025-07-18 10:14:43 +02001726 if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
1727 successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
1728 parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
1729 cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
1730 if (parsed_annotations > 0) {
1731 cat(" (", parsed_annotations, " with parsed linguistic data)")
1732 }
1733 cat("\n")
Marc Kupietze52b2952025-07-17 16:53:02 +02001734 }
Marc Kupietz62da2b52019-09-12 17:43:34 +02001735}
1736
Marc Kupietze95108e2019-09-18 13:23:58 +02001737#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +02001738#'
Marc Kupietze95108e2019-09-18 13:23:58 +02001739#' @rdname KorAPQuery-class
1740#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +02001741#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +02001742setMethod("show", "KorAPQuery", function(object) {
1743 format(object)
Marc Kupietzc643a122025-07-18 18:18:36 +02001744 invisible(object)
Marc Kupietze95108e2019-09-18 13:23:58 +02001745})