blob: 765e4bf9fb0b5d6ac6a5edccfc7924be5289684d [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
Marc Kupietza29f3d42025-07-18 10:14:43 +020024 "hasMoreMatches"
Marc Kupietze95108e2019-09-18 13:23:58 +020025))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
Marc Kupietza29f3d42025-07-18 10:14:43 +020050 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
Marc Kupietzd8851222025-05-01 10:57:19 +020051 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietz0af75932025-09-09 18:14:16 +020071setGeneric(
72 "fetchAnnotations",
73 function(kqo,
74 foundry = "tt",
75 overwrite = FALSE,
76 verbose = kqo@korapConnection@verbose) standardGeneric("fetchAnnotations")
77)
Marc Kupietzd8851222025-05-01 10:57:19 +020078setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020079
80maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020081
Marc Kupietz4de53ec2019-10-04 09:12:00 +020082## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010083utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020084
Marc Kupietza8c40f42025-06-24 15:49:52 +020085#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020087#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020088#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020089#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020090#' @aliases corpusQuery
91#'
92#' @importFrom urltools url_encode
93#' @importFrom purrr pmap
Marc Kupietzea34b812025-06-25 15:49:00 +020094#' @importFrom dplyr bind_rows group_by
Marc Kupietzdbd431a2021-08-29 12:17:45 +020095#'
Marc Kupietz617266d2025-02-27 10:43:07 +010096#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020097#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020098#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020099#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +0200100#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
101#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
102#' your RKorAPClient application as explained in the
103#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
104#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
105#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200106#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200107#' @param fields character vector specifying which metadata fields to retrieve for each match.
108#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
109#' \describe{
110#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
111#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
112#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
113#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
114#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
115#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
116#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
117#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
118#' }
119#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
120#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100121#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200122#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200123#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200124#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200125#' @param context string that specifies the size of the left and the right context returned in `snippet`
126#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
127#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
128#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
129#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
130#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200131#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200132#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
133#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200134#'
135#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200136#' \dontrun{
137#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200138#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200139#' KorAPConnection() |>
140#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200141#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200142#'
143#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200144#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200145#' corpusQuery("Ameisenplage",
146#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
147#' results <- fetchAll(query)
148#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100149#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200150#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200151#' \dontrun{
152#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200153#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
154#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200155#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200156#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200157#' corpusQuery(
158#' KorAPUrl =
159#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
160#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200161#' }
162#'
163#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200164#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200165#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200166#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200167#' {
168#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200169#' } |>
170#' corpusQuery("Ameisenplage") |>
171#' fetchAll() |>
172#' slot("collectedMatches") |>
173#' mutate(year = lubridate::year(pubDate)) |>
174#' dplyr::select(year) |>
175#' group_by(year) |>
176#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200177#' mutate(Freq = mapply(function(f, y) {
178#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200179#' }, Count, year)) |>
180#' dplyr::select(-Count) |>
181#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200182#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100183#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200184#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200185#'
186#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200187#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200188#'
189#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200190setMethod(
191 "corpusQuery", "KorAPConnection",
192 function(kco,
193 query = if (missing(KorAPUrl)) {
194 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
195 } else {
196 httr2::url_parse(KorAPUrl)$query$q
197 },
198 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
199 KorAPUrl,
200 metadataOnly = TRUE,
201 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
202 fields = c(
203 "corpusSigle",
204 "textSigle",
205 "pubDate",
206 "pubPlace",
207 "availability",
208 "textClass",
209 "snippet",
210 "tokens"
211 ),
212 accessRewriteFatal = TRUE,
213 verbose = kco@verbose,
214 expand = length(vc) != length(query),
215 as.df = FALSE,
216 context = NULL) {
217 if (length(query) > 1 || length(vc) > 1) {
218 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200219
220 # Initialize timing variables for ETA calculation
221 total_queries <- nrow(grid)
222 current_query <- 0
223 start_time <- Sys.time()
224
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200225 results <- purrr::pmap(grid, function(query, vc, ...) {
226 current_query <<- current_query + 1
227
228 # Execute the single query directly (avoiding recursive call)
229 contentFields <- c("snippet", "tokens")
230 query_fields <- fields
231 if (metadataOnly) {
232 query_fields <- query_fields[!query_fields %in% contentFields]
233 }
234 if (!"textSigle" %in% query_fields) {
235 query_fields <- c(query_fields, "textSigle")
236 }
237 request <-
238 paste0(
239 "?q=",
240 url_encode(enc2utf8(query)),
241 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
242 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
243 ifelse(!metadataOnly, "&show-tokens=true", ""),
244 "&ql=", ql
245 )
246 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
247 requestUrl <- paste0(
248 kco@apiUrl,
249 "search",
250 request,
251 "&fields=",
252 paste(query_fields, collapse = ","),
253 if (metadataOnly) "&access-rewrite-disabled=true" else ""
254 )
255
256 # Show individual query progress
257 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
258 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
259 if (is.null(res)) {
260 log_info(verbose, ": API call failed\n")
261 totalResults <- 0
262 } else {
263 totalResults <- as.integer(res$meta$totalResults)
264 log_info(verbose, ": ", totalResults, " hits")
265 if (!is.null(res$meta$cached)) {
266 log_info(verbose, " [cached]")
267 } else if (!is.null(res$meta$benchmark)) {
268 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
269 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
270 formatted_time <- paste0(round(time_value, 2), "s")
271 log_info(verbose, ", took ", formatted_time)
272 } else {
273 log_info(verbose, ", took ", res$meta$benchmark)
274 }
275 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200276
277 # Calculate and display ETA information on the same line if verbose and we have more than one query
278 if (verbose && total_queries > 1) {
279 eta_info <- calculate_eta(current_query, total_queries, start_time)
280 if (eta_info != "") {
281 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
282 avg_time_per_query <- elapsed_time / current_query
283
284 # Add ETA info to the same line - remove the leading ". " for cleaner formatting
285 clean_eta_info <- sub("^\\. ", ". ", eta_info)
286 log_info(verbose, clean_eta_info)
287 }
288 }
289
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200290 log_info(verbose, "\n")
291 }
292
293 result <- data.frame(
294 query = query,
295 totalResults = totalResults,
296 vc = vc,
297 webUIRequestUrl = webUIRequestUrl,
298 stringsAsFactors = FALSE
299 )
300
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200301 return(result)
302 })
303
304 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200305 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200306 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100307 if (metadataOnly) {
308 fields <- fields[!fields %in% contentFields]
309 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100310 if (!"textSigle" %in% fields) {
311 fields <- c(fields, "textSigle")
312 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100313 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200314 paste0(
315 "?q=",
316 url_encode(enc2utf8(query)),
317 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
318 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
319 ifelse(!metadataOnly, "&show-tokens=true", ""),
320 "&ql=", ql
321 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100322 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
323 requestUrl <- paste0(
324 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200325 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100326 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200327 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100328 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200329 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100330 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200331 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
332 sep =
333 ""
334 )
335 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100336 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100337 message("API call failed.")
338 totalResults <- 0
339 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200340 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200341 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200342 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200343 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200344 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200345 # Round the benchmark time to 2 decimal places for better readability.
346 # Be robust to locales using comma as decimal separator (e.g., "0,12s").
Marc Kupietz7638ca42025-05-25 13:18:16 +0200347 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200348 bench_str <- sub("s$", "", res$meta$benchmark)
349 bench_num <- suppressWarnings(as.numeric(gsub(",", ".", bench_str)))
350 if (!is.na(bench_num)) {
351 formatted_time <- paste0(round(bench_num, 2), "s")
352 } else {
353 formatted_time <- res$meta$benchmark
354 }
Marc Kupietz7638ca42025-05-25 13:18:16 +0200355 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
356 } else {
357 # Fallback if the format is different than expected
358 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
359 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200360 } else {
361 log_info(verbose, "\n")
362 }
Marc Kupietza4675722022-02-23 23:55:15 +0100363 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200364 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100365 data.frame(
366 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100367 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100368 vc = vc,
369 webUIRequestUrl = webUIRequestUrl,
370 stringsAsFactors = FALSE
371 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200372 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100373 KorAPQuery(
374 korapConnection = kco,
375 nextStartIndex = 0,
376 fields = fields,
377 requestUrl = requestUrl,
378 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100379 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100380 vc = vc,
381 apiResponse = res,
382 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100383 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100384 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200385 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100386 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200387 }
388)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200389
Marc Kupietz05a60792024-12-07 16:23:31 +0100390#' @importFrom purrr map
391repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200392 if (is.list(x)) {
393 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100394 paste(.x, collapse = " ")
395 } else {
396 .x
397 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200398 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100399 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200400 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100401}
402
Marc Kupietz62da2b52019-09-12 17:43:34 +0200403#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200404#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200405#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200406#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200407#' @family corpus search functions
408#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200409#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200410#' @param offset start offset for query results to fetch
411#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200412#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200413#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
414#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200415#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100416#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200417#' \dontrun{
418#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200419#' q <- KorAPConnection() |>
420#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200421#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100422#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100423#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100424#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200425#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200426#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200427#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200428#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200429#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100430#' @importFrom tibble enframe add_column
431#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200432#' @importFrom tidyr unnest unchop pivot_wider
433#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200434#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200435setMethod("fetchNext", "KorAPQuery", function(kqo,
436 offset = kqo@nextStartIndex,
437 maxFetch = maxResultsPerPage,
438 verbose = kqo@korapConnection@verbose,
439 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100440 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200441 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100442
Marc Kupietze95108e2019-09-18 13:23:58 +0200443 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
444 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200445 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200446 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200447 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200448 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200449
Marc Kupietz24799fd2025-06-25 14:15:36 +0200450 # Track start time for ETA calculation
451 start_time <- Sys.time()
452
Marc Kupietz623d7122025-05-25 12:46:12 +0200453 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200454 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200455 # Calculate how many pages we need to fetch based on maxFetch
456 total_pages_to_fetch <- if (!is.na(maxFetch)) {
457 # Either limited by maxFetch or total results, whichever is smaller
458 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
459 } else {
460 # All pages
461 ceiling(kqo@totalResults / maxResultsPerPage)
462 }
463
464 # Generate randomized page indices (0-based for API)
465 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
466 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200467 }
468
Marc Kupietzd8851222025-05-01 10:57:19 +0200469 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200470 collectedMatches <- data.frame()
471 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200472
473 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
474 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
475 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
476 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
477
478 # For sequential fetches, keep track of which global page we're on
479 # This is important for correctly showing page numbers in subsequent fetchNext calls
480 page_count_start <- current_page_number
481
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200482 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200483 # Determine which page to fetch next
484 if (randomizePageOrder) {
485 # In randomized mode, get the page from our randomized list using the page_index
486 # Make sure we don't exceed the array bounds
487 if (page_index > length(pages)) {
488 break # No more pages to fetch in randomized mode
489 }
490 current_offset_page <- pages[page_index]
491 # For display purposes in randomized mode, show which page out of the total we're fetching
492 display_page_number <- page_index
493 } else {
494 # In sequential mode, use the current_page_number to calculate the offset
495 current_offset_page <- (current_page_number - 1)
496 display_page_number <- current_page_number
497 }
498
499 # Calculate the actual offset in tokens
500 currentOffset <- current_offset_page * maxResultsPerPage
501
Marc Kupietzef0e9392025-06-18 12:21:49 +0200502 # Build the query with the appropriate count and offset using httr2
503 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200504
Marc Kupietzef0e9392025-06-18 12:21:49 +0200505 # Parse existing URL to preserve all query parameters
506 parsed_url <- httr2::url_parse(kqo@requestUrl)
507 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200508
Marc Kupietzef0e9392025-06-18 12:21:49 +0200509 # Add/update count and offset parameters
510 existing_query$count <- count_param
511 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200512
Marc Kupietzef0e9392025-06-18 12:21:49 +0200513 # Rebuild the URL with all parameters
514 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200515 res <- apiCall(kqo@korapConnection, query)
516 if (length(res$matches) == 0) {
517 break
518 }
519
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200520 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100521 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100522 currentMatches <- res$matches$fields %>%
523 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
524 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200525 tidyr::unnest(cols = value) %>%
526 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200527 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200528 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200529 currentMatches$snippet <- res$matches$snippet
530 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100531 if ("tokens" %in% colnames(res$matches)) {
532 currentMatches$tokens <- res$matches$tokens
533 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200534 } else {
535 currentMatches <- res$matches
536 }
537
Marc Kupietze95108e2019-09-18 13:23:58 +0200538 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200539 if (!field %in% colnames(currentMatches)) {
540 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200541 }
542 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100543 currentMatches <- currentMatches %>%
544 select(kqo@fields) %>%
545 mutate(
Marc Kupietzff712a92025-07-18 09:07:23 +0200546 matchID = res$matches$matchID,
Marc Kupietz0447da02025-01-08 20:51:09 +0100547 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100548 matchStart = as.integer(stringr::word(tmp_positions, 1)),
549 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
550 ) %>%
551 select(-tmp_positions)
552
Marc Kupietz62da2b52019-09-12 17:43:34 +0200553 if (!is.list(collectedMatches)) {
554 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200555 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200556 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200557 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200558
Marc Kupietz623d7122025-05-25 12:46:12 +0200559 # Get the actual items per page from the API response
560 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200561
Marc Kupietz623d7122025-05-25 12:46:12 +0200562 # Calculate total pages consistently using fixed maxResultsPerPage
563 # This ensures consistent page counting across the function
564 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
565
Marc Kupietz24799fd2025-06-25 14:15:36 +0200566 # Calculate ETA using the centralized function from logging.R
567 current_page <- if (randomizePageOrder) page_index else display_page_number
568 total_pages_to_fetch <- if (!is.na(maxFetch)) {
569 # Account for offset - we can only fetch from the remaining results after offset
570 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
571 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
572 } else {
573 total_pages
574 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200575
Marc Kupietz24799fd2025-06-25 14:15:36 +0200576 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz365660e2025-06-25 15:09:55 +0200577
Marc Kupietz24799fd2025-06-25 14:15:36 +0200578 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200579 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200580 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200581 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200582 }
583
Marc Kupietz623d7122025-05-25 12:46:12 +0200584 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200585
Marc Kupietz623d7122025-05-25 12:46:12 +0200586 # For global page tracking, calculate the absolute page number
587 actual_display_number <- if (randomizePageOrder) {
588 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
589 } else {
590 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
591 current_offset_page + 1
592 }
593
594 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
595 # based on the current batch being fetched
596
597 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
598 # Simply count from 1 within the current batch
599
600 # The relative page number is simply the current position in this batch
601 if (randomizePageOrder) {
602 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
603 } else {
604 relative_page_number <- display_page_number - (page_count_start - 1)
605 }
606
607 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200608 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200609 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200610 # Account for offset - we can only fetch from the remaining results after offset
611 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
612 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200613 } else {
614 # Otherwise fetch all remaining pages
615 total_pages - page_count_start + 1
616 }
617
618 # The total pages to be shown in this batch
619 batch_total_pages <- pages_in_this_batch
620
621 page_display <- paste0(
622 "Retrieved page ",
623 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
624 "/",
625 sprintf("%d", batch_total_pages)
626 )
627
628 # If randomized, also show which actual page we fetched
629 if (randomizePageOrder) {
630 # Determine the maximum width needed for page numbers (based on total pages)
631 # This ensures consistent alignment
632 max_page_width <- nchar(as.character(total_pages))
633 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200634 page_display <- paste0(
635 page_display,
636 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
637 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200638 }
639 # Always show the absolute page number and total pages (for clarity)
640 else {
641 # Show the absolute page number (out of total possible pages)
642 page_display <- paste0(page_display, sprintf(
643 " (page %d of %d total)",
644 actual_display_number, total_pages
645 ))
646 }
647
648 # Add caching or timing information
649 if (!is.null(res$meta$cached)) {
650 page_display <- paste0(page_display, " [cached]")
651 } else {
652 page_display <- paste0(
653 page_display,
654 " in ",
655 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200656 "s",
657 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200658 )
659 }
660
661 log_info(verbose, paste0(page_display, "\n"))
662
663 # Increment the appropriate counter based on mode
664 if (randomizePageOrder) {
665 page_index <- page_index + 1
666 } else {
667 current_page_number <- current_page_number + 1
668 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200669 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200670 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200671 break
672 }
673 }
Marc Kupietz68170952021-06-30 09:37:21 +0200674 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200675 KorAPQuery(
676 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200677 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200678 fields = kqo@fields,
679 requestUrl = kqo@requestUrl,
680 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200681 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200682 vc = kqo@vc,
683 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200684 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200685 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200686 collectedMatches = collectedMatches
687 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200688})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200689
690#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200691#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200692#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100693#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200694#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200695#' @param kqo object obtained from [corpusQuery()]
696#' @param verbose print progress information if true
697#' @param ... further arguments passed to [fetchNext()]
698#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200699#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200700#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200701#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200702#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
703#' q <- KorAPConnection() |>
704#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200705#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200706#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200707#'
708#' # Fetch also all KWICs
709#' q <- KorAPConnection() |> auth() |>
710#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
711#' fetchAll()
712#' q@collectedMatches
713#'
714#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
715#' q <- KorAPConnection() |>
716#' corpusQuery("<base/s=t>", # this matches each text once
717#' vc = "pubDate in 1958-03-12",
718#' fields = c("textSigle", "title"),
719#' ) |>
720#' fetchAll()
721#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100722#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200723#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200724#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200725#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200726setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
727 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200728})
729
730#' Fetches the remaining results of a KorAP query.
731#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200732#' @param kqo object obtained from [corpusQuery()]
733#' @param verbose print progress information if true
734#' @param ... further arguments passed to [fetchNext()]
735#' @return The updated `kqo` object with remaining results in `@collectedMatches`
736#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200737#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200738#' \dontrun{
739#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200740#' q <- KorAPConnection() |>
741#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200742#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200743#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100744#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200745#'
746#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200747#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200748setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
749 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200750})
751
Marc Kupietza29f3d42025-07-18 10:14:43 +0200752#'
753#' Parse XML annotations into linguistic layers
754#'
755#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
756#' from XML annotation snippets returned by the KorAP API.
757#'
758#' @param xml_snippet XML string containing annotation data
759#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
760#' @keywords internal
761parse_xml_annotations <- function(xml_snippet) {
762 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
763 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
764 }
765
766 # Extract content within <span class="match">...</span> using a more robust approach
767 if (grepl('<span class="match">', xml_snippet)) {
768 # Find the start of match span
769 start_pos <- regexpr('<span class="match">', xml_snippet)
770 if (start_pos > 0) {
771 # Find the end by counting nested spans
772 content_start <- start_pos + attr(start_pos, "match.length")
773 remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
774
775 # Simple approach: extract everything until we hit context-right or end
776 if (grepl('<span class="context-right">', remaining)) {
777 content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining)
778 } else {
779 # Find the closing </span> that matches our opening span
780 # For now, use a simpler approach - take everything until the last </span> sequence
781 content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining)
782 }
783 } else {
784 content_to_parse <- xml_snippet
785 }
786 } else {
787 content_to_parse <- xml_snippet
788 }
789
790 # Initialize result vectors
791 tokens <- character(0)
792 lemmas <- character(0)
793 pos_tags <- character(0)
794 morph_tags <- character(0)
795
796 # Split the content by </span> and process each meaningful part
797 parts <- unlist(strsplit(content_to_parse, '</span>'))
798
799 for (part in parts) {
800 part <- trimws(part)
801 if (nchar(part) == 0) next
802
803 # Look for parts that have title attributes and end with text
804 if (grepl('<span[^>]*title=', part)) {
805 # Extract the text content (everything after the last >)
806 text_content <- gsub('.*>([^<]*)$', '\\1', part)
807 text_content <- trimws(text_content)
808
809 if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
810 tokens <- c(tokens, text_content)
811
812 # Extract all title attributes from this part
813 title_pattern <- 'title="([^"]*)"'
814 title_matches <- gregexpr(title_pattern, part)
815
816 lemma <- NA
817 pos_tag <- NA
818 morph_tag <- NA
819
820 if (title_matches[[1]][1] != -1) {
821 all_titles <- regmatches(part, title_matches)[[1]]
822 for (title_match in all_titles) {
823 title_content <- gsub(title_pattern, '\\1', title_match)
Marc Kupietzc643a122025-07-18 18:18:36 +0200824
Marc Kupietza29f3d42025-07-18 10:14:43 +0200825 # Split by spaces and process each annotation
826 annotations <- unlist(strsplit(title_content, "\\s+"))
827 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200828 if (grepl('^[^/]+/l:', annotation)) {
829 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
830 } else if (grepl('^[^/]+/p:', annotation)) {
831 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
832 } else if (grepl('^[^/]+/m:', annotation)) {
833 morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200834 }
835 }
836 }
837 }
838
839 lemmas <- c(lemmas, lemma)
840 pos_tags <- c(pos_tags, pos_tag)
841 morph_tags <- c(morph_tags, morph_tag)
842 }
843 }
844 }
845
846 # If no tokens found with the splitting approach, try a different method
847 if (length(tokens) == 0) {
848 # Look for the innermost spans that contain actual text
849 innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
850 innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
851
852 if (innermost_matches[[1]][1] != -1) {
853 matches <- regmatches(content_to_parse, innermost_matches)[[1]]
854
855 for (match in matches) {
856 title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
857 text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
858 text <- trimws(text)
859
860 if (nchar(text) > 0) {
861 tokens <- c(tokens, text)
862
863 # Parse space-separated annotations in title
864 lemma <- NA
865 pos_tag <- NA
866 morph_tag <- NA
Marc Kupietzc643a122025-07-18 18:18:36 +0200867
Marc Kupietza29f3d42025-07-18 10:14:43 +0200868 annotations <- unlist(strsplit(title, "\\s+"))
869 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200870 if (grepl('^[^/]+/l:', annotation)) {
871 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
872 } else if (grepl('^[^/]+/p:', annotation)) {
873 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
874 } else if (grepl('^[^/]+/m:', annotation)) {
875 morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200876 }
877 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200878
Marc Kupietza29f3d42025-07-18 10:14:43 +0200879 lemmas <- c(lemmas, lemma)
880 pos_tags <- c(pos_tags, pos_tag)
881 morph_tags <- c(morph_tags, morph_tag)
882 }
883 }
884 }
885 }
886
887 # Ensure all vectors have the same length
888 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
889 if (max_length > 0) {
890 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
891 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
892 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
893 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
894 }
895
896 return(list(
897 token = tokens,
898 lemma = lemmas,
899 pos = pos_tags,
900 morph = morph_tags
901 ))
902}
903
904#'
905#' Parse XML annotations into linguistic layers with left/match/right structure
906#'
907#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
908#' from XML annotation snippets returned by the KorAP API, split into left context,
909#' match, and right context sections like the tokens field.
910#'
911#' @param xml_snippet XML string containing annotation data
912#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
913#' @keywords internal
914parse_xml_annotations_structured <- function(xml_snippet) {
915 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
916 empty_result <- list(left = character(0), match = character(0), right = character(0))
917 return(list(
918 atokens = empty_result,
919 lemma = empty_result,
920 pos = empty_result,
921 morph = empty_result
922 ))
923 }
924
925 # Helper function to extract annotations from a span section
926 extract_annotations_from_section <- function(section_content) {
Marc Kupietz560b5912025-09-01 17:36:13 +0200927 # Remove any <mark>...</mark> tags that may interrupt token boundaries
928 section_no_marks <- gsub('</?mark[^>]*>', '', section_content, perl = TRUE)
929 # Normalize separators between adjacent top-level spans so splitting is robust.
930 # Replace any punctuation/entity/space run between one-or-more closing spans and the next opening span
931 # with a single space, preserving all closing spans.
932 section_norm <- gsub('((?:</span>)+)[[:space:]]*(?:&[^;]+;|[[:punct:]]|[[:space:]])*[[:space:]]*(<span)', '\\1 \\2', section_no_marks, perl = TRUE)
933 # Handle both spaced tokens and nested single tokens by scanning innermost spans with direct text
Marc Kupietza29f3d42025-07-18 10:14:43 +0200934 tokens <- character(0)
935 lemmas <- character(0)
936 pos_tags <- character(0)
937 morph_tags <- character(0)
Marc Kupietz89f796e2025-07-19 09:05:06 +0200938
Marc Kupietz560b5912025-09-01 17:36:13 +0200939 pat_token <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
940 mm <- gregexpr(pat_token, section_norm, perl = TRUE)
941 if (mm[[1]][1] != -1) {
942 starts <- mm[[1]]
943 lens <- attr(mm[[1]], 'match.length')
944 for (k in seq_along(starts)) {
945 s <- starts[k]
946 e <- s + lens[k] - 1
947 fragment <- substr(section_norm, s, e)
948 text_content <- sub(pat_token, '\\2', fragment, perl = TRUE)
949 text_content <- trimws(text_content)
950 title_content <- sub(pat_token, '\\1', fragment, perl = TRUE)
Marc Kupietz89f796e2025-07-19 09:05:06 +0200951
Marc Kupietz560b5912025-09-01 17:36:13 +0200952 if (nchar(text_content) == 0) next
Marc Kupietz89f796e2025-07-19 09:05:06 +0200953
Marc Kupietz560b5912025-09-01 17:36:13 +0200954 lemma <- NA
955 pos_tag <- NA
956 morph_features <- character(0)
Marc Kupietz89f796e2025-07-19 09:05:06 +0200957
Marc Kupietz560b5912025-09-01 17:36:13 +0200958 # parse inner title
959 ann <- unlist(strsplit(title_content, "[[:space:]]+"))
960 for (a in ann) {
961 if (grepl('/l:', a)) {
962 lemma <- sub('.*?/l:(.*)$', '\\1', a, perl = TRUE)
963 } else if (grepl('/p:', a)) {
964 pos_tag <- sub('.*?/p:(.*)$', '\\1', a, perl = TRUE)
965 } else if (grepl('/m:', a)) {
966 morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', a, perl = TRUE))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200967 }
968 }
Marc Kupietz560b5912025-09-01 17:36:13 +0200969
970 # If lemma missing, look back in nearby context for the nearest title containing l:
971 if (is.na(lemma) || nchar(lemma) == 0) {
972 ctx_start <- max(1, s - 500)
973 context <- substr(section_norm, ctx_start, s - 1)
974 tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
975 if (tmm[[1]][1] != -1) {
976 ctx_titles <- regmatches(context, tmm)[[1]]
977 for (ti in rev(ctx_titles)) {
978 cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
979 if (grepl('/l:', cont)) {
980 lemma <- sub('.*?/l:([^ ]+).*', '\\1', cont, perl = TRUE)
981 break
982 }
983 }
984 }
985 }
986
987 # If POS missing, keep NA; morphological features may also appear in outer titles
988 if (length(morph_features) == 0) {
989 ctx_start <- max(1, s - 500)
990 context <- substr(section_norm, ctx_start, s - 1)
991 tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
992 if (tmm[[1]][1] != -1) {
993 ctx_titles <- regmatches(context, tmm)[[1]]
994 for (ti in rev(ctx_titles)) {
995 cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
996 if (grepl('/m:', cont)) {
997 mparts <- unlist(strsplit(cont, "[[:space:]]+"))
998 for (mp in mparts) if (grepl('/m:', mp)) morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', mp, perl = TRUE))
999 break
1000 }
1001 }
1002 }
1003 }
1004
1005 tokens <- c(tokens, text_content)
1006 lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
1007 pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
1008 morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA)
Marc Kupietza29f3d42025-07-18 10:14:43 +02001009 }
1010 }
1011
1012 # Ensure all vectors have the same length
1013 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
1014 if (max_length > 0) {
1015 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
1016 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
1017 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
1018 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
1019 }
1020
1021 return(list(
1022 tokens = tokens,
1023 lemmas = lemmas,
1024 pos_tags = pos_tags,
1025 morph_tags = morph_tags
1026 ))
1027 }
1028
1029 # Split the XML into three parts: left context, match content, and right context
1030 # The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
Marc Kupietzc643a122025-07-18 18:18:36 +02001031
Marc Kupietza29f3d42025-07-18 10:14:43 +02001032 # First extract the content within the match span using DOTALL modifier
1033 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">'
1034 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001035
Marc Kupietza29f3d42025-07-18 10:14:43 +02001036 if (match_span_match == -1) {
1037 # Try alternative pattern if no context-right
1038 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$'
1039 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
1040 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001041
Marc Kupietza29f3d42025-07-18 10:14:43 +02001042 if (match_span_match > 0) {
1043 match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001044
Marc Kupietza29f3d42025-07-18 10:14:43 +02001045 # Now find the <mark> and </mark> positions within this content
1046 mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
Marc Kupietz560b5912025-09-01 17:36:13 +02001047 # Use the LAST closing </mark> to cover multi-part matches
1048 mark_end_gre <- gregexpr('</mark>', match_span_content, perl = TRUE)
1049 mark_end_positions <- mark_end_gre[[1]]
1050 mark_end <- if (!is.null(mark_end_positions) && length(mark_end_positions) > 0 && mark_end_positions[1] != -1)
1051 mark_end_positions[length(mark_end_positions)] else -1
1052 mark_end_len <- if (mark_end != -1) attr(mark_end_gre[[1]], "match.length")[length(mark_end_positions)] else 0
Marc Kupietzc643a122025-07-18 18:18:36 +02001053
Marc Kupietza29f3d42025-07-18 10:14:43 +02001054 if (mark_start > 0 && mark_end > 0) {
Marc Kupietz560b5912025-09-01 17:36:13 +02001055 # Left context: everything before first <mark>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001056 left_content <- substr(match_span_content, 1, mark_start - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001057
Marc Kupietz560b5912025-09-01 17:36:13 +02001058 # Match content: everything between first <mark> and last </mark>
1059 match_content <- substr(match_span_content, mark_start, mark_end + mark_end_len - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001060
Marc Kupietz560b5912025-09-01 17:36:13 +02001061 # Right context: everything after last </mark>
1062 right_content_start <- mark_end + mark_end_len
Marc Kupietza29f3d42025-07-18 10:14:43 +02001063 right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
1064 } else {
1065 # No mark tags found, treat entire match span as match content
1066 left_content <- ""
1067 match_content <- match_span_content
1068 right_content <- ""
1069 }
1070 } else {
1071 # No match span found, treat entire content as match
1072 left_content <- ""
1073 match_content <- xml_snippet
1074 right_content <- ""
1075 }
1076
1077 # Process each section
1078 left_annotations <- extract_annotations_from_section(left_content)
1079 match_annotations <- extract_annotations_from_section(match_content)
1080 right_annotations <- extract_annotations_from_section(right_content)
1081
1082 return(list(
1083 atokens = list(
1084 left = left_annotations$tokens,
1085 match = match_annotations$tokens,
1086 right = right_annotations$tokens
1087 ),
1088 lemma = list(
1089 left = left_annotations$lemmas,
1090 match = match_annotations$lemmas,
1091 right = right_annotations$lemmas
1092 ),
1093 pos = list(
1094 left = left_annotations$pos_tags,
1095 match = match_annotations$pos_tags,
1096 right = right_annotations$pos_tags
1097 ),
1098 morph = list(
1099 left = left_annotations$morph_tags,
1100 match = match_annotations$morph_tags,
1101 right = right_annotations$morph_tags
1102 )
1103 ))
1104}
1105
Marc Kupietze52b2952025-07-17 16:53:02 +02001106#' Fetch annotations for all collected matches
1107#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001108#' `r lifecycle::badge("experimental")`
1109#'
1110#' **`fetchAnnotations`** fetches annotations (only token annotations, for now)
1111#' for all matches in the `@collectedMatches` slot
Marc Kupietzc643a122025-07-18 18:18:36 +02001112#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
Marc Kupietz89f796e2025-07-19 09:05:06 +02001113#' data frame. The method uses the `matchID` from collected matches.
Marc Kupietza29f3d42025-07-18 10:14:43 +02001114#'
1115#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
1116#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
1117#' available for annotation parsing.
1118#'
1119#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
1120#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
1121#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
1122#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
1123#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
1124#' - `annotation_snippet`: original XML snippet from the annotation API
Marc Kupietze52b2952025-07-17 16:53:02 +02001125#'
1126#' @family corpus search functions
Marc Kupietz89f796e2025-07-19 09:05:06 +02001127#' @concept Annotations
Marc Kupietze52b2952025-07-17 16:53:02 +02001128#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001129#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
Marc Kupietze52b2952025-07-17 16:53:02 +02001130#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
Marc Kupietz93787d52025-09-03 13:33:25 +02001131#' @param overwrite logical; if TRUE, re-fetch and replace any existing
1132#' annotation columns. If FALSE (default), only add missing annotation layers
1133#' and preserve already fetched ones (e.g., keep POS/lemma from a previous
1134#' foundry while adding morph from another).
Marc Kupietze52b2952025-07-17 16:53:02 +02001135#' @param verbose print progress information if true
Marc Kupietz0af75932025-09-09 18:14:16 +02001136#' @return The updated `kqo` object with annotation columns
Marc Kupietz89f796e2025-07-19 09:05:06 +02001137#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
1138#' in the `@collectedMatches` slot. Each column is a data frame
1139#' with `left`, `match`, and `right` columns containing list vectors of annotations
1140#' for the left context, matched tokens, and right context, respectively.
1141#' The original XML snippet for each match is also stored in `annotation_snippet`.
Marc Kupietze52b2952025-07-17 16:53:02 +02001142#'
1143#' @examples
1144#' \dontrun{
1145#'
1146#' # Fetch annotations for matches using Tree-Tagger foundry
Marc Kupietza29f3d42025-07-18 10:14:43 +02001147#' # Note: Authorization required for copyright-restricted corpora
Marc Kupietze52b2952025-07-17 16:53:02 +02001148#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001149#' auth() |>
1150#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001151#' fetchNext(maxFetch = 10) |>
1152#' fetchAnnotations()
Marc Kupietze52b2952025-07-17 16:53:02 +02001153#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001154#' # Access linguistic annotations for match i:
Marc Kupietz6aa5a0d2025-09-08 17:51:47 +02001155#' pos_tags <- q@collectedMatches$pos
1156#' # Data frame with left/match/right columns for POS tags
1157#' lemmas <- q@collectedMatches$lemma
1158#' # Data frame with left/match/right columns for lemmas
1159#' morphology <- q@collectedMatches$morph
1160#' # Data frame with left/match/right columns for morphological tags
1161#' atokens <- q@collectedMatches$atokens
1162#' # Data frame with left/match/right columns for annotation token text
Marc Kupietz0af75932025-09-09 18:14:16 +02001163#' # Original XML snippet for match i
1164#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]]
Marc Kupietzc643a122025-07-18 18:18:36 +02001165#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001166#' # Access specific components:
Marc Kupietz0af75932025-09-09 18:14:16 +02001167#' # POS tags for the matched tokens in match i
1168#' match_pos <- q@collectedMatches$pos$match[[i]]
1169#' # Lemmas for the left context in match i
1170#' left_lemmas <- q@collectedMatches$lemma$left[[i]]
1171#' # Token text for the right context in match i
1172#' right_tokens <- q@collectedMatches$atokens$right[[i]]
Marc Kupietza29f3d42025-07-18 10:14:43 +02001173#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001174#' # Use a different foundry (e.g., MarMoT)
Marc Kupietze52b2952025-07-17 16:53:02 +02001175#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001176#' auth() |>
1177#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001178#' fetchNext(maxFetch = 10) |>
Marc Kupietz89f796e2025-07-19 09:05:06 +02001179#' fetchAnnotations(foundry = "marmot")
1180#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
Marc Kupietze52b2952025-07-17 16:53:02 +02001181#' }
Marc Kupietze52b2952025-07-17 16:53:02 +02001182#' @export
Marc Kupietz0af75932025-09-09 18:14:16 +02001183setMethod("fetchAnnotations", "KorAPQuery", function(kqo,
1184 foundry = "tt",
1185 overwrite = FALSE,
1186 verbose = kqo@korapConnection@verbose) {
1187 if (is.null(kqo@collectedMatches) ||
1188 nrow(kqo@collectedMatches) == 0) {
1189 warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
1190 return(kqo)
1191 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001192
Marc Kupietze52b2952025-07-17 16:53:02 +02001193 df <- kqo@collectedMatches
1194 kco <- kqo@korapConnection
Marc Kupietza29f3d42025-07-18 10:14:43 +02001195
Marc Kupietza29f3d42025-07-18 10:14:43 +02001196 # Initialize annotation columns as data frames (like tokens field)
1197 # Create the structure more explicitly to avoid assignment issues
1198 nrows <- nrow(df)
Marc Kupietzc643a122025-07-18 18:18:36 +02001199
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001200 # Pre-compute the empty character vector list to avoid repeated computation
1201 empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
Marc Kupietz0af75932025-09-09 18:14:16 +02001202
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001203 # Helper function to create annotation data frame structure
1204 create_annotation_df <- function(empty_list) {
1205 data.frame(
1206 left = empty_list,
1207 match = empty_list,
1208 right = empty_list,
1209 stringsAsFactors = FALSE
1210 )
1211 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001212
Marc Kupietz93787d52025-09-03 13:33:25 +02001213 # Track which annotation columns already existed to decide overwrite behavior
1214 existing_types <- list(
1215 pos = "pos" %in% colnames(df),
1216 lemma = "lemma" %in% colnames(df),
1217 morph = "morph" %in% colnames(df),
1218 atokens = "atokens" %in% colnames(df),
1219 annotation_snippet = "annotation_snippet" %in% colnames(df)
1220 )
1221
1222 # Initialize annotation columns using the helper function
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001223 annotation_types <- c("pos", "lemma", "morph", "atokens")
1224 for (type in annotation_types) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001225 if (overwrite || !existing_types[[type]]) {
1226 df[[type]] <- create_annotation_df(empty_char_list)
1227 }
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001228 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001229
Marc Kupietz93787d52025-09-03 13:33:25 +02001230 if (overwrite || !existing_types$annotation_snippet) {
1231 df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)
1232 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001233
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001234 # Initialize timing for ETA calculation
1235 start_time <- Sys.time()
1236 if (verbose) {
1237 log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
1238 }
1239
Marc Kupietz93787d52025-09-03 13:33:25 +02001240 # Helper to decide if existing annotation row is effectively empty
1241 is_empty_annotation_row <- function(ann_df, row_index) {
1242 if (is.null(ann_df) || nrow(ann_df) < row_index) return(TRUE)
1243 left_val <- ann_df$left[[row_index]]
1244 match_val <- ann_df$match[[row_index]]
1245 right_val <- ann_df$right[[row_index]]
1246 all(
1247 (is.null(left_val) || (length(left_val) == 0) || all(is.na(left_val))),
1248 (is.null(match_val) || (length(match_val) == 0) || all(is.na(match_val))),
1249 (is.null(right_val) || (length(right_val) == 0) || all(is.na(right_val)))
1250 )
1251 }
1252
Marc Kupietze52b2952025-07-17 16:53:02 +02001253 for (i in seq_len(nrow(df))) {
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001254 # ETA logging
1255 if (verbose && i > 1) {
1256 eta_info <- calculate_eta(i, nrows, start_time)
1257 log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
1258 }
Marc Kupietzff712a92025-07-18 09:07:23 +02001259 # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
1260 if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001261 # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
1262 # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
1263 # Extract document path and position, handling both regular and encrypted formats
Marc Kupietzc643a122025-07-18 18:18:36 +02001264
Marc Kupietza29f3d42025-07-18 10:14:43 +02001265 # More flexible regex to extract the document path with position and encryption
1266 # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
1267 # We need to capture the entire path including the encrypted suffix
1268 match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001269
Marc Kupietza29f3d42025-07-18 10:14:43 +02001270 if (match_result > 0) {
1271 # Extract the complete path including encryption (everything after "match-")
1272 doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
1273 # Convert the dash before position to slash, but keep everything after the position
1274 match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
Marc Kupietz25121302025-07-19 08:45:43 +02001275 # Use httr2 to construct URL safely
1276 base_url <- paste0(kco@apiUrl, "corpus/", match_path)
1277 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietza29f3d42025-07-18 10:14:43 +02001278 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001279 # If regex fails, fall back to the old method with httr2
1280 # Format numbers to avoid scientific notation
1281 match_start <- format(df$matchStart[i], scientific = FALSE)
1282 match_end <- format(df$matchEnd[i], scientific = FALSE)
1283 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1284 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001285 }
1286 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001287 # Fallback to the old method with httr2
1288 # Format numbers to avoid scientific notation
1289 match_start <- format(df$matchStart[i], scientific = FALSE)
1290 match_end <- format(df$matchEnd[i], scientific = FALSE)
1291 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1292 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001293 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001294
Marc Kupietze52b2952025-07-17 16:53:02 +02001295 tryCatch({
1296 res <- apiCall(kco, req)
Marc Kupietzc643a122025-07-18 18:18:36 +02001297
Marc Kupietze52b2952025-07-17 16:53:02 +02001298 if (!is.null(res)) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001299 # Store the raw annotation snippet (respect overwrite flag)
1300 if (overwrite || !existing_types$annotation_snippet || is.null(df$annotation_snippet[[i]]) || is.na(df$annotation_snippet[[i]])) {
1301 df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
1302 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001303
1304 # Parse XML annotations if snippet is available
1305 if (is.list(res) && "snippet" %in% names(res)) {
1306 parsed_annotations <- parse_xml_annotations_structured(res$snippet)
1307
1308 # Store the parsed linguistic data in data frame format (like tokens)
1309 # Use individual assignment to avoid data frame mismatch errors
1310 tryCatch({
1311 # Assign POS annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001312 if (overwrite || !existing_types$pos || is_empty_annotation_row(df$pos, i)) {
1313 df$pos$left[i] <- list(parsed_annotations$pos$left)
1314 df$pos$match[i] <- list(parsed_annotations$pos$match)
1315 df$pos$right[i] <- list(parsed_annotations$pos$right)
1316 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001317
Marc Kupietza29f3d42025-07-18 10:14:43 +02001318 # Assign lemma annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001319 if (overwrite || !existing_types$lemma || is_empty_annotation_row(df$lemma, i)) {
1320 df$lemma$left[i] <- list(parsed_annotations$lemma$left)
1321 df$lemma$match[i] <- list(parsed_annotations$lemma$match)
1322 df$lemma$right[i] <- list(parsed_annotations$lemma$right)
1323 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001324
Marc Kupietza29f3d42025-07-18 10:14:43 +02001325 # Assign morphology annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001326 if (overwrite || !existing_types$morph || is_empty_annotation_row(df$morph, i)) {
1327 df$morph$left[i] <- list(parsed_annotations$morph$left)
1328 df$morph$match[i] <- list(parsed_annotations$morph$match)
1329 df$morph$right[i] <- list(parsed_annotations$morph$right)
1330 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001331
Marc Kupietza29f3d42025-07-18 10:14:43 +02001332 # Assign token annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001333 if (overwrite || !existing_types$atokens || is_empty_annotation_row(df$atokens, i)) {
1334 df$atokens$left[i] <- list(parsed_annotations$atokens$left)
1335 df$atokens$match[i] <- list(parsed_annotations$atokens$match)
1336 df$atokens$right[i] <- list(parsed_annotations$atokens$right)
1337 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001338 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001339 # Set empty character vectors on assignment error using list assignment
Marc Kupietz93787d52025-09-03 13:33:25 +02001340 if (overwrite || !existing_types$pos) {
1341 df$pos$left[i] <<- list(character(0))
1342 df$pos$match[i] <<- list(character(0))
1343 df$pos$right[i] <<- list(character(0))
1344 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001345
Marc Kupietz93787d52025-09-03 13:33:25 +02001346 if (overwrite || !existing_types$lemma) {
1347 df$lemma$left[i] <<- list(character(0))
1348 df$lemma$match[i] <<- list(character(0))
1349 df$lemma$right[i] <<- list(character(0))
1350 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001351
Marc Kupietz93787d52025-09-03 13:33:25 +02001352 if (overwrite || !existing_types$morph) {
1353 df$morph$left[i] <<- list(character(0))
1354 df$morph$match[i] <<- list(character(0))
1355 df$morph$right[i] <<- list(character(0))
1356 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001357
Marc Kupietz93787d52025-09-03 13:33:25 +02001358 if (overwrite || !existing_types$atokens) {
1359 df$atokens$left[i] <<- list(character(0))
1360 df$atokens$match[i] <<- list(character(0))
1361 df$atokens$right[i] <<- list(character(0))
1362 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001363 })
Marc Kupietza29f3d42025-07-18 10:14:43 +02001364 } else {
1365 # No snippet available, store empty vectors
Marc Kupietz93787d52025-09-03 13:33:25 +02001366 if (overwrite || !existing_types$pos) {
1367 df$pos$left[i] <- list(character(0))
1368 df$pos$match[i] <- list(character(0))
1369 df$pos$right[i] <- list(character(0))
1370 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001371
Marc Kupietz93787d52025-09-03 13:33:25 +02001372 if (overwrite || !existing_types$lemma) {
1373 df$lemma$left[i] <- list(character(0))
1374 df$lemma$match[i] <- list(character(0))
1375 df$lemma$right[i] <- list(character(0))
1376 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001377
Marc Kupietz93787d52025-09-03 13:33:25 +02001378 if (overwrite || !existing_types$morph) {
1379 df$morph$left[i] <- list(character(0))
1380 df$morph$match[i] <- list(character(0))
1381 df$morph$right[i] <- list(character(0))
1382 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001383
Marc Kupietz93787d52025-09-03 13:33:25 +02001384 if (overwrite || !existing_types$atokens) {
1385 df$atokens$left[i] <- list(character(0))
1386 df$atokens$match[i] <- list(character(0))
1387 df$atokens$right[i] <- list(character(0))
1388 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001389 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001390 } else {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001391 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001392 if (overwrite || !existing_types$pos) {
1393 df$pos$left[i] <- list(NA)
1394 df$pos$match[i] <- list(NA)
1395 df$pos$right[i] <- list(NA)
1396 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001397
Marc Kupietz93787d52025-09-03 13:33:25 +02001398 if (overwrite || !existing_types$lemma) {
1399 df$lemma$left[i] <- list(NA)
1400 df$lemma$match[i] <- list(NA)
1401 df$lemma$right[i] <- list(NA)
1402 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001403
Marc Kupietz93787d52025-09-03 13:33:25 +02001404 if (overwrite || !existing_types$morph) {
1405 df$morph$left[i] <- list(NA)
1406 df$morph$match[i] <- list(NA)
1407 df$morph$right[i] <- list(NA)
1408 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001409
Marc Kupietz93787d52025-09-03 13:33:25 +02001410 if (overwrite || !existing_types$atokens) {
1411 df$atokens$left[i] <- list(NA)
1412 df$atokens$match[i] <- list(NA)
1413 df$atokens$right[i] <- list(NA)
1414 }
1415 if (overwrite || !existing_types$annotation_snippet) {
1416 df$annotation_snippet[[i]] <- NA
1417 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001418 }
1419 }, error = function(e) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001420 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001421 if (overwrite || !existing_types$pos) {
1422 df$pos$left[i] <- list(NA)
1423 df$pos$match[i] <- list(NA)
1424 df$pos$right[i] <- list(NA)
1425 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001426
Marc Kupietz93787d52025-09-03 13:33:25 +02001427 if (overwrite || !existing_types$lemma) {
1428 df$lemma$left[i] <- list(NA)
1429 df$lemma$match[i] <- list(NA)
1430 df$lemma$right[i] <- list(NA)
1431 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001432
Marc Kupietz93787d52025-09-03 13:33:25 +02001433 if (overwrite || !existing_types$morph) {
1434 df$morph$left[i] <- list(NA)
1435 df$morph$match[i] <- list(NA)
1436 df$morph$right[i] <- list(NA)
1437 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001438
Marc Kupietz93787d52025-09-03 13:33:25 +02001439 if (overwrite || !existing_types$atokens) {
1440 df$atokens$left[i] <- list(NA)
1441 df$atokens$match[i] <- list(NA)
1442 df$atokens$right[i] <- list(NA)
1443 }
1444 if (overwrite || !existing_types$annotation_snippet) {
1445 df$annotation_snippet[[i]] <- NA
1446 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001447 })
1448 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001449
Marc Kupietza29f3d42025-07-18 10:14:43 +02001450 # Validate data frame structure before assignment
1451 if (nrow(df) != nrow(kqo@collectedMatches)) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001452 }
1453
1454 # Update the collectedMatches with annotation data
1455 tryCatch({
1456 kqo@collectedMatches <- df
1457 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001458 # Try a safer approach: add columns individually
1459 tryCatch({
1460 kqo@collectedMatches$pos <- df$pos
Marc Kupietzc643a122025-07-18 18:18:36 +02001461 kqo@collectedMatches$lemma <- df$lemma
Marc Kupietza29f3d42025-07-18 10:14:43 +02001462 kqo@collectedMatches$morph <- df$morph
1463 kqo@collectedMatches$atokens <- df$atokens
1464 kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
1465 }, error = function(col_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001466 warning("Failed to add annotation data to collectedMatches")
1467 })
1468 })
1469
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001470 if (verbose) {
1471 elapsed_time <- Sys.time() - start_time
1472 log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
1473 }
1474
Marc Kupietze52b2952025-07-17 16:53:02 +02001475 return(kqo)
1476})
1477
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001478#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +02001479#'
Marc Kupietz67edcb52021-09-20 21:54:24 +02001480#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001481#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +02001482#' confidence intervals of one ore multiple search terms across one or multiple
1483#' virtual corpora.
1484#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02001485#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +02001486#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +02001487#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +02001488#' \dontrun{
1489#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001490#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +02001491#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +01001492#' }
Marc Kupietz3f575282019-10-04 14:46:04 +02001493#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001494# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +01001495#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001496#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
1497#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +02001498#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
1499#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001500#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +02001501#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001502#'
1503#' @return A tibble, with each row containing the following result columns for query and vc combinations:
1504#' - **query**: the query string used for the frequency analysis.
1505#' - **totalResults**: absolute frequency of query matches in the vc.
1506#' - **vc**: virtual corpus used for the query.
1507#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
1508#' - **total**: total number of words in vc.
1509#' - **f**: relative frequency of query matches in the vc.
1510#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
1511#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
1512
Marc Kupietzd8851222025-05-01 10:57:19 +02001513setMethod(
1514 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +01001515 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +02001516 (if (as.alternatives) {
1517 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietzea34b812025-06-25 15:49:00 +02001518 group_by(vc) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +01001519 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +02001520 } else {
1521 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
1522 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietzea34b812025-06-25 15:49:00 +02001523 }) |>
Marc Kupietz0c29cea2019-10-09 08:44:36 +02001524 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +02001525 }
1526)
Marc Kupietz3f575282019-10-04 14:46:04 +02001527
Marc Kupietz38a9d682024-12-06 16:17:09 +01001528#' buildWebUIRequestUrlFromString
1529#'
1530#' @rdname KorAPQuery-class
1531#' @importFrom urltools url_encode
1532#' @export
1533buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +02001534 query,
1535 vc = "",
1536 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001537 if ("KorAPConnection" %in% class(KorAPUrl)) {
1538 KorAPUrl <- KorAPUrl@KorAPUrl
1539 }
1540
1541 request <-
1542 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +02001543 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001544 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +02001545 ifelse(vc != "",
1546 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
1547 ""
1548 ),
1549 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001550 ql
1551 )
1552 paste0(KorAPUrl, request)
1553}
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001554
1555#' buildWebUIRequestUrl
1556#'
1557#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +01001558#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001559#' @export
1560buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +02001561 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001562 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +02001563 } else {
1564 httr2::url_parse(KorAPUrl)$query$q
1565 },
Marc Kupietzf9129592025-01-26 19:17:54 +01001566 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001567 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +01001568 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001569 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001570}
1571
Marc Kupietzd8851222025-05-01 10:57:19 +02001572#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +02001573#' @rdname KorAPQuery-class
1574#' @param x KorAPQuery object
1575#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001576#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +02001577#' @export
1578format.KorAPQuery <- function(x, ...) {
1579 cat("<KorAPQuery>\n")
1580 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +02001581 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001582 cat(" Query: ", param$q, "\n")
1583 if (!is.null(param$cq) && param$cq != "") {
1584 cat(" Virtual corpus: ", param$cq, "\n")
1585 }
1586 if (!is.null(q@collectedMatches)) {
1587 cat("==============================================================================================================", "\n")
1588 print(summary(q@collectedMatches))
1589 cat("==============================================================================================================", "\n")
1590 }
1591 cat(" Total results: ", q@totalResults, "\n")
1592 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietza29f3d42025-07-18 10:14:43 +02001593 if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
1594 successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
1595 parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
1596 cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
1597 if (parsed_annotations > 0) {
1598 cat(" (", parsed_annotations, " with parsed linguistic data)")
1599 }
1600 cat("\n")
Marc Kupietze52b2952025-07-17 16:53:02 +02001601 }
Marc Kupietz62da2b52019-09-12 17:43:34 +02001602}
1603
Marc Kupietze95108e2019-09-18 13:23:58 +02001604#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +02001605#'
Marc Kupietze95108e2019-09-18 13:23:58 +02001606#' @rdname KorAPQuery-class
1607#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +02001608#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +02001609setMethod("show", "KorAPQuery", function(object) {
1610 format(object)
Marc Kupietzc643a122025-07-18 18:18:36 +02001611 invisible(object)
Marc Kupietze95108e2019-09-18 13:23:58 +02001612})