blob: 92a534895856a17ec5d88d088838f65634d1d475 [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
Marc Kupietza29f3d42025-07-18 10:14:43 +020024 "hasMoreMatches"
Marc Kupietze95108e2019-09-18 13:23:58 +020025))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
Marc Kupietza29f3d42025-07-18 10:14:43 +020050 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
Marc Kupietzd8851222025-05-01 10:57:19 +020051 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietze52b2952025-07-17 16:53:02 +020071setGeneric("fetchAnnotations", function(kqo, ...) standardGeneric("fetchAnnotations"))
Marc Kupietzd8851222025-05-01 10:57:19 +020072setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020073
74maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020075
Marc Kupietz4de53ec2019-10-04 09:12:00 +020076## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010077utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020078
Marc Kupietza8c40f42025-06-24 15:49:52 +020079#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020080#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020081#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020082#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020083#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020084#' @aliases corpusQuery
85#'
86#' @importFrom urltools url_encode
87#' @importFrom purrr pmap
Marc Kupietzea34b812025-06-25 15:49:00 +020088#' @importFrom dplyr bind_rows group_by
Marc Kupietzdbd431a2021-08-29 12:17:45 +020089#'
Marc Kupietz617266d2025-02-27 10:43:07 +010090#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020091#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020092#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020093#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020094#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
95#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
96#' your RKorAPClient application as explained in the
97#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
98#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
99#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200100#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200101#' @param fields character vector specifying which metadata fields to retrieve for each match.
102#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
103#' \describe{
104#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
105#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
106#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
107#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
108#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
109#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
110#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
111#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
112#' }
113#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
114#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100115#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200116#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200117#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200118#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200119#' @param context string that specifies the size of the left and the right context returned in `snippet`
120#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
121#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
122#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
123#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
124#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200125#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200126#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
127#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200128#'
129#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200130#' \dontrun{
131#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200132#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200133#' KorAPConnection() |>
134#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200135#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200136#'
137#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200138#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200139#' corpusQuery("Ameisenplage",
140#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
141#' results <- fetchAll(query)
142#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100143#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200144#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200145#' \dontrun{
146#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200147#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
148#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200149#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200150#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200151#' corpusQuery(
152#' KorAPUrl =
153#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
154#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200155#' }
156#'
157#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200158#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200159#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200160#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200161#' {
162#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200163#' } |>
164#' corpusQuery("Ameisenplage") |>
165#' fetchAll() |>
166#' slot("collectedMatches") |>
167#' mutate(year = lubridate::year(pubDate)) |>
168#' dplyr::select(year) |>
169#' group_by(year) |>
170#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200171#' mutate(Freq = mapply(function(f, y) {
172#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200173#' }, Count, year)) |>
174#' dplyr::select(-Count) |>
175#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200176#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100177#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200178#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200179#'
180#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200181#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200182#'
183#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200184setMethod(
185 "corpusQuery", "KorAPConnection",
186 function(kco,
187 query = if (missing(KorAPUrl)) {
188 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
189 } else {
190 httr2::url_parse(KorAPUrl)$query$q
191 },
192 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
193 KorAPUrl,
194 metadataOnly = TRUE,
195 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
196 fields = c(
197 "corpusSigle",
198 "textSigle",
199 "pubDate",
200 "pubPlace",
201 "availability",
202 "textClass",
203 "snippet",
204 "tokens"
205 ),
206 accessRewriteFatal = TRUE,
207 verbose = kco@verbose,
208 expand = length(vc) != length(query),
209 as.df = FALSE,
210 context = NULL) {
211 if (length(query) > 1 || length(vc) > 1) {
212 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200213
214 # Initialize timing variables for ETA calculation
215 total_queries <- nrow(grid)
216 current_query <- 0
217 start_time <- Sys.time()
218
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200219 results <- purrr::pmap(grid, function(query, vc, ...) {
220 current_query <<- current_query + 1
221
222 # Execute the single query directly (avoiding recursive call)
223 contentFields <- c("snippet", "tokens")
224 query_fields <- fields
225 if (metadataOnly) {
226 query_fields <- query_fields[!query_fields %in% contentFields]
227 }
228 if (!"textSigle" %in% query_fields) {
229 query_fields <- c(query_fields, "textSigle")
230 }
231 request <-
232 paste0(
233 "?q=",
234 url_encode(enc2utf8(query)),
235 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
236 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
237 ifelse(!metadataOnly, "&show-tokens=true", ""),
238 "&ql=", ql
239 )
240 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
241 requestUrl <- paste0(
242 kco@apiUrl,
243 "search",
244 request,
245 "&fields=",
246 paste(query_fields, collapse = ","),
247 if (metadataOnly) "&access-rewrite-disabled=true" else ""
248 )
249
250 # Show individual query progress
251 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
252 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
253 if (is.null(res)) {
254 log_info(verbose, ": API call failed\n")
255 totalResults <- 0
256 } else {
257 totalResults <- as.integer(res$meta$totalResults)
258 log_info(verbose, ": ", totalResults, " hits")
259 if (!is.null(res$meta$cached)) {
260 log_info(verbose, " [cached]")
261 } else if (!is.null(res$meta$benchmark)) {
262 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
263 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
264 formatted_time <- paste0(round(time_value, 2), "s")
265 log_info(verbose, ", took ", formatted_time)
266 } else {
267 log_info(verbose, ", took ", res$meta$benchmark)
268 }
269 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200270
271 # Calculate and display ETA information on the same line if verbose and we have more than one query
272 if (verbose && total_queries > 1) {
273 eta_info <- calculate_eta(current_query, total_queries, start_time)
274 if (eta_info != "") {
275 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
276 avg_time_per_query <- elapsed_time / current_query
277
278 # Add ETA info to the same line - remove the leading ". " for cleaner formatting
279 clean_eta_info <- sub("^\\. ", ". ", eta_info)
280 log_info(verbose, clean_eta_info)
281 }
282 }
283
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200284 log_info(verbose, "\n")
285 }
286
287 result <- data.frame(
288 query = query,
289 totalResults = totalResults,
290 vc = vc,
291 webUIRequestUrl = webUIRequestUrl,
292 stringsAsFactors = FALSE
293 )
294
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200295 return(result)
296 })
297
298 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200299 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200300 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100301 if (metadataOnly) {
302 fields <- fields[!fields %in% contentFields]
303 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100304 if (!"textSigle" %in% fields) {
305 fields <- c(fields, "textSigle")
306 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100307 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200308 paste0(
309 "?q=",
310 url_encode(enc2utf8(query)),
311 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
312 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
313 ifelse(!metadataOnly, "&show-tokens=true", ""),
314 "&ql=", ql
315 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100316 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
317 requestUrl <- paste0(
318 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200319 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100320 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200321 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100322 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200323 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100324 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200325 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
326 sep =
327 ""
328 )
329 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100330 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100331 message("API call failed.")
332 totalResults <- 0
333 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200334 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200335 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200336 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200337 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200338 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200339 # Round the benchmark time to 2 decimal places for better readability.
340 # Be robust to locales using comma as decimal separator (e.g., "0,12s").
Marc Kupietz7638ca42025-05-25 13:18:16 +0200341 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
Marc Kupietz2baf5c52025-09-05 16:41:11 +0200342 bench_str <- sub("s$", "", res$meta$benchmark)
343 bench_num <- suppressWarnings(as.numeric(gsub(",", ".", bench_str)))
344 if (!is.na(bench_num)) {
345 formatted_time <- paste0(round(bench_num, 2), "s")
346 } else {
347 formatted_time <- res$meta$benchmark
348 }
Marc Kupietz7638ca42025-05-25 13:18:16 +0200349 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
350 } else {
351 # Fallback if the format is different than expected
352 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
353 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200354 } else {
355 log_info(verbose, "\n")
356 }
Marc Kupietza4675722022-02-23 23:55:15 +0100357 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200358 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100359 data.frame(
360 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100361 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100362 vc = vc,
363 webUIRequestUrl = webUIRequestUrl,
364 stringsAsFactors = FALSE
365 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200366 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100367 KorAPQuery(
368 korapConnection = kco,
369 nextStartIndex = 0,
370 fields = fields,
371 requestUrl = requestUrl,
372 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100373 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100374 vc = vc,
375 apiResponse = res,
376 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100377 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100378 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200379 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100380 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200381 }
382)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200383
Marc Kupietz05a60792024-12-07 16:23:31 +0100384#' @importFrom purrr map
385repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200386 if (is.list(x)) {
387 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100388 paste(.x, collapse = " ")
389 } else {
390 .x
391 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200392 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100393 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200394 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100395}
396
Marc Kupietz62da2b52019-09-12 17:43:34 +0200397#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200398#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200399#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200400#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200401#' @family corpus search functions
402#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200403#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200404#' @param offset start offset for query results to fetch
405#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200406#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200407#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
408#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200409#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100410#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200411#' \dontrun{
412#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200413#' q <- KorAPConnection() |>
414#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200415#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100416#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100417#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100418#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200419#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200420#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200421#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200422#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200423#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100424#' @importFrom tibble enframe add_column
425#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200426#' @importFrom tidyr unnest unchop pivot_wider
427#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200428#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200429setMethod("fetchNext", "KorAPQuery", function(kqo,
430 offset = kqo@nextStartIndex,
431 maxFetch = maxResultsPerPage,
432 verbose = kqo@korapConnection@verbose,
433 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100434 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200435 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100436
Marc Kupietze95108e2019-09-18 13:23:58 +0200437 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
438 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200439 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200440 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200441 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200442 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200443
Marc Kupietz24799fd2025-06-25 14:15:36 +0200444 # Track start time for ETA calculation
445 start_time <- Sys.time()
446
Marc Kupietz623d7122025-05-25 12:46:12 +0200447 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200448 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200449 # Calculate how many pages we need to fetch based on maxFetch
450 total_pages_to_fetch <- if (!is.na(maxFetch)) {
451 # Either limited by maxFetch or total results, whichever is smaller
452 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
453 } else {
454 # All pages
455 ceiling(kqo@totalResults / maxResultsPerPage)
456 }
457
458 # Generate randomized page indices (0-based for API)
459 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
460 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200461 }
462
Marc Kupietzd8851222025-05-01 10:57:19 +0200463 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200464 collectedMatches <- data.frame()
465 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200466
467 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
468 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
469 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
470 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
471
472 # For sequential fetches, keep track of which global page we're on
473 # This is important for correctly showing page numbers in subsequent fetchNext calls
474 page_count_start <- current_page_number
475
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200476 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200477 # Determine which page to fetch next
478 if (randomizePageOrder) {
479 # In randomized mode, get the page from our randomized list using the page_index
480 # Make sure we don't exceed the array bounds
481 if (page_index > length(pages)) {
482 break # No more pages to fetch in randomized mode
483 }
484 current_offset_page <- pages[page_index]
485 # For display purposes in randomized mode, show which page out of the total we're fetching
486 display_page_number <- page_index
487 } else {
488 # In sequential mode, use the current_page_number to calculate the offset
489 current_offset_page <- (current_page_number - 1)
490 display_page_number <- current_page_number
491 }
492
493 # Calculate the actual offset in tokens
494 currentOffset <- current_offset_page * maxResultsPerPage
495
Marc Kupietzef0e9392025-06-18 12:21:49 +0200496 # Build the query with the appropriate count and offset using httr2
497 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200498
Marc Kupietzef0e9392025-06-18 12:21:49 +0200499 # Parse existing URL to preserve all query parameters
500 parsed_url <- httr2::url_parse(kqo@requestUrl)
501 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200502
Marc Kupietzef0e9392025-06-18 12:21:49 +0200503 # Add/update count and offset parameters
504 existing_query$count <- count_param
505 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200506
Marc Kupietzef0e9392025-06-18 12:21:49 +0200507 # Rebuild the URL with all parameters
508 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200509 res <- apiCall(kqo@korapConnection, query)
510 if (length(res$matches) == 0) {
511 break
512 }
513
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200514 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100515 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100516 currentMatches <- res$matches$fields %>%
517 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
518 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200519 tidyr::unnest(cols = value) %>%
520 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200521 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200522 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200523 currentMatches$snippet <- res$matches$snippet
524 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100525 if ("tokens" %in% colnames(res$matches)) {
526 currentMatches$tokens <- res$matches$tokens
527 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200528 } else {
529 currentMatches <- res$matches
530 }
531
Marc Kupietze95108e2019-09-18 13:23:58 +0200532 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200533 if (!field %in% colnames(currentMatches)) {
534 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200535 }
536 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100537 currentMatches <- currentMatches %>%
538 select(kqo@fields) %>%
539 mutate(
Marc Kupietzff712a92025-07-18 09:07:23 +0200540 matchID = res$matches$matchID,
Marc Kupietz0447da02025-01-08 20:51:09 +0100541 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100542 matchStart = as.integer(stringr::word(tmp_positions, 1)),
543 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
544 ) %>%
545 select(-tmp_positions)
546
Marc Kupietz62da2b52019-09-12 17:43:34 +0200547 if (!is.list(collectedMatches)) {
548 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200549 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200550 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200551 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200552
Marc Kupietz623d7122025-05-25 12:46:12 +0200553 # Get the actual items per page from the API response
554 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200555
Marc Kupietz623d7122025-05-25 12:46:12 +0200556 # Calculate total pages consistently using fixed maxResultsPerPage
557 # This ensures consistent page counting across the function
558 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
559
Marc Kupietz24799fd2025-06-25 14:15:36 +0200560 # Calculate ETA using the centralized function from logging.R
561 current_page <- if (randomizePageOrder) page_index else display_page_number
562 total_pages_to_fetch <- if (!is.na(maxFetch)) {
563 # Account for offset - we can only fetch from the remaining results after offset
564 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
565 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
566 } else {
567 total_pages
568 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200569
Marc Kupietz24799fd2025-06-25 14:15:36 +0200570 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz365660e2025-06-25 15:09:55 +0200571
Marc Kupietz24799fd2025-06-25 14:15:36 +0200572 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200573 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200574 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200575 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200576 }
577
Marc Kupietz623d7122025-05-25 12:46:12 +0200578 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200579
Marc Kupietz623d7122025-05-25 12:46:12 +0200580 # For global page tracking, calculate the absolute page number
581 actual_display_number <- if (randomizePageOrder) {
582 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
583 } else {
584 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
585 current_offset_page + 1
586 }
587
588 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
589 # based on the current batch being fetched
590
591 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
592 # Simply count from 1 within the current batch
593
594 # The relative page number is simply the current position in this batch
595 if (randomizePageOrder) {
596 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
597 } else {
598 relative_page_number <- display_page_number - (page_count_start - 1)
599 }
600
601 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200602 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200603 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200604 # Account for offset - we can only fetch from the remaining results after offset
605 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
606 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200607 } else {
608 # Otherwise fetch all remaining pages
609 total_pages - page_count_start + 1
610 }
611
612 # The total pages to be shown in this batch
613 batch_total_pages <- pages_in_this_batch
614
615 page_display <- paste0(
616 "Retrieved page ",
617 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
618 "/",
619 sprintf("%d", batch_total_pages)
620 )
621
622 # If randomized, also show which actual page we fetched
623 if (randomizePageOrder) {
624 # Determine the maximum width needed for page numbers (based on total pages)
625 # This ensures consistent alignment
626 max_page_width <- nchar(as.character(total_pages))
627 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200628 page_display <- paste0(
629 page_display,
630 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
631 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200632 }
633 # Always show the absolute page number and total pages (for clarity)
634 else {
635 # Show the absolute page number (out of total possible pages)
636 page_display <- paste0(page_display, sprintf(
637 " (page %d of %d total)",
638 actual_display_number, total_pages
639 ))
640 }
641
642 # Add caching or timing information
643 if (!is.null(res$meta$cached)) {
644 page_display <- paste0(page_display, " [cached]")
645 } else {
646 page_display <- paste0(
647 page_display,
648 " in ",
649 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200650 "s",
651 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200652 )
653 }
654
655 log_info(verbose, paste0(page_display, "\n"))
656
657 # Increment the appropriate counter based on mode
658 if (randomizePageOrder) {
659 page_index <- page_index + 1
660 } else {
661 current_page_number <- current_page_number + 1
662 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200663 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200664 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200665 break
666 }
667 }
Marc Kupietz68170952021-06-30 09:37:21 +0200668 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200669 KorAPQuery(
670 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200671 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200672 fields = kqo@fields,
673 requestUrl = kqo@requestUrl,
674 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200675 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200676 vc = kqo@vc,
677 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200678 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200679 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200680 collectedMatches = collectedMatches
681 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200682})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200683
684#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200685#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200686#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100687#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200688#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200689#' @param kqo object obtained from [corpusQuery()]
690#' @param verbose print progress information if true
691#' @param ... further arguments passed to [fetchNext()]
692#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200693#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200694#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200695#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200696#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
697#' q <- KorAPConnection() |>
698#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200699#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200700#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200701#'
702#' # Fetch also all KWICs
703#' q <- KorAPConnection() |> auth() |>
704#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
705#' fetchAll()
706#' q@collectedMatches
707#'
708#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
709#' q <- KorAPConnection() |>
710#' corpusQuery("<base/s=t>", # this matches each text once
711#' vc = "pubDate in 1958-03-12",
712#' fields = c("textSigle", "title"),
713#' ) |>
714#' fetchAll()
715#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100716#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200717#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200718#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200719#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200720setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
721 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200722})
723
724#' Fetches the remaining results of a KorAP query.
725#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200726#' @param kqo object obtained from [corpusQuery()]
727#' @param verbose print progress information if true
728#' @param ... further arguments passed to [fetchNext()]
729#' @return The updated `kqo` object with remaining results in `@collectedMatches`
730#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200731#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200732#' \dontrun{
733#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200734#' q <- KorAPConnection() |>
735#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200736#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200737#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100738#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200739#'
740#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200741#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200742setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
743 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200744})
745
Marc Kupietza29f3d42025-07-18 10:14:43 +0200746#'
747#' Parse XML annotations into linguistic layers
748#'
749#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
750#' from XML annotation snippets returned by the KorAP API.
751#'
752#' @param xml_snippet XML string containing annotation data
753#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
754#' @keywords internal
755parse_xml_annotations <- function(xml_snippet) {
756 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
757 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
758 }
759
760 # Extract content within <span class="match">...</span> using a more robust approach
761 if (grepl('<span class="match">', xml_snippet)) {
762 # Find the start of match span
763 start_pos <- regexpr('<span class="match">', xml_snippet)
764 if (start_pos > 0) {
765 # Find the end by counting nested spans
766 content_start <- start_pos + attr(start_pos, "match.length")
767 remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
768
769 # Simple approach: extract everything until we hit context-right or end
770 if (grepl('<span class="context-right">', remaining)) {
771 content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining)
772 } else {
773 # Find the closing </span> that matches our opening span
774 # For now, use a simpler approach - take everything until the last </span> sequence
775 content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining)
776 }
777 } else {
778 content_to_parse <- xml_snippet
779 }
780 } else {
781 content_to_parse <- xml_snippet
782 }
783
784 # Initialize result vectors
785 tokens <- character(0)
786 lemmas <- character(0)
787 pos_tags <- character(0)
788 morph_tags <- character(0)
789
790 # Split the content by </span> and process each meaningful part
791 parts <- unlist(strsplit(content_to_parse, '</span>'))
792
793 for (part in parts) {
794 part <- trimws(part)
795 if (nchar(part) == 0) next
796
797 # Look for parts that have title attributes and end with text
798 if (grepl('<span[^>]*title=', part)) {
799 # Extract the text content (everything after the last >)
800 text_content <- gsub('.*>([^<]*)$', '\\1', part)
801 text_content <- trimws(text_content)
802
803 if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
804 tokens <- c(tokens, text_content)
805
806 # Extract all title attributes from this part
807 title_pattern <- 'title="([^"]*)"'
808 title_matches <- gregexpr(title_pattern, part)
809
810 lemma <- NA
811 pos_tag <- NA
812 morph_tag <- NA
813
814 if (title_matches[[1]][1] != -1) {
815 all_titles <- regmatches(part, title_matches)[[1]]
816 for (title_match in all_titles) {
817 title_content <- gsub(title_pattern, '\\1', title_match)
Marc Kupietzc643a122025-07-18 18:18:36 +0200818
Marc Kupietza29f3d42025-07-18 10:14:43 +0200819 # Split by spaces and process each annotation
820 annotations <- unlist(strsplit(title_content, "\\s+"))
821 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200822 if (grepl('^[^/]+/l:', annotation)) {
823 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
824 } else if (grepl('^[^/]+/p:', annotation)) {
825 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
826 } else if (grepl('^[^/]+/m:', annotation)) {
827 morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200828 }
829 }
830 }
831 }
832
833 lemmas <- c(lemmas, lemma)
834 pos_tags <- c(pos_tags, pos_tag)
835 morph_tags <- c(morph_tags, morph_tag)
836 }
837 }
838 }
839
840 # If no tokens found with the splitting approach, try a different method
841 if (length(tokens) == 0) {
842 # Look for the innermost spans that contain actual text
843 innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
844 innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
845
846 if (innermost_matches[[1]][1] != -1) {
847 matches <- regmatches(content_to_parse, innermost_matches)[[1]]
848
849 for (match in matches) {
850 title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
851 text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
852 text <- trimws(text)
853
854 if (nchar(text) > 0) {
855 tokens <- c(tokens, text)
856
857 # Parse space-separated annotations in title
858 lemma <- NA
859 pos_tag <- NA
860 morph_tag <- NA
Marc Kupietzc643a122025-07-18 18:18:36 +0200861
Marc Kupietza29f3d42025-07-18 10:14:43 +0200862 annotations <- unlist(strsplit(title, "\\s+"))
863 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200864 if (grepl('^[^/]+/l:', annotation)) {
865 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
866 } else if (grepl('^[^/]+/p:', annotation)) {
867 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
868 } else if (grepl('^[^/]+/m:', annotation)) {
869 morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200870 }
871 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200872
Marc Kupietza29f3d42025-07-18 10:14:43 +0200873 lemmas <- c(lemmas, lemma)
874 pos_tags <- c(pos_tags, pos_tag)
875 morph_tags <- c(morph_tags, morph_tag)
876 }
877 }
878 }
879 }
880
881 # Ensure all vectors have the same length
882 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
883 if (max_length > 0) {
884 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
885 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
886 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
887 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
888 }
889
890 return(list(
891 token = tokens,
892 lemma = lemmas,
893 pos = pos_tags,
894 morph = morph_tags
895 ))
896}
897
898#'
899#' Parse XML annotations into linguistic layers with left/match/right structure
900#'
901#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
902#' from XML annotation snippets returned by the KorAP API, split into left context,
903#' match, and right context sections like the tokens field.
904#'
905#' @param xml_snippet XML string containing annotation data
906#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
907#' @keywords internal
908parse_xml_annotations_structured <- function(xml_snippet) {
909 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
910 empty_result <- list(left = character(0), match = character(0), right = character(0))
911 return(list(
912 atokens = empty_result,
913 lemma = empty_result,
914 pos = empty_result,
915 morph = empty_result
916 ))
917 }
918
919 # Helper function to extract annotations from a span section
920 extract_annotations_from_section <- function(section_content) {
Marc Kupietz560b5912025-09-01 17:36:13 +0200921 # Remove any <mark>...</mark> tags that may interrupt token boundaries
922 section_no_marks <- gsub('</?mark[^>]*>', '', section_content, perl = TRUE)
923 # Normalize separators between adjacent top-level spans so splitting is robust.
924 # Replace any punctuation/entity/space run between one-or-more closing spans and the next opening span
925 # with a single space, preserving all closing spans.
926 section_norm <- gsub('((?:</span>)+)[[:space:]]*(?:&[^;]+;|[[:punct:]]|[[:space:]])*[[:space:]]*(<span)', '\\1 \\2', section_no_marks, perl = TRUE)
927 # Handle both spaced tokens and nested single tokens by scanning innermost spans with direct text
Marc Kupietza29f3d42025-07-18 10:14:43 +0200928 tokens <- character(0)
929 lemmas <- character(0)
930 pos_tags <- character(0)
931 morph_tags <- character(0)
Marc Kupietz89f796e2025-07-19 09:05:06 +0200932
Marc Kupietz560b5912025-09-01 17:36:13 +0200933 pat_token <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
934 mm <- gregexpr(pat_token, section_norm, perl = TRUE)
935 if (mm[[1]][1] != -1) {
936 starts <- mm[[1]]
937 lens <- attr(mm[[1]], 'match.length')
938 for (k in seq_along(starts)) {
939 s <- starts[k]
940 e <- s + lens[k] - 1
941 fragment <- substr(section_norm, s, e)
942 text_content <- sub(pat_token, '\\2', fragment, perl = TRUE)
943 text_content <- trimws(text_content)
944 title_content <- sub(pat_token, '\\1', fragment, perl = TRUE)
Marc Kupietz89f796e2025-07-19 09:05:06 +0200945
Marc Kupietz560b5912025-09-01 17:36:13 +0200946 if (nchar(text_content) == 0) next
Marc Kupietz89f796e2025-07-19 09:05:06 +0200947
Marc Kupietz560b5912025-09-01 17:36:13 +0200948 lemma <- NA
949 pos_tag <- NA
950 morph_features <- character(0)
Marc Kupietz89f796e2025-07-19 09:05:06 +0200951
Marc Kupietz560b5912025-09-01 17:36:13 +0200952 # parse inner title
953 ann <- unlist(strsplit(title_content, "[[:space:]]+"))
954 for (a in ann) {
955 if (grepl('/l:', a)) {
956 lemma <- sub('.*?/l:(.*)$', '\\1', a, perl = TRUE)
957 } else if (grepl('/p:', a)) {
958 pos_tag <- sub('.*?/p:(.*)$', '\\1', a, perl = TRUE)
959 } else if (grepl('/m:', a)) {
960 morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', a, perl = TRUE))
Marc Kupietza29f3d42025-07-18 10:14:43 +0200961 }
962 }
Marc Kupietz560b5912025-09-01 17:36:13 +0200963
964 # If lemma missing, look back in nearby context for the nearest title containing l:
965 if (is.na(lemma) || nchar(lemma) == 0) {
966 ctx_start <- max(1, s - 500)
967 context <- substr(section_norm, ctx_start, s - 1)
968 tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
969 if (tmm[[1]][1] != -1) {
970 ctx_titles <- regmatches(context, tmm)[[1]]
971 for (ti in rev(ctx_titles)) {
972 cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
973 if (grepl('/l:', cont)) {
974 lemma <- sub('.*?/l:([^ ]+).*', '\\1', cont, perl = TRUE)
975 break
976 }
977 }
978 }
979 }
980
981 # If POS missing, keep NA; morphological features may also appear in outer titles
982 if (length(morph_features) == 0) {
983 ctx_start <- max(1, s - 500)
984 context <- substr(section_norm, ctx_start, s - 1)
985 tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
986 if (tmm[[1]][1] != -1) {
987 ctx_titles <- regmatches(context, tmm)[[1]]
988 for (ti in rev(ctx_titles)) {
989 cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
990 if (grepl('/m:', cont)) {
991 mparts <- unlist(strsplit(cont, "[[:space:]]+"))
992 for (mp in mparts) if (grepl('/m:', mp)) morph_features <- c(morph_features, sub('.*?/m:(.*)$', '\\1', mp, perl = TRUE))
993 break
994 }
995 }
996 }
997 }
998
999 tokens <- c(tokens, text_content)
1000 lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
1001 pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
1002 morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA)
Marc Kupietza29f3d42025-07-18 10:14:43 +02001003 }
1004 }
1005
1006 # Ensure all vectors have the same length
1007 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
1008 if (max_length > 0) {
1009 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
1010 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
1011 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
1012 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
1013 }
1014
1015 return(list(
1016 tokens = tokens,
1017 lemmas = lemmas,
1018 pos_tags = pos_tags,
1019 morph_tags = morph_tags
1020 ))
1021 }
1022
1023 # Split the XML into three parts: left context, match content, and right context
1024 # The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
Marc Kupietzc643a122025-07-18 18:18:36 +02001025
Marc Kupietza29f3d42025-07-18 10:14:43 +02001026 # First extract the content within the match span using DOTALL modifier
1027 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">'
1028 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001029
Marc Kupietza29f3d42025-07-18 10:14:43 +02001030 if (match_span_match == -1) {
1031 # Try alternative pattern if no context-right
1032 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$'
1033 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
1034 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001035
Marc Kupietza29f3d42025-07-18 10:14:43 +02001036 if (match_span_match > 0) {
1037 match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001038
Marc Kupietza29f3d42025-07-18 10:14:43 +02001039 # Now find the <mark> and </mark> positions within this content
1040 mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
Marc Kupietz560b5912025-09-01 17:36:13 +02001041 # Use the LAST closing </mark> to cover multi-part matches
1042 mark_end_gre <- gregexpr('</mark>', match_span_content, perl = TRUE)
1043 mark_end_positions <- mark_end_gre[[1]]
1044 mark_end <- if (!is.null(mark_end_positions) && length(mark_end_positions) > 0 && mark_end_positions[1] != -1)
1045 mark_end_positions[length(mark_end_positions)] else -1
1046 mark_end_len <- if (mark_end != -1) attr(mark_end_gre[[1]], "match.length")[length(mark_end_positions)] else 0
Marc Kupietzc643a122025-07-18 18:18:36 +02001047
Marc Kupietza29f3d42025-07-18 10:14:43 +02001048 if (mark_start > 0 && mark_end > 0) {
Marc Kupietz560b5912025-09-01 17:36:13 +02001049 # Left context: everything before first <mark>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001050 left_content <- substr(match_span_content, 1, mark_start - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001051
Marc Kupietz560b5912025-09-01 17:36:13 +02001052 # Match content: everything between first <mark> and last </mark>
1053 match_content <- substr(match_span_content, mark_start, mark_end + mark_end_len - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001054
Marc Kupietz560b5912025-09-01 17:36:13 +02001055 # Right context: everything after last </mark>
1056 right_content_start <- mark_end + mark_end_len
Marc Kupietza29f3d42025-07-18 10:14:43 +02001057 right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
1058 } else {
1059 # No mark tags found, treat entire match span as match content
1060 left_content <- ""
1061 match_content <- match_span_content
1062 right_content <- ""
1063 }
1064 } else {
1065 # No match span found, treat entire content as match
1066 left_content <- ""
1067 match_content <- xml_snippet
1068 right_content <- ""
1069 }
1070
1071 # Process each section
1072 left_annotations <- extract_annotations_from_section(left_content)
1073 match_annotations <- extract_annotations_from_section(match_content)
1074 right_annotations <- extract_annotations_from_section(right_content)
1075
1076 return(list(
1077 atokens = list(
1078 left = left_annotations$tokens,
1079 match = match_annotations$tokens,
1080 right = right_annotations$tokens
1081 ),
1082 lemma = list(
1083 left = left_annotations$lemmas,
1084 match = match_annotations$lemmas,
1085 right = right_annotations$lemmas
1086 ),
1087 pos = list(
1088 left = left_annotations$pos_tags,
1089 match = match_annotations$pos_tags,
1090 right = right_annotations$pos_tags
1091 ),
1092 morph = list(
1093 left = left_annotations$morph_tags,
1094 match = match_annotations$morph_tags,
1095 right = right_annotations$morph_tags
1096 )
1097 ))
1098}
1099
Marc Kupietze52b2952025-07-17 16:53:02 +02001100#' Fetch annotations for all collected matches
1101#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001102#' `r lifecycle::badge("experimental")`
1103#'
1104#' **`fetchAnnotations`** fetches annotations (only token annotations, for now)
1105#' for all matches in the `@collectedMatches` slot
Marc Kupietzc643a122025-07-18 18:18:36 +02001106#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
Marc Kupietz89f796e2025-07-19 09:05:06 +02001107#' data frame. The method uses the `matchID` from collected matches.
Marc Kupietza29f3d42025-07-18 10:14:43 +02001108#'
1109#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
1110#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
1111#' available for annotation parsing.
1112#'
1113#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
1114#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
1115#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
1116#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
1117#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
1118#' - `annotation_snippet`: original XML snippet from the annotation API
Marc Kupietze52b2952025-07-17 16:53:02 +02001119#'
1120#' @family corpus search functions
Marc Kupietz89f796e2025-07-19 09:05:06 +02001121#' @concept Annotations
Marc Kupietze52b2952025-07-17 16:53:02 +02001122#' @aliases fetchAnnotations
1123#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001124#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
Marc Kupietze52b2952025-07-17 16:53:02 +02001125#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
Marc Kupietz93787d52025-09-03 13:33:25 +02001126#' @param overwrite logical; if TRUE, re-fetch and replace any existing
1127#' annotation columns. If FALSE (default), only add missing annotation layers
1128#' and preserve already fetched ones (e.g., keep POS/lemma from a previous
1129#' foundry while adding morph from another).
Marc Kupietze52b2952025-07-17 16:53:02 +02001130#' @param verbose print progress information if true
Marc Kupietz89f796e2025-07-19 09:05:06 +02001131#' @return The updated `kqo` object with annotation columns
1132#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
1133#' in the `@collectedMatches` slot. Each column is a data frame
1134#' with `left`, `match`, and `right` columns containing list vectors of annotations
1135#' for the left context, matched tokens, and right context, respectively.
1136#' The original XML snippet for each match is also stored in `annotation_snippet`.
Marc Kupietze52b2952025-07-17 16:53:02 +02001137#'
1138#' @examples
1139#' \dontrun{
1140#'
1141#' # Fetch annotations for matches using Tree-Tagger foundry
Marc Kupietza29f3d42025-07-18 10:14:43 +02001142#' # Note: Authorization required for copyright-restricted corpora
Marc Kupietze52b2952025-07-17 16:53:02 +02001143#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001144#' auth() |>
1145#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001146#' fetchNext(maxFetch = 10) |>
1147#' fetchAnnotations()
Marc Kupietze52b2952025-07-17 16:53:02 +02001148#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001149#' # Access linguistic annotations for match i:
1150#' pos_tags <- q@collectedMatches$pos # Data frame with left/match/right columns for POS tags
Marc Kupietzc643a122025-07-18 18:18:36 +02001151#' lemmas <- q@collectedMatches$lemma # Data frame with left/match/right columns for lemmas
Marc Kupietza29f3d42025-07-18 10:14:43 +02001152#' morphology <- q@collectedMatches$morph # Data frame with left/match/right columns for morphological tags
1153#' atokens <- q@collectedMatches$atokens # Data frame with left/match/right columns for annotation token text
1154#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i
Marc Kupietzc643a122025-07-18 18:18:36 +02001155#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001156#' # Access specific components:
1157#' match_pos <- q@collectedMatches$pos$match[[i]] # POS tags for the matched tokens in match i
1158#' left_lemmas <- q@collectedMatches$lemma$left[[i]] # Lemmas for the left context in match i
1159#' right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i
1160#'
Marc Kupietz89f796e2025-07-19 09:05:06 +02001161#' # Use a different foundry (e.g., MarMoT)
Marc Kupietze52b2952025-07-17 16:53:02 +02001162#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001163#' auth() |>
1164#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001165#' fetchNext(maxFetch = 10) |>
Marc Kupietz89f796e2025-07-19 09:05:06 +02001166#' fetchAnnotations(foundry = "marmot")
1167#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
Marc Kupietze52b2952025-07-17 16:53:02 +02001168#' }
Marc Kupietze52b2952025-07-17 16:53:02 +02001169#' @export
Marc Kupietz93787d52025-09-03 13:33:25 +02001170setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", overwrite = FALSE, verbose = kqo@korapConnection@verbose) {
Marc Kupietze52b2952025-07-17 16:53:02 +02001171 if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
1172 warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
1173 return(kqo)
1174 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001175
Marc Kupietze52b2952025-07-17 16:53:02 +02001176 df <- kqo@collectedMatches
1177 kco <- kqo@korapConnection
Marc Kupietza29f3d42025-07-18 10:14:43 +02001178
Marc Kupietza29f3d42025-07-18 10:14:43 +02001179 # Initialize annotation columns as data frames (like tokens field)
1180 # Create the structure more explicitly to avoid assignment issues
1181 nrows <- nrow(df)
Marc Kupietzc643a122025-07-18 18:18:36 +02001182
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001183 # Pre-compute the empty character vector list to avoid repeated computation
1184 empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
1185
1186 # Helper function to create annotation data frame structure
1187 create_annotation_df <- function(empty_list) {
1188 data.frame(
1189 left = empty_list,
1190 match = empty_list,
1191 right = empty_list,
1192 stringsAsFactors = FALSE
1193 )
1194 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001195
Marc Kupietz93787d52025-09-03 13:33:25 +02001196 # Track which annotation columns already existed to decide overwrite behavior
1197 existing_types <- list(
1198 pos = "pos" %in% colnames(df),
1199 lemma = "lemma" %in% colnames(df),
1200 morph = "morph" %in% colnames(df),
1201 atokens = "atokens" %in% colnames(df),
1202 annotation_snippet = "annotation_snippet" %in% colnames(df)
1203 )
1204
1205 # Initialize annotation columns using the helper function
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001206 annotation_types <- c("pos", "lemma", "morph", "atokens")
1207 for (type in annotation_types) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001208 if (overwrite || !existing_types[[type]]) {
1209 df[[type]] <- create_annotation_df(empty_char_list)
1210 }
Marc Kupietz03d2b1a2025-07-19 09:14:45 +02001211 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001212
Marc Kupietz93787d52025-09-03 13:33:25 +02001213 if (overwrite || !existing_types$annotation_snippet) {
1214 df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)
1215 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001216
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001217 # Initialize timing for ETA calculation
1218 start_time <- Sys.time()
1219 if (verbose) {
1220 log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
1221 }
1222
Marc Kupietz93787d52025-09-03 13:33:25 +02001223 # Helper to decide if existing annotation row is effectively empty
1224 is_empty_annotation_row <- function(ann_df, row_index) {
1225 if (is.null(ann_df) || nrow(ann_df) < row_index) return(TRUE)
1226 left_val <- ann_df$left[[row_index]]
1227 match_val <- ann_df$match[[row_index]]
1228 right_val <- ann_df$right[[row_index]]
1229 all(
1230 (is.null(left_val) || (length(left_val) == 0) || all(is.na(left_val))),
1231 (is.null(match_val) || (length(match_val) == 0) || all(is.na(match_val))),
1232 (is.null(right_val) || (length(right_val) == 0) || all(is.na(right_val)))
1233 )
1234 }
1235
Marc Kupietze52b2952025-07-17 16:53:02 +02001236 for (i in seq_len(nrow(df))) {
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001237 # ETA logging
1238 if (verbose && i > 1) {
1239 eta_info <- calculate_eta(i, nrows, start_time)
1240 log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
1241 }
Marc Kupietzff712a92025-07-18 09:07:23 +02001242 # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
1243 if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001244 # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
1245 # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
1246 # Extract document path and position, handling both regular and encrypted formats
Marc Kupietzc643a122025-07-18 18:18:36 +02001247
Marc Kupietza29f3d42025-07-18 10:14:43 +02001248 # More flexible regex to extract the document path with position and encryption
1249 # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
1250 # We need to capture the entire path including the encrypted suffix
1251 match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001252
Marc Kupietza29f3d42025-07-18 10:14:43 +02001253 if (match_result > 0) {
1254 # Extract the complete path including encryption (everything after "match-")
1255 doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
1256 # Convert the dash before position to slash, but keep everything after the position
1257 match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
Marc Kupietz25121302025-07-19 08:45:43 +02001258 # Use httr2 to construct URL safely
1259 base_url <- paste0(kco@apiUrl, "corpus/", match_path)
1260 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietza29f3d42025-07-18 10:14:43 +02001261 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001262 # If regex fails, fall back to the old method with httr2
1263 # Format numbers to avoid scientific notation
1264 match_start <- format(df$matchStart[i], scientific = FALSE)
1265 match_end <- format(df$matchEnd[i], scientific = FALSE)
1266 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1267 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001268 }
1269 } else {
Marc Kupietz25121302025-07-19 08:45:43 +02001270 # Fallback to the old method with httr2
1271 # Format numbers to avoid scientific notation
1272 match_start <- format(df$matchStart[i], scientific = FALSE)
1273 match_end <- format(df$matchEnd[i], scientific = FALSE)
1274 base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
1275 req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietzff712a92025-07-18 09:07:23 +02001276 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001277
Marc Kupietze52b2952025-07-17 16:53:02 +02001278 tryCatch({
1279 res <- apiCall(kco, req)
Marc Kupietzc643a122025-07-18 18:18:36 +02001280
Marc Kupietze52b2952025-07-17 16:53:02 +02001281 if (!is.null(res)) {
Marc Kupietz93787d52025-09-03 13:33:25 +02001282 # Store the raw annotation snippet (respect overwrite flag)
1283 if (overwrite || !existing_types$annotation_snippet || is.null(df$annotation_snippet[[i]]) || is.na(df$annotation_snippet[[i]])) {
1284 df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
1285 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001286
1287 # Parse XML annotations if snippet is available
1288 if (is.list(res) && "snippet" %in% names(res)) {
1289 parsed_annotations <- parse_xml_annotations_structured(res$snippet)
1290
1291 # Store the parsed linguistic data in data frame format (like tokens)
1292 # Use individual assignment to avoid data frame mismatch errors
1293 tryCatch({
1294 # Assign POS annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001295 if (overwrite || !existing_types$pos || is_empty_annotation_row(df$pos, i)) {
1296 df$pos$left[i] <- list(parsed_annotations$pos$left)
1297 df$pos$match[i] <- list(parsed_annotations$pos$match)
1298 df$pos$right[i] <- list(parsed_annotations$pos$right)
1299 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001300
Marc Kupietza29f3d42025-07-18 10:14:43 +02001301 # Assign lemma annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001302 if (overwrite || !existing_types$lemma || is_empty_annotation_row(df$lemma, i)) {
1303 df$lemma$left[i] <- list(parsed_annotations$lemma$left)
1304 df$lemma$match[i] <- list(parsed_annotations$lemma$match)
1305 df$lemma$right[i] <- list(parsed_annotations$lemma$right)
1306 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001307
Marc Kupietza29f3d42025-07-18 10:14:43 +02001308 # Assign morphology annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001309 if (overwrite || !existing_types$morph || is_empty_annotation_row(df$morph, i)) {
1310 df$morph$left[i] <- list(parsed_annotations$morph$left)
1311 df$morph$match[i] <- list(parsed_annotations$morph$match)
1312 df$morph$right[i] <- list(parsed_annotations$morph$right)
1313 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001314
Marc Kupietza29f3d42025-07-18 10:14:43 +02001315 # Assign token annotations
Marc Kupietz93787d52025-09-03 13:33:25 +02001316 if (overwrite || !existing_types$atokens || is_empty_annotation_row(df$atokens, i)) {
1317 df$atokens$left[i] <- list(parsed_annotations$atokens$left)
1318 df$atokens$match[i] <- list(parsed_annotations$atokens$match)
1319 df$atokens$right[i] <- list(parsed_annotations$atokens$right)
1320 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001321 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001322 # Set empty character vectors on assignment error using list assignment
Marc Kupietz93787d52025-09-03 13:33:25 +02001323 if (overwrite || !existing_types$pos) {
1324 df$pos$left[i] <<- list(character(0))
1325 df$pos$match[i] <<- list(character(0))
1326 df$pos$right[i] <<- list(character(0))
1327 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001328
Marc Kupietz93787d52025-09-03 13:33:25 +02001329 if (overwrite || !existing_types$lemma) {
1330 df$lemma$left[i] <<- list(character(0))
1331 df$lemma$match[i] <<- list(character(0))
1332 df$lemma$right[i] <<- list(character(0))
1333 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001334
Marc Kupietz93787d52025-09-03 13:33:25 +02001335 if (overwrite || !existing_types$morph) {
1336 df$morph$left[i] <<- list(character(0))
1337 df$morph$match[i] <<- list(character(0))
1338 df$morph$right[i] <<- list(character(0))
1339 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001340
Marc Kupietz93787d52025-09-03 13:33:25 +02001341 if (overwrite || !existing_types$atokens) {
1342 df$atokens$left[i] <<- list(character(0))
1343 df$atokens$match[i] <<- list(character(0))
1344 df$atokens$right[i] <<- list(character(0))
1345 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001346 })
Marc Kupietza29f3d42025-07-18 10:14:43 +02001347 } else {
1348 # No snippet available, store empty vectors
Marc Kupietz93787d52025-09-03 13:33:25 +02001349 if (overwrite || !existing_types$pos) {
1350 df$pos$left[i] <- list(character(0))
1351 df$pos$match[i] <- list(character(0))
1352 df$pos$right[i] <- list(character(0))
1353 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001354
Marc Kupietz93787d52025-09-03 13:33:25 +02001355 if (overwrite || !existing_types$lemma) {
1356 df$lemma$left[i] <- list(character(0))
1357 df$lemma$match[i] <- list(character(0))
1358 df$lemma$right[i] <- list(character(0))
1359 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001360
Marc Kupietz93787d52025-09-03 13:33:25 +02001361 if (overwrite || !existing_types$morph) {
1362 df$morph$left[i] <- list(character(0))
1363 df$morph$match[i] <- list(character(0))
1364 df$morph$right[i] <- list(character(0))
1365 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001366
Marc Kupietz93787d52025-09-03 13:33:25 +02001367 if (overwrite || !existing_types$atokens) {
1368 df$atokens$left[i] <- list(character(0))
1369 df$atokens$match[i] <- list(character(0))
1370 df$atokens$right[i] <- list(character(0))
1371 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001372 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001373 } else {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001374 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001375 if (overwrite || !existing_types$pos) {
1376 df$pos$left[i] <- list(NA)
1377 df$pos$match[i] <- list(NA)
1378 df$pos$right[i] <- list(NA)
1379 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001380
Marc Kupietz93787d52025-09-03 13:33:25 +02001381 if (overwrite || !existing_types$lemma) {
1382 df$lemma$left[i] <- list(NA)
1383 df$lemma$match[i] <- list(NA)
1384 df$lemma$right[i] <- list(NA)
1385 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001386
Marc Kupietz93787d52025-09-03 13:33:25 +02001387 if (overwrite || !existing_types$morph) {
1388 df$morph$left[i] <- list(NA)
1389 df$morph$match[i] <- list(NA)
1390 df$morph$right[i] <- list(NA)
1391 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001392
Marc Kupietz93787d52025-09-03 13:33:25 +02001393 if (overwrite || !existing_types$atokens) {
1394 df$atokens$left[i] <- list(NA)
1395 df$atokens$match[i] <- list(NA)
1396 df$atokens$right[i] <- list(NA)
1397 }
1398 if (overwrite || !existing_types$annotation_snippet) {
1399 df$annotation_snippet[[i]] <- NA
1400 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001401 }
1402 }, error = function(e) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001403 # Store NAs for failed requests
Marc Kupietz93787d52025-09-03 13:33:25 +02001404 if (overwrite || !existing_types$pos) {
1405 df$pos$left[i] <- list(NA)
1406 df$pos$match[i] <- list(NA)
1407 df$pos$right[i] <- list(NA)
1408 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001409
Marc Kupietz93787d52025-09-03 13:33:25 +02001410 if (overwrite || !existing_types$lemma) {
1411 df$lemma$left[i] <- list(NA)
1412 df$lemma$match[i] <- list(NA)
1413 df$lemma$right[i] <- list(NA)
1414 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001415
Marc Kupietz93787d52025-09-03 13:33:25 +02001416 if (overwrite || !existing_types$morph) {
1417 df$morph$left[i] <- list(NA)
1418 df$morph$match[i] <- list(NA)
1419 df$morph$right[i] <- list(NA)
1420 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001421
Marc Kupietz93787d52025-09-03 13:33:25 +02001422 if (overwrite || !existing_types$atokens) {
1423 df$atokens$left[i] <- list(NA)
1424 df$atokens$match[i] <- list(NA)
1425 df$atokens$right[i] <- list(NA)
1426 }
1427 if (overwrite || !existing_types$annotation_snippet) {
1428 df$annotation_snippet[[i]] <- NA
1429 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001430 })
1431 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001432
Marc Kupietza29f3d42025-07-18 10:14:43 +02001433 # Validate data frame structure before assignment
1434 if (nrow(df) != nrow(kqo@collectedMatches)) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001435 }
1436
1437 # Update the collectedMatches with annotation data
1438 tryCatch({
1439 kqo@collectedMatches <- df
1440 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001441 # Try a safer approach: add columns individually
1442 tryCatch({
1443 kqo@collectedMatches$pos <- df$pos
Marc Kupietzc643a122025-07-18 18:18:36 +02001444 kqo@collectedMatches$lemma <- df$lemma
Marc Kupietza29f3d42025-07-18 10:14:43 +02001445 kqo@collectedMatches$morph <- df$morph
1446 kqo@collectedMatches$atokens <- df$atokens
1447 kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
1448 }, error = function(col_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001449 warning("Failed to add annotation data to collectedMatches")
1450 })
1451 })
1452
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001453 if (verbose) {
1454 elapsed_time <- Sys.time() - start_time
1455 log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
1456 }
1457
Marc Kupietze52b2952025-07-17 16:53:02 +02001458 return(kqo)
1459})
1460
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001461#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +02001462#'
Marc Kupietz67edcb52021-09-20 21:54:24 +02001463#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001464#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +02001465#' confidence intervals of one ore multiple search terms across one or multiple
1466#' virtual corpora.
1467#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02001468#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +02001469#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +02001470#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +02001471#' \dontrun{
1472#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001473#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +02001474#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +01001475#' }
Marc Kupietz3f575282019-10-04 14:46:04 +02001476#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001477# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +01001478#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001479#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
1480#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +02001481#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
1482#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001483#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +02001484#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001485#'
1486#' @return A tibble, with each row containing the following result columns for query and vc combinations:
1487#' - **query**: the query string used for the frequency analysis.
1488#' - **totalResults**: absolute frequency of query matches in the vc.
1489#' - **vc**: virtual corpus used for the query.
1490#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
1491#' - **total**: total number of words in vc.
1492#' - **f**: relative frequency of query matches in the vc.
1493#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
1494#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
1495
Marc Kupietzd8851222025-05-01 10:57:19 +02001496setMethod(
1497 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +01001498 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +02001499 (if (as.alternatives) {
1500 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietzea34b812025-06-25 15:49:00 +02001501 group_by(vc) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +01001502 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +02001503 } else {
1504 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
1505 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietzea34b812025-06-25 15:49:00 +02001506 }) |>
Marc Kupietz0c29cea2019-10-09 08:44:36 +02001507 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +02001508 }
1509)
Marc Kupietz3f575282019-10-04 14:46:04 +02001510
Marc Kupietz38a9d682024-12-06 16:17:09 +01001511#' buildWebUIRequestUrlFromString
1512#'
1513#' @rdname KorAPQuery-class
1514#' @importFrom urltools url_encode
1515#' @export
1516buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +02001517 query,
1518 vc = "",
1519 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001520 if ("KorAPConnection" %in% class(KorAPUrl)) {
1521 KorAPUrl <- KorAPUrl@KorAPUrl
1522 }
1523
1524 request <-
1525 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +02001526 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001527 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +02001528 ifelse(vc != "",
1529 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
1530 ""
1531 ),
1532 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001533 ql
1534 )
1535 paste0(KorAPUrl, request)
1536}
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001537
1538#' buildWebUIRequestUrl
1539#'
1540#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +01001541#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001542#' @export
1543buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +02001544 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001545 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +02001546 } else {
1547 httr2::url_parse(KorAPUrl)$query$q
1548 },
Marc Kupietzf9129592025-01-26 19:17:54 +01001549 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001550 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +01001551 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001552 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001553}
1554
Marc Kupietzd8851222025-05-01 10:57:19 +02001555#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +02001556#' @rdname KorAPQuery-class
1557#' @param x KorAPQuery object
1558#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001559#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +02001560#' @export
1561format.KorAPQuery <- function(x, ...) {
1562 cat("<KorAPQuery>\n")
1563 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +02001564 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001565 cat(" Query: ", param$q, "\n")
1566 if (!is.null(param$cq) && param$cq != "") {
1567 cat(" Virtual corpus: ", param$cq, "\n")
1568 }
1569 if (!is.null(q@collectedMatches)) {
1570 cat("==============================================================================================================", "\n")
1571 print(summary(q@collectedMatches))
1572 cat("==============================================================================================================", "\n")
1573 }
1574 cat(" Total results: ", q@totalResults, "\n")
1575 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietza29f3d42025-07-18 10:14:43 +02001576 if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
1577 successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
1578 parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
1579 cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
1580 if (parsed_annotations > 0) {
1581 cat(" (", parsed_annotations, " with parsed linguistic data)")
1582 }
1583 cat("\n")
Marc Kupietze52b2952025-07-17 16:53:02 +02001584 }
Marc Kupietz62da2b52019-09-12 17:43:34 +02001585}
1586
Marc Kupietze95108e2019-09-18 13:23:58 +02001587#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +02001588#'
Marc Kupietze95108e2019-09-18 13:23:58 +02001589#' @rdname KorAPQuery-class
1590#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +02001591#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +02001592setMethod("show", "KorAPQuery", function(object) {
1593 format(object)
Marc Kupietzc643a122025-07-18 18:18:36 +02001594 invisible(object)
Marc Kupietze95108e2019-09-18 13:23:58 +02001595})