blob: cbf0169fb18fae6708bddd3e5b145180c86e3811 [file] [log] [blame]
Marc Kupietza8c40f42025-06-24 15:49:52 +02001#' KorAPQuery class (internal)
Marc Kupietze95108e2019-09-18 13:23:58 +02002#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02003#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02005#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +02006#' @include KorAPConnection.R
Marc Kupietz6dfeed92025-06-03 11:58:06 +02007#' @include logging.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
Marc Kupietza29f3d42025-07-18 10:14:43 +020024 "hasMoreMatches"
Marc Kupietze95108e2019-09-18 13:23:58 +020025))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietza8c40f42025-06-24 15:49:52 +020027#' Initialize KorAPQuery object
28#' @keywords internal
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +020044setMethod(
45 "initialize", "KorAPQuery",
46 function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
47 "corpusSigle", "textSigle", "pubDate", "pubPlace",
48 "availability", "textClass", "snippet", "tokens"
49 ),
Marc Kupietza29f3d42025-07-18 10:14:43 +020050 requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
Marc Kupietzd8851222025-05-01 10:57:19 +020051 .Object <- callNextMethod()
52 .Object@korapConnection <- korapConnection
53 .Object@request <- request
54 .Object@vc <- vc
55 .Object@totalResults <- totalResults
56 .Object@nextStartIndex <- nextStartIndex
57 .Object@fields <- fields
58 .Object@requestUrl <- requestUrl
59 .Object@webUIRequestUrl <- webUIRequestUrl
60 .Object@apiResponse <- apiResponse
61 .Object@hasMoreMatches <- hasMoreMatches
62 .Object@collectedMatches <- collectedMatches
63 .Object
64 }
65)
Marc Kupietz632cbd42019-09-06 16:04:51 +020066
Marc Kupietzd8851222025-05-01 10:57:19 +020067setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
68setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
69setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
70setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietze52b2952025-07-17 16:53:02 +020071setGeneric("fetchAnnotations", function(kqo, ...) standardGeneric("fetchAnnotations"))
Marc Kupietzd8851222025-05-01 10:57:19 +020072setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietze95108e2019-09-18 13:23:58 +020073
74maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020075
Marc Kupietz4de53ec2019-10-04 09:12:00 +020076## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010077utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020078
Marc Kupietza8c40f42025-06-24 15:49:52 +020079#' Search corpus for query terms
Marc Kupietzdbd431a2021-08-29 12:17:45 +020080#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020081#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020082#'
Marc Kupietza8c40f42025-06-24 15:49:52 +020083#' @family corpus search functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020084#' @aliases corpusQuery
85#'
86#' @importFrom urltools url_encode
87#' @importFrom purrr pmap
Marc Kupietzea34b812025-06-25 15:49:00 +020088#' @importFrom dplyr bind_rows group_by
Marc Kupietzdbd431a2021-08-29 12:17:45 +020089#'
Marc Kupietz617266d2025-02-27 10:43:07 +010090#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020091#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020092#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020093#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020094#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
95#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
96#' your RKorAPClient application as explained in the
97#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
98#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
99#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200100#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz1623fe82025-06-24 16:31:46 +0200101#' @param fields character vector specifying which metadata fields to retrieve for each match.
102#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
103#' \describe{
104#' \item{**Text identification**:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
105#' \item{**Publication info**:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
106#' \item{**Temporal data**:}{`pubDate`, `creationDate` - when text was published/created}
107#' \item{**Publication details**:}{`pubPlace`, `publisher`, `reference` - where/how published}
108#' \item{**Text classification**:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
109#' \item{**Adminstrative and technical info**:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
110#' \item{**Content data**:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
111#' \item{**System data**:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
112#' }
113#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
114#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100115#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +0200116#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200117#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200118#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200119#' @param context string that specifies the size of the left and the right context returned in `snippet`
120#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
121#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
122#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
123#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
124#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200125#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200126#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
127#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200128#'
129#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200130#' \dontrun{
131#'
Marc Kupietz1623fe82025-06-24 16:31:46 +0200132#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200133#' KorAPConnection() |>
134#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200135#' fetchAll()
Marc Kupietz1623fe82025-06-24 16:31:46 +0200136#'
137#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietzd3526422025-06-25 09:16:15 +0200138#' query <- KorAPConnection() |>
Marc Kupietz1623fe82025-06-24 16:31:46 +0200139#' corpusQuery("Ameisenplage",
140#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
141#' results <- fetchAll(query)
142#' results@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100143#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200144#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200145#' \dontrun{
146#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200147#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
148#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200149#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200150#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200151#' corpusQuery(
152#' KorAPUrl =
153#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
154#' )
Marc Kupietz6ae76052021-09-21 10:34:00 +0200155#' }
156#'
157#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200158#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200159#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietzd3526422025-06-25 09:16:15 +0200160#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200161#' {
162#' . ->> kco
Marc Kupietzd3526422025-06-25 09:16:15 +0200163#' } |>
164#' corpusQuery("Ameisenplage") |>
165#' fetchAll() |>
166#' slot("collectedMatches") |>
167#' mutate(year = lubridate::year(pubDate)) |>
168#' dplyr::select(year) |>
169#' group_by(year) |>
170#' summarise(Count = dplyr::n()) |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200171#' mutate(Freq = mapply(function(f, y) {
172#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietzd3526422025-06-25 09:16:15 +0200173#' }, Count, year)) |>
174#' dplyr::select(-Count) |>
175#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200176#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100177#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200178#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200179#'
180#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200181#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200182#'
183#' @export
Marc Kupietzd8851222025-05-01 10:57:19 +0200184setMethod(
185 "corpusQuery", "KorAPConnection",
186 function(kco,
187 query = if (missing(KorAPUrl)) {
188 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
189 } else {
190 httr2::url_parse(KorAPUrl)$query$q
191 },
192 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
193 KorAPUrl,
194 metadataOnly = TRUE,
195 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
196 fields = c(
197 "corpusSigle",
198 "textSigle",
199 "pubDate",
200 "pubPlace",
201 "availability",
202 "textClass",
203 "snippet",
204 "tokens"
205 ),
206 accessRewriteFatal = TRUE,
207 verbose = kco@verbose,
208 expand = length(vc) != length(query),
209 as.df = FALSE,
210 context = NULL) {
211 if (length(query) > 1 || length(vc) > 1) {
212 grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200213
214 # Initialize timing variables for ETA calculation
215 total_queries <- nrow(grid)
216 current_query <- 0
217 start_time <- Sys.time()
218
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200219 results <- purrr::pmap(grid, function(query, vc, ...) {
220 current_query <<- current_query + 1
221
222 # Execute the single query directly (avoiding recursive call)
223 contentFields <- c("snippet", "tokens")
224 query_fields <- fields
225 if (metadataOnly) {
226 query_fields <- query_fields[!query_fields %in% contentFields]
227 }
228 if (!"textSigle" %in% query_fields) {
229 query_fields <- c(query_fields, "textSigle")
230 }
231 request <-
232 paste0(
233 "?q=",
234 url_encode(enc2utf8(query)),
235 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
236 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
237 ifelse(!metadataOnly, "&show-tokens=true", ""),
238 "&ql=", ql
239 )
240 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
241 requestUrl <- paste0(
242 kco@apiUrl,
243 "search",
244 request,
245 "&fields=",
246 paste(query_fields, collapse = ","),
247 if (metadataOnly) "&access-rewrite-disabled=true" else ""
248 )
249
250 # Show individual query progress
251 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
252 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
253 if (is.null(res)) {
254 log_info(verbose, ": API call failed\n")
255 totalResults <- 0
256 } else {
257 totalResults <- as.integer(res$meta$totalResults)
258 log_info(verbose, ": ", totalResults, " hits")
259 if (!is.null(res$meta$cached)) {
260 log_info(verbose, " [cached]")
261 } else if (!is.null(res$meta$benchmark)) {
262 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
263 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
264 formatted_time <- paste0(round(time_value, 2), "s")
265 log_info(verbose, ", took ", formatted_time)
266 } else {
267 log_info(verbose, ", took ", res$meta$benchmark)
268 }
269 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200270
271 # Calculate and display ETA information on the same line if verbose and we have more than one query
272 if (verbose && total_queries > 1) {
273 eta_info <- calculate_eta(current_query, total_queries, start_time)
274 if (eta_info != "") {
275 elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
276 avg_time_per_query <- elapsed_time / current_query
277
278 # Add ETA info to the same line - remove the leading ". " for cleaner formatting
279 clean_eta_info <- sub("^\\. ", ". ", eta_info)
280 log_info(verbose, clean_eta_info)
281 }
282 }
283
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200284 log_info(verbose, "\n")
285 }
286
287 result <- data.frame(
288 query = query,
289 totalResults = totalResults,
290 vc = vc,
291 webUIRequestUrl = webUIRequestUrl,
292 stringsAsFactors = FALSE
293 )
294
Marc Kupietz6ef61a82025-05-29 16:07:03 +0200295 return(result)
296 })
297
298 results %>% bind_rows()
Marc Kupietzd8851222025-05-01 10:57:19 +0200299 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200300 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100301 if (metadataOnly) {
302 fields <- fields[!fields %in% contentFields]
303 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100304 if (!"textSigle" %in% fields) {
305 fields <- c(fields, "textSigle")
306 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100307 request <-
Marc Kupietzd8851222025-05-01 10:57:19 +0200308 paste0(
309 "?q=",
310 url_encode(enc2utf8(query)),
311 ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
312 ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
313 ifelse(!metadataOnly, "&show-tokens=true", ""),
314 "&ql=", ql
315 )
Marc Kupietza96537f2019-11-09 23:07:44 +0100316 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
317 requestUrl <- paste0(
318 kco@apiUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +0200319 "search",
Marc Kupietza96537f2019-11-09 23:07:44 +0100320 request,
Marc Kupietzd8851222025-05-01 10:57:19 +0200321 "&fields=",
Marc Kupietza96537f2019-11-09 23:07:44 +0100322 paste(fields, collapse = ","),
Marc Kupietzd8851222025-05-01 10:57:19 +0200323 if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietza96537f2019-11-09 23:07:44 +0100324 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200325 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
326 sep =
327 ""
328 )
329 res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietza4675722022-02-23 23:55:15 +0100330 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100331 message("API call failed.")
332 totalResults <- 0
333 } else {
Marc Kupietzd8851222025-05-01 10:57:19 +0200334 totalResults <- as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200335 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietzd8851222025-05-01 10:57:19 +0200336 if (!is.null(res$meta$cached)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200337 log_info(verbose, " [cached]\n")
Marc Kupietzd8851222025-05-01 10:57:19 +0200338 } else if (!is.null(res$meta$benchmark)) {
Marc Kupietz7638ca42025-05-25 13:18:16 +0200339 # Round the benchmark time to 2 decimal places for better readability
340 # If it's a string ending with 's', extract the number, round it, and re-add 's'
341 if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
342 time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
343 formatted_time <- paste0(round(time_value, 2), "s")
344 log_info(verbose, ", took ", formatted_time, "\n", sep = "")
345 } else {
346 # Fallback if the format is different than expected
347 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
348 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200349 } else {
350 log_info(verbose, "\n")
351 }
Marc Kupietza4675722022-02-23 23:55:15 +0100352 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200353 if (as.df) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100354 data.frame(
355 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100356 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100357 vc = vc,
358 webUIRequestUrl = webUIRequestUrl,
359 stringsAsFactors = FALSE
360 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200361 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100362 KorAPQuery(
363 korapConnection = kco,
364 nextStartIndex = 0,
365 fields = fields,
366 requestUrl = requestUrl,
367 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100368 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100369 vc = vc,
370 apiResponse = res,
371 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100372 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100373 )
Marc Kupietzd8851222025-05-01 10:57:19 +0200374 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100375 }
Marc Kupietzd8851222025-05-01 10:57:19 +0200376 }
377)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200378
Marc Kupietz05a60792024-12-07 16:23:31 +0100379#' @importFrom purrr map
380repair_data_strcuture <- function(x) {
Marc Kupietzd8851222025-05-01 10:57:19 +0200381 if (is.list(x)) {
382 as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz05a60792024-12-07 16:23:31 +0100383 paste(.x, collapse = " ")
384 } else {
385 .x
386 }))
Marc Kupietzd8851222025-05-01 10:57:19 +0200387 } else {
Marc Kupietz05a60792024-12-07 16:23:31 +0100388 ifelse(is.na(x), "", x)
Marc Kupietzd8851222025-05-01 10:57:19 +0200389 }
Marc Kupietz05a60792024-12-07 16:23:31 +0100390}
391
Marc Kupietz62da2b52019-09-12 17:43:34 +0200392#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200393#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200394#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200395#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200396#' @family corpus search functions
397#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200398#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200399#' @param offset start offset for query results to fetch
400#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200401#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200402#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
403#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200404#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100405#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200406#' \dontrun{
407#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200408#' q <- KorAPConnection() |>
409#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200410#' fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100411#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100412#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100413#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200414#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200415#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200416#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200417#' @aliases fetchNext
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200418#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100419#' @importFrom tibble enframe add_column
420#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200421#' @importFrom tidyr unnest unchop pivot_wider
422#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200423#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200424setMethod("fetchNext", "KorAPQuery", function(kqo,
425 offset = kqo@nextStartIndex,
426 maxFetch = maxResultsPerPage,
427 verbose = kqo@korapConnection@verbose,
428 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100429 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietzd8851222025-05-01 10:57:19 +0200430 results <- key <- name <- tmp_positions <- 0
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100431
Marc Kupietze95108e2019-09-18 13:23:58 +0200432 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
433 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200434 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200435 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz623d7122025-05-25 12:46:12 +0200436 # Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietze95108e2019-09-18 13:23:58 +0200437 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200438
Marc Kupietz24799fd2025-06-25 14:15:36 +0200439 # Track start time for ETA calculation
440 start_time <- Sys.time()
441
Marc Kupietz623d7122025-05-25 12:46:12 +0200442 # For randomized page order, generate a list of randomized page indices
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200443 if (randomizePageOrder) {
Marc Kupietz623d7122025-05-25 12:46:12 +0200444 # Calculate how many pages we need to fetch based on maxFetch
445 total_pages_to_fetch <- if (!is.na(maxFetch)) {
446 # Either limited by maxFetch or total results, whichever is smaller
447 min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
448 } else {
449 # All pages
450 ceiling(kqo@totalResults / maxResultsPerPage)
451 }
452
453 # Generate randomized page indices (0-based for API)
454 pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
455 page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200456 }
457
Marc Kupietzd8851222025-05-01 10:57:19 +0200458 if (is.null(collectedMatches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200459 collectedMatches <- data.frame()
460 }
Marc Kupietz623d7122025-05-25 12:46:12 +0200461
462 # Initialize the page counter properly based on nextStartIndex and any previously fetched results
463 # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
464 # For first call, this will be 1, for subsequent calls, it will reflect our actual position
465 current_page_number <- ceiling(offset / maxResultsPerPage) + 1
466
467 # For sequential fetches, keep track of which global page we're on
468 # This is important for correctly showing page numbers in subsequent fetchNext calls
469 page_count_start <- current_page_number
470
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200471 repeat {
Marc Kupietz623d7122025-05-25 12:46:12 +0200472 # Determine which page to fetch next
473 if (randomizePageOrder) {
474 # In randomized mode, get the page from our randomized list using the page_index
475 # Make sure we don't exceed the array bounds
476 if (page_index > length(pages)) {
477 break # No more pages to fetch in randomized mode
478 }
479 current_offset_page <- pages[page_index]
480 # For display purposes in randomized mode, show which page out of the total we're fetching
481 display_page_number <- page_index
482 } else {
483 # In sequential mode, use the current_page_number to calculate the offset
484 current_offset_page <- (current_page_number - 1)
485 display_page_number <- current_page_number
486 }
487
488 # Calculate the actual offset in tokens
489 currentOffset <- current_offset_page * maxResultsPerPage
490
Marc Kupietzef0e9392025-06-18 12:21:49 +0200491 # Build the query with the appropriate count and offset using httr2
492 count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietzecc86702025-06-24 12:12:51 +0200493
Marc Kupietzef0e9392025-06-18 12:21:49 +0200494 # Parse existing URL to preserve all query parameters
495 parsed_url <- httr2::url_parse(kqo@requestUrl)
496 existing_query <- parsed_url$query
Marc Kupietzecc86702025-06-24 12:12:51 +0200497
Marc Kupietzef0e9392025-06-18 12:21:49 +0200498 # Add/update count and offset parameters
499 existing_query$count <- count_param
500 existing_query$offset <- currentOffset
Marc Kupietzecc86702025-06-24 12:12:51 +0200501
Marc Kupietzef0e9392025-06-18 12:21:49 +0200502 # Rebuild the URL with all parameters
503 query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz68170952021-06-30 09:37:21 +0200504 res <- apiCall(kqo@korapConnection, query)
505 if (length(res$matches) == 0) {
506 break
507 }
508
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200509 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100510 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100511 currentMatches <- res$matches$fields %>%
512 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
513 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200514 tidyr::unnest(cols = value) %>%
515 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200516 dplyr::select(-name)
Marc Kupietzd8851222025-05-01 10:57:19 +0200517 if ("snippet" %in% colnames(res$matches)) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200518 currentMatches$snippet <- res$matches$snippet
519 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100520 if ("tokens" %in% colnames(res$matches)) {
521 currentMatches$tokens <- res$matches$tokens
522 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200523 } else {
524 currentMatches <- res$matches
525 }
526
Marc Kupietze95108e2019-09-18 13:23:58 +0200527 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200528 if (!field %in% colnames(currentMatches)) {
529 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200530 }
531 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100532 currentMatches <- currentMatches %>%
533 select(kqo@fields) %>%
534 mutate(
Marc Kupietzff712a92025-07-18 09:07:23 +0200535 matchID = res$matches$matchID,
Marc Kupietz0447da02025-01-08 20:51:09 +0100536 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100537 matchStart = as.integer(stringr::word(tmp_positions, 1)),
538 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
539 ) %>%
540 select(-tmp_positions)
541
Marc Kupietz62da2b52019-09-12 17:43:34 +0200542 if (!is.list(collectedMatches)) {
543 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200544 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200545 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200546 }
Marc Kupietzae9b6172025-05-02 15:50:01 +0200547
Marc Kupietz623d7122025-05-25 12:46:12 +0200548 # Get the actual items per page from the API response
549 # We now consistently use maxResultsPerPage instead
Marc Kupietzacbaab02025-05-01 10:56:35 +0200550
Marc Kupietz623d7122025-05-25 12:46:12 +0200551 # Calculate total pages consistently using fixed maxResultsPerPage
552 # This ensures consistent page counting across the function
553 total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
554
Marc Kupietz24799fd2025-06-25 14:15:36 +0200555 # Calculate ETA using the centralized function from logging.R
556 current_page <- if (randomizePageOrder) page_index else display_page_number
557 total_pages_to_fetch <- if (!is.na(maxFetch)) {
558 # Account for offset - we can only fetch from the remaining results after offset
559 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
560 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
561 } else {
562 total_pages
563 }
Marc Kupietz365660e2025-06-25 15:09:55 +0200564
Marc Kupietz24799fd2025-06-25 14:15:36 +0200565 eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz365660e2025-06-25 15:09:55 +0200566
Marc Kupietz24799fd2025-06-25 14:15:36 +0200567 # Extract timing information for display
Marc Kupietzae9b6172025-05-02 15:50:01 +0200568 time_per_page <- NA
Marc Kupietzae9b6172025-05-02 15:50:01 +0200569 if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietzae9b6172025-05-02 15:50:01 +0200570 time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200571 }
572
Marc Kupietz623d7122025-05-25 12:46:12 +0200573 # Create the page display string with proper formatting
Marc Kupietzacbaab02025-05-01 10:56:35 +0200574
Marc Kupietz623d7122025-05-25 12:46:12 +0200575 # For global page tracking, calculate the absolute page number
576 actual_display_number <- if (randomizePageOrder) {
577 current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
578 } else {
579 # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
580 current_offset_page + 1
581 }
582
583 # For subsequent calls to fetchNext, we need to calculate the correct page numbers
584 # based on the current batch being fetched
585
586 # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
587 # Simply count from 1 within the current batch
588
589 # The relative page number is simply the current position in this batch
590 if (randomizePageOrder) {
591 relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
592 } else {
593 relative_page_number <- display_page_number - (page_count_start - 1)
594 }
595
596 # How many pages will we fetch in this batch?
Marc Kupietz021663d2025-06-18 17:49:22 +0200597 # If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz623d7122025-05-25 12:46:12 +0200598 pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz021663d2025-06-18 17:49:22 +0200599 # Account for offset - we can only fetch from the remaining results after offset
600 remaining_results_after_offset <- max(0, kqo@totalResults - offset)
601 min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz623d7122025-05-25 12:46:12 +0200602 } else {
603 # Otherwise fetch all remaining pages
604 total_pages - page_count_start + 1
605 }
606
607 # The total pages to be shown in this batch
608 batch_total_pages <- pages_in_this_batch
609
610 page_display <- paste0(
611 "Retrieved page ",
612 sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
613 "/",
614 sprintf("%d", batch_total_pages)
615 )
616
617 # If randomized, also show which actual page we fetched
618 if (randomizePageOrder) {
619 # Determine the maximum width needed for page numbers (based on total pages)
620 # This ensures consistent alignment
621 max_page_width <- nchar(as.character(total_pages))
622 # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz7638ca42025-05-25 13:18:16 +0200623 page_display <- paste0(
624 page_display,
625 sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
626 )
Marc Kupietz623d7122025-05-25 12:46:12 +0200627 }
628 # Always show the absolute page number and total pages (for clarity)
629 else {
630 # Show the absolute page number (out of total possible pages)
631 page_display <- paste0(page_display, sprintf(
632 " (page %d of %d total)",
633 actual_display_number, total_pages
634 ))
635 }
636
637 # Add caching or timing information
638 if (!is.null(res$meta$cached)) {
639 page_display <- paste0(page_display, " [cached]")
640 } else {
641 page_display <- paste0(
642 page_display,
643 " in ",
644 if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz24799fd2025-06-25 14:15:36 +0200645 "s",
646 eta_info
Marc Kupietz623d7122025-05-25 12:46:12 +0200647 )
648 }
649
650 log_info(verbose, paste0(page_display, "\n"))
651
652 # Increment the appropriate counter based on mode
653 if (randomizePageOrder) {
654 page_index <- page_index + 1
655 } else {
656 current_page_number <- current_page_number + 1
657 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200658 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200659 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200660 break
661 }
662 }
Marc Kupietz68170952021-06-30 09:37:21 +0200663 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietzd8851222025-05-01 10:57:19 +0200664 KorAPQuery(
665 nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200666 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200667 fields = kqo@fields,
668 requestUrl = kqo@requestUrl,
669 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200670 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200671 vc = kqo@vc,
672 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200673 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200674 apiResponse = res,
Marc Kupietzd8851222025-05-01 10:57:19 +0200675 collectedMatches = collectedMatches
676 )
Marc Kupietze95108e2019-09-18 13:23:58 +0200677})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200678
679#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200680#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200681#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100682#'
Marc Kupietza8c40f42025-06-24 15:49:52 +0200683#' @family corpus search functions
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200684#' @param kqo object obtained from [corpusQuery()]
685#' @param verbose print progress information if true
686#' @param ... further arguments passed to [fetchNext()]
687#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietza8c40f42025-06-24 15:49:52 +0200688#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200689#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200690#' \dontrun{
Marc Kupietzecc86702025-06-24 12:12:51 +0200691#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
692#' q <- KorAPConnection() |>
693#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200694#' fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200695#' q@collectedMatches
Marc Kupietzecc86702025-06-24 12:12:51 +0200696#'
697#' # Fetch also all KWICs
698#' q <- KorAPConnection() |> auth() |>
699#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
700#' fetchAll()
701#' q@collectedMatches
702#'
703#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
704#' q <- KorAPConnection() |>
705#' corpusQuery("<base/s=t>", # this matches each text once
706#' vc = "pubDate in 1958-03-12",
707#' fields = c("textSigle", "title"),
708#' ) |>
709#' fetchAll()
710#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100711#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200712#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200713#' @aliases fetchAll
Marc Kupietz62da2b52019-09-12 17:43:34 +0200714#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200715setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
716 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200717})
718
719#' Fetches the remaining results of a KorAP query.
720#'
Marc Kupietzdc880ac2025-06-24 20:34:43 +0200721#' @param kqo object obtained from [corpusQuery()]
722#' @param verbose print progress information if true
723#' @param ... further arguments passed to [fetchNext()]
724#' @return The updated `kqo` object with remaining results in `@collectedMatches`
725#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200726#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200727#' \dontrun{
728#'
Marc Kupietzd3526422025-06-25 09:16:15 +0200729#' q <- KorAPConnection() |>
730#' corpusQuery("Ameisenplage") |>
Marc Kupietzd8851222025-05-01 10:57:19 +0200731#' fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200732#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100733#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200734#'
735#' @aliases fetchRest
Marc Kupietze95108e2019-09-18 13:23:58 +0200736#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200737setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
738 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200739})
740
Marc Kupietza29f3d42025-07-18 10:14:43 +0200741#'
742#' Parse XML annotations into linguistic layers
743#'
744#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
745#' from XML annotation snippets returned by the KorAP API.
746#'
747#' @param xml_snippet XML string containing annotation data
748#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
749#' @keywords internal
750parse_xml_annotations <- function(xml_snippet) {
751 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
752 return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
753 }
754
755 # Extract content within <span class="match">...</span> using a more robust approach
756 if (grepl('<span class="match">', xml_snippet)) {
757 # Find the start of match span
758 start_pos <- regexpr('<span class="match">', xml_snippet)
759 if (start_pos > 0) {
760 # Find the end by counting nested spans
761 content_start <- start_pos + attr(start_pos, "match.length")
762 remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
763
764 # Simple approach: extract everything until we hit context-right or end
765 if (grepl('<span class="context-right">', remaining)) {
766 content_to_parse <- gsub('(.*?)<span class="context-right">.*', '\\1', remaining)
767 } else {
768 # Find the closing </span> that matches our opening span
769 # For now, use a simpler approach - take everything until the last </span> sequence
770 content_to_parse <- gsub('(.*)</span>\\s*$', '\\1', remaining)
771 }
772 } else {
773 content_to_parse <- xml_snippet
774 }
775 } else {
776 content_to_parse <- xml_snippet
777 }
778
779 # Initialize result vectors
780 tokens <- character(0)
781 lemmas <- character(0)
782 pos_tags <- character(0)
783 morph_tags <- character(0)
784
785 # Split the content by </span> and process each meaningful part
786 parts <- unlist(strsplit(content_to_parse, '</span>'))
787
788 for (part in parts) {
789 part <- trimws(part)
790 if (nchar(part) == 0) next
791
792 # Look for parts that have title attributes and end with text
793 if (grepl('<span[^>]*title=', part)) {
794 # Extract the text content (everything after the last >)
795 text_content <- gsub('.*>([^<]*)$', '\\1', part)
796 text_content <- trimws(text_content)
797
798 if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
799 tokens <- c(tokens, text_content)
800
801 # Extract all title attributes from this part
802 title_pattern <- 'title="([^"]*)"'
803 title_matches <- gregexpr(title_pattern, part)
804
805 lemma <- NA
806 pos_tag <- NA
807 morph_tag <- NA
808
809 if (title_matches[[1]][1] != -1) {
810 all_titles <- regmatches(part, title_matches)[[1]]
811 for (title_match in all_titles) {
812 title_content <- gsub(title_pattern, '\\1', title_match)
Marc Kupietzc643a122025-07-18 18:18:36 +0200813
Marc Kupietza29f3d42025-07-18 10:14:43 +0200814 # Split by spaces and process each annotation
815 annotations <- unlist(strsplit(title_content, "\\s+"))
816 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200817 if (grepl('^[^/]+/l:', annotation)) {
818 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
819 } else if (grepl('^[^/]+/p:', annotation)) {
820 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
821 } else if (grepl('^[^/]+/m:', annotation)) {
822 morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200823 }
824 }
825 }
826 }
827
828 lemmas <- c(lemmas, lemma)
829 pos_tags <- c(pos_tags, pos_tag)
830 morph_tags <- c(morph_tags, morph_tag)
831 }
832 }
833 }
834
835 # If no tokens found with the splitting approach, try a different method
836 if (length(tokens) == 0) {
837 # Look for the innermost spans that contain actual text
838 innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
839 innermost_matches <- gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
840
841 if (innermost_matches[[1]][1] != -1) {
842 matches <- regmatches(content_to_parse, innermost_matches)[[1]]
843
844 for (match in matches) {
845 title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
846 text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
847 text <- trimws(text)
848
849 if (nchar(text) > 0) {
850 tokens <- c(tokens, text)
851
852 # Parse space-separated annotations in title
853 lemma <- NA
854 pos_tag <- NA
855 morph_tag <- NA
Marc Kupietzc643a122025-07-18 18:18:36 +0200856
Marc Kupietza29f3d42025-07-18 10:14:43 +0200857 annotations <- unlist(strsplit(title, "\\s+"))
858 for (annotation in annotations) {
Marc Kupietzc643a122025-07-18 18:18:36 +0200859 if (grepl('^[^/]+/l:', annotation)) {
860 lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
861 } else if (grepl('^[^/]+/p:', annotation)) {
862 pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
863 } else if (grepl('^[^/]+/m:', annotation)) {
864 morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200865 }
866 }
Marc Kupietzc643a122025-07-18 18:18:36 +0200867
Marc Kupietza29f3d42025-07-18 10:14:43 +0200868 lemmas <- c(lemmas, lemma)
869 pos_tags <- c(pos_tags, pos_tag)
870 morph_tags <- c(morph_tags, morph_tag)
871 }
872 }
873 }
874 }
875
876 # Ensure all vectors have the same length
877 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
878 if (max_length > 0) {
879 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
880 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
881 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
882 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
883 }
884
885 return(list(
886 token = tokens,
887 lemma = lemmas,
888 pos = pos_tags,
889 morph = morph_tags
890 ))
891}
892
893#'
894#' Parse XML annotations into linguistic layers with left/match/right structure
895#'
896#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
897#' from XML annotation snippets returned by the KorAP API, split into left context,
898#' match, and right context sections like the tokens field.
899#'
900#' @param xml_snippet XML string containing annotation data
901#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
902#' @keywords internal
903parse_xml_annotations_structured <- function(xml_snippet) {
904 if (is.null(xml_snippet) || is.na(xml_snippet) || xml_snippet == "") {
905 empty_result <- list(left = character(0), match = character(0), right = character(0))
906 return(list(
907 atokens = empty_result,
908 lemma = empty_result,
909 pos = empty_result,
910 morph = empty_result
911 ))
912 }
913
914 # Helper function to extract annotations from a span section
915 extract_annotations_from_section <- function(section_content) {
Marc Kupietz7ff770e2025-07-18 19:07:10 +0200916 # Handle both spaced tokens and nested single tokens
Marc Kupietza29f3d42025-07-18 10:14:43 +0200917 tokens <- character(0)
918 lemmas <- character(0)
919 pos_tags <- character(0)
920 morph_tags <- character(0)
Marc Kupietz7ff770e2025-07-18 19:07:10 +0200921
922 # First try to split by spaces between span groups (for multiple tokens)
923 # Look for spaces that separate token groups
924 if (grepl('</span>\\s+<span', section_content)) {
925 # Multiple tokens separated by spaces
926 token_groups <- unlist(strsplit(section_content, '(?<=</span>)\\s+(?=<span)', perl = TRUE))
927 } else {
928 # Single token (or no spaces between tokens)
929 token_groups <- c(section_content)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200930 }
Marc Kupietz7ff770e2025-07-18 19:07:10 +0200931
932 for (group in token_groups) {
933 group <- trimws(group)
934 if (nchar(group) == 0) next
935
936 # Extract the actual text content (the innermost text)
937 text_match <- regexpr('>([^<>]+)</span>', group, perl = TRUE)
938 if (text_match > 0) {
939 # Find all possible text contents and take the last one (innermost)
940 all_texts <- regmatches(group, gregexpr('>([^<>]+)</span>', group, perl = TRUE))[[1]]
941 if (length(all_texts) > 0) {
942 # Take the last match (innermost text)
943 text_content <- sub('.*>([^<>]+)</span>.*', '\\1', all_texts[length(all_texts)], perl = TRUE)
944 text_content <- trimws(text_content)
945
946 if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
947 tokens <- c(tokens, text_content)
948
949 # Extract all title attributes from this group
950 titles <- regmatches(group, gregexpr('title="([^"]*)"', group, perl = TRUE))[[1]]
951
952 morph_features <- character(0)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200953 lemma <- NA
954 pos_tag <- NA
Marc Kupietz7ff770e2025-07-18 19:07:10 +0200955
956 for (title in titles) {
957 content <- sub('title="([^"]*)"', '\\1', title, perl = TRUE)
958
959 if (grepl('^[^/]+/l:', content)) {
960 lemma <- sub('^[^/]+/l:(.*)$', '\\1', content)
961 } else if (grepl('^[^/]+/p:', content)) {
962 pos_tag <- sub('^[^/]+/p:(.*)$', '\\1', content)
963 } else if (grepl('^[^/]+/m:', content)) {
964 morph_feature <- sub('^[^/]+/m:(.*)$', '\\1', content)
965 morph_features <- c(morph_features, morph_feature)
Marc Kupietza29f3d42025-07-18 10:14:43 +0200966 }
967 }
Marc Kupietz7ff770e2025-07-18 19:07:10 +0200968
Marc Kupietza29f3d42025-07-18 10:14:43 +0200969 lemmas <- c(lemmas, lemma)
970 pos_tags <- c(pos_tags, pos_tag)
Marc Kupietz7ff770e2025-07-18 19:07:10 +0200971 morph_tag <- if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA
Marc Kupietza29f3d42025-07-18 10:14:43 +0200972 morph_tags <- c(morph_tags, morph_tag)
973 }
974 }
975 }
976 }
977
978 # Ensure all vectors have the same length
979 max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
980 if (max_length > 0) {
981 tokens <- c(tokens, rep(NA, max_length - length(tokens)))
982 lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
983 pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
984 morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
985 }
986
987 return(list(
988 tokens = tokens,
989 lemmas = lemmas,
990 pos_tags = pos_tags,
991 morph_tags = morph_tags
992 ))
993 }
994
995 # Split the XML into three parts: left context, match content, and right context
996 # The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
Marc Kupietzc643a122025-07-18 18:18:36 +0200997
Marc Kupietza29f3d42025-07-18 10:14:43 +0200998 # First extract the content within the match span using DOTALL modifier
999 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*<span class="context-right">'
1000 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001001
Marc Kupietza29f3d42025-07-18 10:14:43 +02001002 if (match_span_match == -1) {
1003 # Try alternative pattern if no context-right
1004 match_span_pattern <- '(?s)<span class="match">(.*?)</span>\\s*$'
1005 match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
1006 }
Marc Kupietzc643a122025-07-18 18:18:36 +02001007
Marc Kupietza29f3d42025-07-18 10:14:43 +02001008 if (match_span_match > 0) {
1009 match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001010
Marc Kupietza29f3d42025-07-18 10:14:43 +02001011 # Now find the <mark> and </mark> positions within this content
1012 mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
1013 mark_end <- regexpr('</mark>', match_span_content, perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001014
Marc Kupietza29f3d42025-07-18 10:14:43 +02001015 if (mark_start > 0 && mark_end > 0) {
1016 # Left context: everything before <mark>
1017 left_content <- substr(match_span_content, 1, mark_start - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001018
Marc Kupietza29f3d42025-07-18 10:14:43 +02001019 # Match content: everything between <mark> and </mark> (including the mark tags for now)
1020 match_content <- substr(match_span_content, mark_start, mark_end + attr(mark_end, "match.length") - 1)
Marc Kupietzc643a122025-07-18 18:18:36 +02001021
Marc Kupietza29f3d42025-07-18 10:14:43 +02001022 # Right context: everything after </mark>
1023 right_content_start <- mark_end + attr(mark_end, "match.length")
1024 right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
1025 } else {
1026 # No mark tags found, treat entire match span as match content
1027 left_content <- ""
1028 match_content <- match_span_content
1029 right_content <- ""
1030 }
1031 } else {
1032 # No match span found, treat entire content as match
1033 left_content <- ""
1034 match_content <- xml_snippet
1035 right_content <- ""
1036 }
1037
1038 # Process each section
1039 left_annotations <- extract_annotations_from_section(left_content)
1040 match_annotations <- extract_annotations_from_section(match_content)
1041 right_annotations <- extract_annotations_from_section(right_content)
1042
1043 return(list(
1044 atokens = list(
1045 left = left_annotations$tokens,
1046 match = match_annotations$tokens,
1047 right = right_annotations$tokens
1048 ),
1049 lemma = list(
1050 left = left_annotations$lemmas,
1051 match = match_annotations$lemmas,
1052 right = right_annotations$lemmas
1053 ),
1054 pos = list(
1055 left = left_annotations$pos_tags,
1056 match = match_annotations$pos_tags,
1057 right = right_annotations$pos_tags
1058 ),
1059 morph = list(
1060 left = left_annotations$morph_tags,
1061 match = match_annotations$morph_tags,
1062 right = right_annotations$morph_tags
1063 )
1064 ))
1065}
1066
Marc Kupietze52b2952025-07-17 16:53:02 +02001067#' Fetch annotations for all collected matches
1068#'
1069#' **`fetchAnnotations`** fetches annotations for all matches in the `@collectedMatches` slot
Marc Kupietzc643a122025-07-18 18:18:36 +02001070#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
1071#' data frame. The method automatically uses the `matchID` from collected matches when
1072#' available for safer and more reliable annotation retrieval, falling back to constructing
Marc Kupietza29f3d42025-07-18 10:14:43 +02001073#' URLs from `matchStart` and `matchEnd` if necessary.
1074#'
1075#' **Important**: For copyright-restricted corpora, users must be authorized via [auth()]
1076#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
1077#' available for annotation parsing.
1078#'
1079#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
1080#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
1081#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
1082#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
1083#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
1084#' - `annotation_snippet`: original XML snippet from the annotation API
Marc Kupietze52b2952025-07-17 16:53:02 +02001085#'
1086#' @family corpus search functions
1087#' @aliases fetchAnnotations
1088#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001089#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
Marc Kupietze52b2952025-07-17 16:53:02 +02001090#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
1091#' @param verbose print progress information if true
Marc Kupietza29f3d42025-07-18 10:14:43 +02001092#' @return The updated `kqo` object with annotation columns added to `@collectedMatches`
Marc Kupietze52b2952025-07-17 16:53:02 +02001093#'
1094#' @examples
1095#' \dontrun{
1096#'
1097#' # Fetch annotations for matches using Tree-Tagger foundry
Marc Kupietza29f3d42025-07-18 10:14:43 +02001098#' # Note: Authorization required for copyright-restricted corpora
Marc Kupietze52b2952025-07-17 16:53:02 +02001099#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001100#' auth() |>
1101#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001102#' fetchNext(maxFetch = 10) |>
1103#' fetchAnnotations()
Marc Kupietze52b2952025-07-17 16:53:02 +02001104#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001105#' # Access linguistic annotations for match i:
1106#' pos_tags <- q@collectedMatches$pos # Data frame with left/match/right columns for POS tags
Marc Kupietzc643a122025-07-18 18:18:36 +02001107#' lemmas <- q@collectedMatches$lemma # Data frame with left/match/right columns for lemmas
Marc Kupietza29f3d42025-07-18 10:14:43 +02001108#' morphology <- q@collectedMatches$morph # Data frame with left/match/right columns for morphological tags
1109#' atokens <- q@collectedMatches$atokens # Data frame with left/match/right columns for annotation token text
1110#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i
Marc Kupietzc643a122025-07-18 18:18:36 +02001111#'
Marc Kupietza29f3d42025-07-18 10:14:43 +02001112#' # Access specific components:
1113#' match_pos <- q@collectedMatches$pos$match[[i]] # POS tags for the matched tokens in match i
1114#' left_lemmas <- q@collectedMatches$lemma$left[[i]] # Lemmas for the left context in match i
1115#' right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i
1116#'
1117#' # Use a different foundry (e.g., mate-parser)
Marc Kupietze52b2952025-07-17 16:53:02 +02001118#' q <- KorAPConnection() |>
Marc Kupietza29f3d42025-07-18 10:14:43 +02001119#' auth() |>
1120#' corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
Marc Kupietze52b2952025-07-17 16:53:02 +02001121#' fetchNext(maxFetch = 10) |>
1122#' fetchAnnotations(foundry = "mate")
Marc Kupietza29f3d42025-07-18 10:14:43 +02001123#' q@collectedMatches
Marc Kupietze52b2952025-07-17 16:53:02 +02001124#' }
Marc Kupietze52b2952025-07-17 16:53:02 +02001125#' @export
1126setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) {
1127 if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
1128 warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
1129 return(kqo)
1130 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001131
Marc Kupietze52b2952025-07-17 16:53:02 +02001132 df <- kqo@collectedMatches
1133 kco <- kqo@korapConnection
Marc Kupietza29f3d42025-07-18 10:14:43 +02001134
Marc Kupietza29f3d42025-07-18 10:14:43 +02001135 # Initialize annotation columns as data frames (like tokens field)
1136 # Create the structure more explicitly to avoid assignment issues
1137 nrows <- nrow(df)
Marc Kupietzc643a122025-07-18 18:18:36 +02001138
Marc Kupietza29f3d42025-07-18 10:14:43 +02001139 df$pos <- data.frame(
1140 left = I(replicate(nrows, character(0), simplify = FALSE)),
1141 match = I(replicate(nrows, character(0), simplify = FALSE)),
1142 right = I(replicate(nrows, character(0), simplify = FALSE)),
1143 stringsAsFactors = FALSE
1144 )
Marc Kupietzc643a122025-07-18 18:18:36 +02001145
Marc Kupietza29f3d42025-07-18 10:14:43 +02001146 df$lemma <- data.frame(
1147 left = I(replicate(nrows, character(0), simplify = FALSE)),
1148 match = I(replicate(nrows, character(0), simplify = FALSE)),
1149 right = I(replicate(nrows, character(0), simplify = FALSE)),
1150 stringsAsFactors = FALSE
1151 )
Marc Kupietzc643a122025-07-18 18:18:36 +02001152
Marc Kupietza29f3d42025-07-18 10:14:43 +02001153 df$morph <- data.frame(
1154 left = I(replicate(nrows, character(0), simplify = FALSE)),
1155 match = I(replicate(nrows, character(0), simplify = FALSE)),
1156 right = I(replicate(nrows, character(0), simplify = FALSE)),
1157 stringsAsFactors = FALSE
1158 )
Marc Kupietzc643a122025-07-18 18:18:36 +02001159
Marc Kupietza29f3d42025-07-18 10:14:43 +02001160 df$atokens <- data.frame(
1161 left = I(replicate(nrows, character(0), simplify = FALSE)),
1162 match = I(replicate(nrows, character(0), simplify = FALSE)),
1163 right = I(replicate(nrows, character(0), simplify = FALSE)),
1164 stringsAsFactors = FALSE
1165 )
Marc Kupietzc643a122025-07-18 18:18:36 +02001166
Marc Kupietza29f3d42025-07-18 10:14:43 +02001167 df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)
1168
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001169 # Initialize timing for ETA calculation
1170 start_time <- Sys.time()
1171 if (verbose) {
1172 log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
1173 }
1174
Marc Kupietze52b2952025-07-17 16:53:02 +02001175 for (i in seq_len(nrow(df))) {
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001176 # ETA logging
1177 if (verbose && i > 1) {
1178 eta_info <- calculate_eta(i, nrows, start_time)
1179 log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
1180 }
Marc Kupietzff712a92025-07-18 09:07:23 +02001181 # Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
1182 if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001183 # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
1184 # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
1185 # Extract document path and position, handling both regular and encrypted formats
Marc Kupietzc643a122025-07-18 18:18:36 +02001186
Marc Kupietza29f3d42025-07-18 10:14:43 +02001187 # More flexible regex to extract the document path with position and encryption
1188 # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
1189 # We need to capture the entire path including the encrypted suffix
1190 match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
Marc Kupietzc643a122025-07-18 18:18:36 +02001191
Marc Kupietza29f3d42025-07-18 10:14:43 +02001192 if (match_result > 0) {
1193 # Extract the complete path including encryption (everything after "match-")
1194 doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
1195 # Convert the dash before position to slash, but keep everything after the position
1196 match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
1197 req <- paste0(kco@apiUrl, "corpus/", match_path, "?foundry=", foundry)
Marc Kupietza29f3d42025-07-18 10:14:43 +02001198 } else {
1199 # If regex fails, fall back to the old method
Marc Kupietza29f3d42025-07-18 10:14:43 +02001200 req <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", df$matchStart[i], "-", df$matchEnd[i], "?foundry=", foundry)
Marc Kupietzff712a92025-07-18 09:07:23 +02001201 }
1202 } else {
1203 # Fallback to the old method
1204 req <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", df$matchStart[i], "-", df$matchEnd[i], "?foundry=", foundry)
Marc Kupietzff712a92025-07-18 09:07:23 +02001205 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001206
Marc Kupietze52b2952025-07-17 16:53:02 +02001207 tryCatch({
1208 res <- apiCall(kco, req)
Marc Kupietzc643a122025-07-18 18:18:36 +02001209
Marc Kupietze52b2952025-07-17 16:53:02 +02001210 if (!is.null(res)) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001211 # Store the raw annotation snippet
1212 df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
1213
1214 # Parse XML annotations if snippet is available
1215 if (is.list(res) && "snippet" %in% names(res)) {
1216 parsed_annotations <- parse_xml_annotations_structured(res$snippet)
1217
1218 # Store the parsed linguistic data in data frame format (like tokens)
1219 # Use individual assignment to avoid data frame mismatch errors
1220 tryCatch({
1221 # Assign POS annotations
1222 df$pos$left[i] <- list(parsed_annotations$pos$left)
1223 df$pos$match[i] <- list(parsed_annotations$pos$match)
1224 df$pos$right[i] <- list(parsed_annotations$pos$right)
Marc Kupietzc643a122025-07-18 18:18:36 +02001225
Marc Kupietza29f3d42025-07-18 10:14:43 +02001226 # Assign lemma annotations
1227 df$lemma$left[i] <- list(parsed_annotations$lemma$left)
1228 df$lemma$match[i] <- list(parsed_annotations$lemma$match)
1229 df$lemma$right[i] <- list(parsed_annotations$lemma$right)
Marc Kupietzc643a122025-07-18 18:18:36 +02001230
Marc Kupietza29f3d42025-07-18 10:14:43 +02001231 # Assign morphology annotations
1232 df$morph$left[i] <- list(parsed_annotations$morph$left)
1233 df$morph$match[i] <- list(parsed_annotations$morph$match)
1234 df$morph$right[i] <- list(parsed_annotations$morph$right)
Marc Kupietzc643a122025-07-18 18:18:36 +02001235
Marc Kupietza29f3d42025-07-18 10:14:43 +02001236 # Assign token annotations
1237 df$atokens$left[i] <- list(parsed_annotations$atokens$left)
1238 df$atokens$match[i] <- list(parsed_annotations$atokens$match)
1239 df$atokens$right[i] <- list(parsed_annotations$atokens$right)
1240 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001241 # Set empty character vectors on assignment error using list assignment
1242 df$pos$left[i] <<- list(character(0))
1243 df$pos$match[i] <<- list(character(0))
1244 df$pos$right[i] <<- list(character(0))
Marc Kupietzc643a122025-07-18 18:18:36 +02001245
Marc Kupietza29f3d42025-07-18 10:14:43 +02001246 df$lemma$left[i] <<- list(character(0))
1247 df$lemma$match[i] <<- list(character(0))
1248 df$lemma$right[i] <<- list(character(0))
Marc Kupietzc643a122025-07-18 18:18:36 +02001249
Marc Kupietza29f3d42025-07-18 10:14:43 +02001250 df$morph$left[i] <<- list(character(0))
1251 df$morph$match[i] <<- list(character(0))
1252 df$morph$right[i] <<- list(character(0))
Marc Kupietzc643a122025-07-18 18:18:36 +02001253
Marc Kupietza29f3d42025-07-18 10:14:43 +02001254 df$atokens$left[i] <<- list(character(0))
1255 df$atokens$match[i] <<- list(character(0))
1256 df$atokens$right[i] <<- list(character(0))
1257 })
Marc Kupietza29f3d42025-07-18 10:14:43 +02001258 } else {
1259 # No snippet available, store empty vectors
1260 df$pos$left[i] <- list(character(0))
1261 df$pos$match[i] <- list(character(0))
1262 df$pos$right[i] <- list(character(0))
Marc Kupietzc643a122025-07-18 18:18:36 +02001263
Marc Kupietza29f3d42025-07-18 10:14:43 +02001264 df$lemma$left[i] <- list(character(0))
1265 df$lemma$match[i] <- list(character(0))
1266 df$lemma$right[i] <- list(character(0))
Marc Kupietzc643a122025-07-18 18:18:36 +02001267
Marc Kupietza29f3d42025-07-18 10:14:43 +02001268 df$morph$left[i] <- list(character(0))
1269 df$morph$match[i] <- list(character(0))
1270 df$morph$right[i] <- list(character(0))
Marc Kupietzc643a122025-07-18 18:18:36 +02001271
Marc Kupietza29f3d42025-07-18 10:14:43 +02001272 df$atokens$left[i] <- list(character(0))
1273 df$atokens$match[i] <- list(character(0))
1274 df$atokens$right[i] <- list(character(0))
1275 }
Marc Kupietze52b2952025-07-17 16:53:02 +02001276 } else {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001277 # Store NAs for failed requests
1278 df$pos$left[i] <- list(NA)
1279 df$pos$match[i] <- list(NA)
1280 df$pos$right[i] <- list(NA)
Marc Kupietzc643a122025-07-18 18:18:36 +02001281
Marc Kupietza29f3d42025-07-18 10:14:43 +02001282 df$lemma$left[i] <- list(NA)
1283 df$lemma$match[i] <- list(NA)
1284 df$lemma$right[i] <- list(NA)
Marc Kupietzc643a122025-07-18 18:18:36 +02001285
Marc Kupietza29f3d42025-07-18 10:14:43 +02001286 df$morph$left[i] <- list(NA)
1287 df$morph$match[i] <- list(NA)
1288 df$morph$right[i] <- list(NA)
Marc Kupietzc643a122025-07-18 18:18:36 +02001289
Marc Kupietza29f3d42025-07-18 10:14:43 +02001290 df$atokens$left[i] <- list(NA)
1291 df$atokens$match[i] <- list(NA)
1292 df$atokens$right[i] <- list(NA)
1293 df$annotation_snippet[[i]] <- NA
Marc Kupietze52b2952025-07-17 16:53:02 +02001294 }
1295 }, error = function(e) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001296 # Store NAs for failed requests
1297 df$pos$left[i] <- list(NA)
1298 df$pos$match[i] <- list(NA)
1299 df$pos$right[i] <- list(NA)
Marc Kupietzc643a122025-07-18 18:18:36 +02001300
Marc Kupietza29f3d42025-07-18 10:14:43 +02001301 df$lemma$left[i] <- list(NA)
1302 df$lemma$match[i] <- list(NA)
1303 df$lemma$right[i] <- list(NA)
Marc Kupietzc643a122025-07-18 18:18:36 +02001304
Marc Kupietza29f3d42025-07-18 10:14:43 +02001305 df$morph$left[i] <- list(NA)
1306 df$morph$match[i] <- list(NA)
1307 df$morph$right[i] <- list(NA)
Marc Kupietzc643a122025-07-18 18:18:36 +02001308
Marc Kupietza29f3d42025-07-18 10:14:43 +02001309 df$atokens$left[i] <- list(NA)
1310 df$atokens$match[i] <- list(NA)
1311 df$atokens$right[i] <- list(NA)
1312 df$annotation_snippet[[i]] <- NA
Marc Kupietze52b2952025-07-17 16:53:02 +02001313 })
1314 }
Marc Kupietza29f3d42025-07-18 10:14:43 +02001315
Marc Kupietza29f3d42025-07-18 10:14:43 +02001316 # Validate data frame structure before assignment
1317 if (nrow(df) != nrow(kqo@collectedMatches)) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001318 }
1319
1320 # Update the collectedMatches with annotation data
1321 tryCatch({
1322 kqo@collectedMatches <- df
1323 }, error = function(assign_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001324 # Try a safer approach: add columns individually
1325 tryCatch({
1326 kqo@collectedMatches$pos <- df$pos
Marc Kupietzc643a122025-07-18 18:18:36 +02001327 kqo@collectedMatches$lemma <- df$lemma
Marc Kupietza29f3d42025-07-18 10:14:43 +02001328 kqo@collectedMatches$morph <- df$morph
1329 kqo@collectedMatches$atokens <- df$atokens
1330 kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
1331 }, error = function(col_error) {
Marc Kupietza29f3d42025-07-18 10:14:43 +02001332 warning("Failed to add annotation data to collectedMatches")
1333 })
1334 })
1335
Marc Kupietze8c0fef2025-07-18 19:59:04 +02001336 if (verbose) {
1337 elapsed_time <- Sys.time() - start_time
1338 log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
1339 }
1340
Marc Kupietze52b2952025-07-17 16:53:02 +02001341 return(kqo)
1342})
1343
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001344#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +02001345#'
Marc Kupietz67edcb52021-09-20 21:54:24 +02001346#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001347#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +02001348#' confidence intervals of one ore multiple search terms across one or multiple
1349#' virtual corpora.
1350#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02001351#' @family frequency analysis
Marc Kupietz3f575282019-10-04 14:46:04 +02001352#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +02001353#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +02001354#' \dontrun{
1355#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001356#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +02001357#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +01001358#' }
Marc Kupietz3f575282019-10-04 14:46:04 +02001359#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001360# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +01001361#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001362#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
1363#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +02001364#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
1365#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001366#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +02001367#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +02001368#'
1369#' @return A tibble, with each row containing the following result columns for query and vc combinations:
1370#' - **query**: the query string used for the frequency analysis.
1371#' - **totalResults**: absolute frequency of query matches in the vc.
1372#' - **vc**: virtual corpus used for the query.
1373#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
1374#' - **total**: total number of words in vc.
1375#' - **f**: relative frequency of query matches in the vc.
1376#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
1377#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
1378
Marc Kupietzd8851222025-05-01 10:57:19 +02001379setMethod(
1380 "frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +01001381 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietzd8851222025-05-01 10:57:19 +02001382 (if (as.alternatives) {
1383 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietzea34b812025-06-25 15:49:00 +02001384 group_by(vc) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +01001385 mutate(total = sum(totalResults))
Marc Kupietzd8851222025-05-01 10:57:19 +02001386 } else {
1387 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
1388 mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietzea34b812025-06-25 15:49:00 +02001389 }) |>
Marc Kupietz0c29cea2019-10-09 08:44:36 +02001390 ci(conf.level = conf.level)
Marc Kupietzd8851222025-05-01 10:57:19 +02001391 }
1392)
Marc Kupietz3f575282019-10-04 14:46:04 +02001393
Marc Kupietz38a9d682024-12-06 16:17:09 +01001394#' buildWebUIRequestUrlFromString
1395#'
1396#' @rdname KorAPQuery-class
1397#' @importFrom urltools url_encode
1398#' @export
1399buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietzd8851222025-05-01 10:57:19 +02001400 query,
1401 vc = "",
1402 ql = "poliqarp") {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001403 if ("KorAPConnection" %in% class(KorAPUrl)) {
1404 KorAPUrl <- KorAPUrl@KorAPUrl
1405 }
1406
1407 request <-
1408 paste0(
Marc Kupietzd8851222025-05-01 10:57:19 +02001409 "?q=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001410 urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietzd8851222025-05-01 10:57:19 +02001411 ifelse(vc != "",
1412 paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
1413 ""
1414 ),
1415 "&ql=",
Marc Kupietz38a9d682024-12-06 16:17:09 +01001416 ql
1417 )
1418 paste0(KorAPUrl, request)
1419}
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001420
1421#' buildWebUIRequestUrl
1422#'
1423#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +01001424#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001425#' @export
1426buildWebUIRequestUrl <- function(kco,
Marc Kupietzd8851222025-05-01 10:57:19 +02001427 query = if (missing(KorAPUrl)) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001428 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietzd8851222025-05-01 10:57:19 +02001429 } else {
1430 httr2::url_parse(KorAPUrl)$query$q
1431 },
Marc Kupietzf9129592025-01-26 19:17:54 +01001432 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001433 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +01001434 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +01001435 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001436}
1437
Marc Kupietzd8851222025-05-01 10:57:19 +02001438#' format()
Marc Kupietze95108e2019-09-18 13:23:58 +02001439#' @rdname KorAPQuery-class
1440#' @param x KorAPQuery object
1441#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001442#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +02001443#' @export
1444format.KorAPQuery <- function(x, ...) {
1445 cat("<KorAPQuery>\n")
1446 q <- x
Marc Kupietzd8851222025-05-01 10:57:19 +02001447 param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
Marc Kupietzb73ca0f2025-01-28 20:45:01 +01001448 cat(" Query: ", param$q, "\n")
1449 if (!is.null(param$cq) && param$cq != "") {
1450 cat(" Virtual corpus: ", param$cq, "\n")
1451 }
1452 if (!is.null(q@collectedMatches)) {
1453 cat("==============================================================================================================", "\n")
1454 print(summary(q@collectedMatches))
1455 cat("==============================================================================================================", "\n")
1456 }
1457 cat(" Total results: ", q@totalResults, "\n")
1458 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietza29f3d42025-07-18 10:14:43 +02001459 if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
1460 successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
1461 parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
1462 cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
1463 if (parsed_annotations > 0) {
1464 cat(" (", parsed_annotations, " with parsed linguistic data)")
1465 }
1466 cat("\n")
Marc Kupietze52b2952025-07-17 16:53:02 +02001467 }
Marc Kupietz62da2b52019-09-12 17:43:34 +02001468}
1469
Marc Kupietze95108e2019-09-18 13:23:58 +02001470#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +02001471#'
Marc Kupietze95108e2019-09-18 13:23:58 +02001472#' @rdname KorAPQuery-class
1473#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +02001474#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +02001475setMethod("show", "KorAPQuery", function(object) {
1476 format(object)
Marc Kupietzc643a122025-07-18 18:18:36 +02001477 invisible(object)
Marc Kupietze95108e2019-09-18 13:23:58 +02001478})