blob: 97c7029652e14002f401ac2e7c6811730e5bea7d [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' This class provides methods to perform different kinds of queries on the KorAP API server.
Marc Kupietz67edcb52021-09-20 21:54:24 +02004#' `KorAPQuery` objects, which are typically created by the [corpusQuery()] method,
Marc Kupietza6e4ee62021-03-05 09:00:15 +01005#' represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02006#'
7#' @include KorAPConnection.R
Marc Kupietzf9129592025-01-26 19:17:54 +01008#' @import httr2
Marc Kupietze95108e2019-09-18 13:23:58 +02009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietze95108e2019-09-18 13:23:58 +020027#' Method initialize
28#'
29#' @rdname KorAPQuery-class
30#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020042#'
43#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020044#' @export
45setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020046 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz2078bde2023-08-27 16:46:15 +020047 "availability", "textClass", "snippet", "tokens"),
Marc Kupietze95108e2019-09-18 13:23:58 +020048 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
49 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020050 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020051 .Object@request = request
52 .Object@vc = vc
53 .Object@totalResults = totalResults
54 .Object@nextStartIndex = nextStartIndex
55 .Object@fields = fields
56 .Object@requestUrl = requestUrl
57 .Object@webUIRequestUrl = webUIRequestUrl
58 .Object@apiResponse = apiResponse
59 .Object@hasMoreMatches = hasMoreMatches
60 .Object@collectedMatches = collectedMatches
61 .Object
62 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020063
Marc Kupietze95108e2019-09-18 13:23:58 +020064setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
65setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
66setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
67setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
Marc Kupietz3f575282019-10-04 14:46:04 +020068setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
Marc Kupietze95108e2019-09-18 13:23:58 +020069
70maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020071
Marc Kupietz4de53ec2019-10-04 09:12:00 +020072## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietzef1ef4a2025-02-19 12:12:40 +010073utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020074
Marc Kupietzdbd431a2021-08-29 12:17:45 +020075#' Corpus query
76#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020077#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020078#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020079#' @rdname KorAPQuery-class
80#' @aliases corpusQuery
81#'
82#' @importFrom urltools url_encode
83#' @importFrom purrr pmap
84#' @importFrom dplyr bind_rows
85#'
Marc Kupietz617266d2025-02-27 10:43:07 +010086#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz67edcb52021-09-20 21:54:24 +020087#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020088#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020089#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020090#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
91#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
92#' your RKorAPClient application as explained in the
93#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
94#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
95#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +020096#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020097#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +010098#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020099#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200100#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200101#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200102#' @param context string that specifies the size of the left and the right context returned in `snippet`
103#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
104#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
105#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
106#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
107#' as this is typically legally motivated.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200108#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz67edcb52021-09-20 21:54:24 +0200109#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
110#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200111#'
112#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200113#' \dontrun{
114#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200115#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz617266d2025-02-27 10:43:07 +0100116#' KorAPConnection() %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100117#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200118#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200119#' \dontrun{
120#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200121#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
122#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200123#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100124#' KorAPConnection(verbose = TRUE) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200125#' corpusQuery(KorAPUrl =
126#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz6ae76052021-09-21 10:34:00 +0200127#' }
128#'
129#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200130#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200131#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz617266d2025-02-27 10:43:07 +0100132#' KorAPConnection(verbose=TRUE) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200133#' { . ->> kco } %>%
134#' corpusQuery("Ameisenplage") %>%
135#' fetchAll() %>%
136#' slot("collectedMatches") %>%
137#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200138#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200139#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200140#' summarise(Count = dplyr::n()) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200141#' mutate(Freq = mapply(function(f, y)
142#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200143#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200144#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
145#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100146#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200147#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200148#'
149#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200150#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200151#'
152#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200153setMethod("corpusQuery", "KorAPConnection",
Marc Kupietza96537f2019-11-09 23:07:44 +0100154 function(kco,
155 query = if (missing(KorAPUrl))
156 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
157 else
Marc Kupietzf9129592025-01-26 19:17:54 +0100158 httr2::url_parse(KorAPUrl)$query$q,
159 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietza96537f2019-11-09 23:07:44 +0100160 KorAPUrl,
161 metadataOnly = TRUE,
Marc Kupietzf9129592025-01-26 19:17:54 +0100162 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
Marc Kupietza96537f2019-11-09 23:07:44 +0100163 fields = c(
164 "corpusSigle",
165 "textSigle",
166 "pubDate",
167 "pubPlace",
168 "availability",
169 "textClass",
Marc Kupietz2078bde2023-08-27 16:46:15 +0200170 "snippet",
171 "tokens"
Marc Kupietza96537f2019-11-09 23:07:44 +0100172 ),
173 accessRewriteFatal = TRUE,
174 verbose = kco@verbose,
175 expand = length(vc) != length(query),
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200176 as.df = FALSE,
177 context = NULL) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100178 if (length(query) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200179 grid <- if (expand) expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc)
180 purrr::pmap(grid, function(query, vc, ...)
181 corpusQuery(kco, query=query, vc=vc, ql=ql, verbose=verbose, as.df = TRUE)) %>%
182 bind_rows()
183 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200184 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100185 if (metadataOnly) {
186 fields <- fields[!fields %in% contentFields]
187 }
Marc Kupietz80dc6432025-02-07 16:57:40 +0100188 if (!"textSigle" %in% fields) {
189 fields <- c(fields, "textSigle")
190 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100191 request <-
192 paste0('?q=',
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200193 url_encode(enc2utf8(query)),
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200194 ifelse (!metadataOnly && ! is.null(context) && context != '', paste0('&context=', url_encode(enc2utf8(context))), ''),
Marc Kupietz2078bde2023-08-27 16:46:15 +0200195 ifelse (vc != '', paste0('&cq=', url_encode(enc2utf8(vc))), ''),
196 ifelse (!metadataOnly, '&show-tokens=true', ''),
197 '&ql=', ql)
Marc Kupietza96537f2019-11-09 23:07:44 +0100198 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
199 requestUrl <- paste0(
200 kco@apiUrl,
201 'search',
202 request,
203 '&fields=',
204 paste(fields, collapse = ","),
205 if (metadataOnly) '&access-rewrite-disabled=true' else ''
206 )
Marc Kupietz16ccf112025-01-26 13:25:27 +0100207 log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep =
Marc Kupietza96537f2019-11-09 23:07:44 +0100208 "")
209 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietza4675722022-02-23 23:55:15 +0100210 if (is.null(res)) {
Marc Kupietza4675722022-02-23 23:55:15 +0100211 message("API call failed.")
212 totalResults <- 0
213 } else {
Marc Kupietz41d4e352024-03-11 21:48:55 +0100214 totalResults <-as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200215 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietza4675722022-02-23 23:55:15 +0100216 if(!is.null(res$meta$cached))
Marc Kupietza47d1502023-04-18 15:26:47 +0200217 log_info(verbose, " [cached]\n")
Marc Kupietza4675722022-02-23 23:55:15 +0100218 else
Marc Kupietz16ccf112025-01-26 13:25:27 +0100219 if(! is.null(res$meta$benchmark))
220 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
221 else
222 log_info(verbose, "\n")
Marc Kupietza4675722022-02-23 23:55:15 +0100223 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100224 if (as.df)
225 data.frame(
226 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100227 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100228 vc = vc,
229 webUIRequestUrl = webUIRequestUrl,
230 stringsAsFactors = FALSE
231 )
232 else
233 KorAPQuery(
234 korapConnection = kco,
235 nextStartIndex = 0,
236 fields = fields,
237 requestUrl = requestUrl,
238 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100239 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100240 vc = vc,
241 apiResponse = res,
242 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100243 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100244 )
245 }
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200246 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200247
Marc Kupietz05a60792024-12-07 16:23:31 +0100248#' @importFrom purrr map
249repair_data_strcuture <- function(x) {
250 if (is.list(x))
251 as.character (purrr::map(x, ~ if (length(.x) > 1) {
252 paste(.x, collapse = " ")
253 } else {
254 .x
255 }))
256 else
257 ifelse(is.na(x), "", x)
258}
259
Marc Kupietz62da2b52019-09-12 17:43:34 +0200260#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200261#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200262#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200263#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200264#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200265#' @param offset start offset for query results to fetch
266#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200267#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200268#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
269#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200270#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100271#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200272#' \dontrun{
273#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100274#' q <- KorAPConnection() %>% corpusQuery("Ameisenplage") %>% fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100275#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100276#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100277#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200278#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200279#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200280#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200281#' @aliases fetchNext
282#' @rdname KorAPQuery-class
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200283#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100284#' @importFrom tibble enframe add_column
285#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200286#' @importFrom tidyr unnest unchop pivot_wider
287#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200288#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200289setMethod("fetchNext", "KorAPQuery", function(kqo,
290 offset = kqo@nextStartIndex,
291 maxFetch = maxResultsPerPage,
292 verbose = kqo@korapConnection@verbose,
293 randomizePageOrder = FALSE) {
Marc Kupietza7a8f1b2024-12-18 15:56:19 +0100294 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
295 results <- key <- name <- pubDate <- tmp_positions <- 0
296
Marc Kupietze95108e2019-09-18 13:23:58 +0200297 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
298 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200299 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200300 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz705488d2021-06-30 18:26:36 +0200301 page <- kqo@nextStartIndex / maxResultsPerPage + 1
Marc Kupietze95108e2019-09-18 13:23:58 +0200302 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200303
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200304 if (randomizePageOrder) {
305 pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
306 }
307
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200308 if(is.null(collectedMatches)) {
309 collectedMatches <- data.frame()
310 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200311 repeat {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200312 page = nrow(collectedMatches) %/% maxResultsPerPage + 1
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200313 currentOffset = ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
314 query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', currentOffset, '&cutoff=true')
Marc Kupietz68170952021-06-30 09:37:21 +0200315 res <- apiCall(kqo@korapConnection, query)
316 if (length(res$matches) == 0) {
317 break
318 }
319
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200320 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz16ccf112025-01-26 13:25:27 +0100321 log_info(verbose, "Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100322 currentMatches <- res$matches$fields %>%
323 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
324 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200325 tidyr::unnest(cols = value) %>%
326 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200327 dplyr::select(-name)
328 if("snippet" %in% colnames(res$matches)) {
329 currentMatches$snippet <- res$matches$snippet
330 }
Marc Kupietz3cd2c6c2025-01-08 20:35:39 +0100331 if ("tokens" %in% colnames(res$matches)) {
332 currentMatches$tokens <- res$matches$tokens
333 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200334 } else {
335 currentMatches <- res$matches
336 }
337
Marc Kupietze95108e2019-09-18 13:23:58 +0200338 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200339 if (!field %in% colnames(currentMatches)) {
340 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200341 }
342 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100343 currentMatches <- currentMatches %>%
344 select(kqo@fields) %>%
345 mutate(
Marc Kupietz0447da02025-01-08 20:51:09 +0100346 tmp_positions = gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", res$matches$matchID),
Marc Kupietzf4881122024-12-17 14:55:39 +0100347 matchStart = as.integer(stringr::word(tmp_positions, 1)),
348 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
349 ) %>%
350 select(-tmp_positions)
351
Marc Kupietz62da2b52019-09-12 17:43:34 +0200352 if (!is.list(collectedMatches)) {
353 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200354 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200355 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200356 }
Marc Kupietz16ccf112025-01-26 13:25:27 +0100357 log_info(verbose, paste0(
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200358 "Retrieved page ",
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200359 ceiling(nrow(collectedMatches) / res$meta$itemsPerPage),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200360 "/",
361 if (!is.na(maxFetch) && maxFetch < kqo@totalResults)
362 sprintf("%d (%d)", ceiling(maxFetch / res$meta$itemsPerPage), ceiling(kqo@totalResults / res$meta$itemsPerPage))
363 else
364 sprintf("%d", ceiling(kqo@totalResults / res$meta$itemsPerPage)),
365 ' in ',
366 res$meta$benchmark,
367 '\n'
368 ))
Marc Kupietzacbaab02025-05-01 10:56:35 +0200369 # Estimate remaining time
370 time_per_page <- as.numeric(sub("s", "", res$meta$benchmark)) # Assuming benchmark is like "0.123s"
371 items_per_page <- res$meta$itemsPerPage
372 total_pages <- ceiling(kqo@totalResults / items_per_page)
373 actual_total_pages <- if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
374 ceiling(maxFetch / items_per_page)
375 } else {
376 total_pages
377 }
378 current_page_number <- ceiling(nrow(collectedMatches) / items_per_page)
379 remaining_pages <- actual_total_pages - current_page_number
380
381 estimated_remaining_seconds <- remaining_pages * time_per_page
382 estimated_completion_time <- Sys.time() + estimated_remaining_seconds
383
384 # Format time nicely
385 format_duration <- function(seconds) {
386 if (is.na(seconds) || seconds < 0) {
387 return("N/A")
388 }
389 days <- floor(seconds / (24 * 3600))
390 seconds <- seconds %% (24 * 3600)
391 hours <- floor(seconds / 3600)
392 seconds <- seconds %% 3600
393 minutes <- floor(seconds / 60)
394 seconds <- floor(seconds %% 60)
395 paste0(
396 if (days > 0) paste0(days, "d ") else "",
397 if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), ":") else "",
398 if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), ":") else "",
399 paste0(sprintf("%02d", seconds), if (minutes > 0 || hours > 0 || days > 0) "" else "s")
400 )
401 }
402
403 eta_str <- format_duration(estimated_remaining_seconds)
404 completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
405
406 log_info(verbose, paste0(
407 "Retrieved page ",
408 current_page_number,
409 "/",
410 if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
411 sprintf("%d(%d)", ceiling(maxFetch / items_per_page), total_pages)
412 } else {
413 sprintf("%d", total_pages)
414 },
415 if (!is.null(res$meta$cached)) {
416 " [cached]\n"
417 } else {
418 paste0(
419 " in ",
420 sprintf("%3.1f", time_per_page),
421 "s ETA: ", eta_str, " (", completion_time_str, ")",
422 "\n"
423 )
424 }
425 ))
426
Marc Kupietz16ccf112025-01-26 13:25:27 +0100427
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200428 page <- page + 1
429 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200430 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200431 break
432 }
433 }
Marc Kupietz68170952021-06-30 09:37:21 +0200434 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietze95108e2019-09-18 13:23:58 +0200435 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200436 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200437 fields = kqo@fields,
438 requestUrl = kqo@requestUrl,
439 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200440 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200441 vc = kqo@vc,
442 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200443 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200444 apiResponse = res,
445 collectedMatches = collectedMatches)
446})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200447
448#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200449#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200450#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100451#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200452#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200453#' \dontrun{
454#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100455#' q <- KorAPConnection() %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200456#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100457#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200458#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200459#' @aliases fetchAll
460#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200461#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200462setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
463 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200464})
465
466#' Fetches the remaining results of a KorAP query.
467#'
468#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200469#' \dontrun{
470#'
Marc Kupietz617266d2025-02-27 10:43:07 +0100471#' q <- KorAPConnection() %>% corpusQuery("Ameisenplage") %>% fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200472#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100473#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200474#'
475#' @aliases fetchRest
476#' @rdname KorAPQuery-class
477#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200478setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
479 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200480})
481
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200482#' Query frequencies of search expressions in virtual corpora
Marc Kupietz3f575282019-10-04 14:46:04 +0200483#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200484#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200485#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200486#' confidence intervals of one ore multiple search terms across one or multiple
487#' virtual corpora.
488#'
489#' @aliases frequencyQuery
Marc Kupietz3f575282019-10-04 14:46:04 +0200490#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200491#' \dontrun{
492#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200493#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz3f575282019-10-04 14:46:04 +0200494#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100495#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200496#'
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200497# @inheritParams corpusQuery
Marc Kupietz617266d2025-02-27 10:43:07 +0100498#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200499#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
500#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz67edcb52021-09-20 21:54:24 +0200501#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
502#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200503#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz3f575282019-10-04 14:46:04 +0200504#' @export
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200505#'
506#' @return A tibble, with each row containing the following result columns for query and vc combinations:
507#' - **query**: the query string used for the frequency analysis.
508#' - **totalResults**: absolute frequency of query matches in the vc.
509#' - **vc**: virtual corpus used for the query.
510#' - **webUIRequestUrl**: URL of the corresponding web UI request with respect to query and vc.
511#' - **total**: total number of words in vc.
512#' - **f**: relative frequency of query matches in the vc.
513#' - **conf.low**: lower bound of the confidence interval for the relative frequency, given `conf.level`.
514#' - **conf.high**: upper bound of the confidence interval for the relative frequency, given `conf.level`.
515
Marc Kupietz3f575282019-10-04 14:46:04 +0200516setMethod("frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100517 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
518 (if (as.alternatives) {
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200519 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100520 group_by(vc) %>%
521 mutate(total = sum(totalResults))
522 } else {
Marc Kupietzad8d2ed2025-04-05 15:37:38 +0200523 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
Marc Kupietz71d6e052019-11-22 18:42:10 +0100524 mutate(total = corpusStats(kco, vc=vc, as.df=TRUE)$tokens)
525 } ) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200526 ci(conf.level = conf.level)
Marc Kupietz3f575282019-10-04 14:46:04 +0200527})
528
Marc Kupietz38a9d682024-12-06 16:17:09 +0100529#' buildWebUIRequestUrlFromString
530#'
531#' @rdname KorAPQuery-class
532#' @importFrom urltools url_encode
533#' @export
534buildWebUIRequestUrlFromString <- function(KorAPUrl,
535 query,
536 vc = "",
537 ql = "poliqarp"
538) {
539 if ("KorAPConnection" %in% class(KorAPUrl)) {
540 KorAPUrl <- KorAPUrl@KorAPUrl
541 }
542
543 request <-
544 paste0(
545 '?q=',
546 urltools::url_encode(enc2utf8(as.character(query))),
547 ifelse(vc != '',
548 paste0('&cq=', urltools::url_encode(enc2utf8(vc))),
549 ''),
550 '&ql=',
551 ql
552 )
553 paste0(KorAPUrl, request)
554}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200555
556#' buildWebUIRequestUrl
557#'
558#' @rdname KorAPQuery-class
Marc Kupietzf9129592025-01-26 19:17:54 +0100559#' @importFrom httr2 url_parse
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200560#' @export
561buildWebUIRequestUrl <- function(kco,
562 query = if (missing(KorAPUrl))
563 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
564 else
Marc Kupietzf9129592025-01-26 19:17:54 +0100565 httr2::url_parse(KorAPUrl)$query$q,
566 vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200567 KorAPUrl,
Marc Kupietzf9129592025-01-26 19:17:54 +0100568 ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz38a9d682024-12-06 16:17:09 +0100569
570 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200571}
572
Marc Kupietze95108e2019-09-18 13:23:58 +0200573#´ format()
574#' @rdname KorAPQuery-class
575#' @param x KorAPQuery object
576#' @param ... further arguments passed to or from other methods
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100577#' @importFrom urltools param_get url_decode
Marc Kupietze95108e2019-09-18 13:23:58 +0200578#' @export
579format.KorAPQuery <- function(x, ...) {
580 cat("<KorAPQuery>\n")
581 q <- x
Marc Kupietzb73ca0f2025-01-28 20:45:01 +0100582 param = urltools::param_get(q@request) |> lapply(urltools::url_decode)
583 cat(" Query: ", param$q, "\n")
584 if (!is.null(param$cq) && param$cq != "") {
585 cat(" Virtual corpus: ", param$cq, "\n")
586 }
587 if (!is.null(q@collectedMatches)) {
588 cat("==============================================================================================================", "\n")
589 print(summary(q@collectedMatches))
590 cat("==============================================================================================================", "\n")
591 }
592 cat(" Total results: ", q@totalResults, "\n")
593 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200594}
595
Marc Kupietze95108e2019-09-18 13:23:58 +0200596#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200597#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200598#' @rdname KorAPQuery-class
599#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200600#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200601setMethod("show", "KorAPQuery", function(object) {
602 format(object)
603})
Marc Kupietz006b47c2021-01-13 17:00:59 +0100604