blob: 432dea40b323f5d62907fd75ff7de7fc359db0f4 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' This class provides methods to perform different kinds of queries on the KorAP API server.
Marc Kupietz67edcb52021-09-20 21:54:24 +02004#' `KorAPQuery` objects, which are typically created by the [corpusQuery()] method,
Marc Kupietza6e4ee62021-03-05 09:00:15 +01005#' represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02006#'
7#' @include KorAPConnection.R
Marc Kupietze95108e2019-09-18 13:23:58 +02008#' @import httr
9#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietze95108e2019-09-18 13:23:58 +020027#' Method initialize
28#'
29#' @rdname KorAPQuery-class
30#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020042#'
43#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020044#' @export
45setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020046 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz2078bde2023-08-27 16:46:15 +020047 "availability", "textClass", "snippet", "tokens"),
Marc Kupietze95108e2019-09-18 13:23:58 +020048 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
49 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020050 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020051 .Object@request = request
52 .Object@vc = vc
53 .Object@totalResults = totalResults
54 .Object@nextStartIndex = nextStartIndex
55 .Object@fields = fields
56 .Object@requestUrl = requestUrl
57 .Object@webUIRequestUrl = webUIRequestUrl
58 .Object@apiResponse = apiResponse
59 .Object@hasMoreMatches = hasMoreMatches
60 .Object@collectedMatches = collectedMatches
61 .Object
62 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020063
Marc Kupietze95108e2019-09-18 13:23:58 +020064setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
65setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
66setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
67setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
Marc Kupietz3f575282019-10-04 14:46:04 +020068setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
Marc Kupietze95108e2019-09-18 13:23:58 +020069
70maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020071
Marc Kupietz4de53ec2019-10-04 09:12:00 +020072## quiets concerns of R CMD check re: the .'s that appear in pipelines
73if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020074
Marc Kupietzdbd431a2021-08-29 12:17:45 +020075#' Corpus query
76#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020077#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020078#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020079#' @rdname KorAPQuery-class
80#' @aliases corpusQuery
81#'
82#' @importFrom urltools url_encode
83#' @importFrom purrr pmap
84#' @importFrom dplyr bind_rows
85#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020086#' @param kco [KorAPConnection()] object (obtained e.g. from `new("KorAPConnection")`
87#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020088#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020089#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz132f0052023-04-16 14:23:05 +020090#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
91#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
92#' your RKorAPClient application as explained in the
93#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
94#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
95#' `FALSE`.
Marc Kupietz67edcb52021-09-20 21:54:24 +020096#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020097#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +010098#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020099#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200100#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz67edcb52021-09-20 21:54:24 +0200101#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200102#' @param context string that specifies the size of the left and the right context returned in `snippet`
103#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
104#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
105#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
106#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
107#' as this is typically legally motivated.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200108#' @return Depending on the `as.df` parameter, a table or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
109#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
110#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200111#'
112#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200113#' \dontrun{
114#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200115#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200116#' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100117#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200118#'
Marc Kupietz6ae76052021-09-21 10:34:00 +0200119#' \dontrun{
120#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200121#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
122#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200123#'
124#' new("KorAPConnection", verbose = TRUE) %>%
125#' corpusQuery(KorAPUrl =
126#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz6ae76052021-09-21 10:34:00 +0200127#' }
128#'
129#' \dontrun{
Marc Kupietz3c531f62019-09-13 12:17:24 +0200130#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200131#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200132#' new("KorAPConnection", verbose=TRUE) %>%
133#' { . ->> kco } %>%
134#' corpusQuery("Ameisenplage") %>%
135#' fetchAll() %>%
136#' slot("collectedMatches") %>%
137#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200138#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200139#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200140#' summarise(Count = dplyr::n()) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200141#' mutate(Freq = mapply(function(f, y)
142#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200143#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200144#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
145#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100146#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200147#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200148#'
149#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200150#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200151#'
152#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200153setMethod("corpusQuery", "KorAPConnection",
Marc Kupietza96537f2019-11-09 23:07:44 +0100154 function(kco,
155 query = if (missing(KorAPUrl))
156 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
157 else
158 httr::parse_url(KorAPUrl)$query$q,
159 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
160 KorAPUrl,
161 metadataOnly = TRUE,
162 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
163 fields = c(
164 "corpusSigle",
165 "textSigle",
166 "pubDate",
167 "pubPlace",
168 "availability",
169 "textClass",
Marc Kupietz2078bde2023-08-27 16:46:15 +0200170 "snippet",
171 "tokens"
Marc Kupietza96537f2019-11-09 23:07:44 +0100172 ),
173 accessRewriteFatal = TRUE,
174 verbose = kco@verbose,
175 expand = length(vc) != length(query),
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200176 as.df = FALSE,
177 context = NULL) {
Marc Kupietza96537f2019-11-09 23:07:44 +0100178 if (length(query) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200179 grid <- if (expand) expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc)
180 purrr::pmap(grid, function(query, vc, ...)
181 corpusQuery(kco, query=query, vc=vc, ql=ql, verbose=verbose, as.df = TRUE)) %>%
182 bind_rows()
183 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200184 contentFields <- c("snippet", "tokens")
Marc Kupietza96537f2019-11-09 23:07:44 +0100185 if (metadataOnly) {
186 fields <- fields[!fields %in% contentFields]
187 }
188 request <-
189 paste0('?q=',
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200190 url_encode(enc2utf8(query)),
Marc Kupietzd9b2fd72023-04-17 19:08:50 +0200191 ifelse (!metadataOnly && ! is.null(context) && context != '', paste0('&context=', url_encode(enc2utf8(context))), ''),
Marc Kupietz2078bde2023-08-27 16:46:15 +0200192 ifelse (vc != '', paste0('&cq=', url_encode(enc2utf8(vc))), ''),
193 ifelse (!metadataOnly, '&show-tokens=true', ''),
194 '&ql=', ql)
Marc Kupietza96537f2019-11-09 23:07:44 +0100195 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
196 requestUrl <- paste0(
197 kco@apiUrl,
198 'search',
199 request,
200 '&fields=',
201 paste(fields, collapse = ","),
202 if (metadataOnly) '&access-rewrite-disabled=true' else ''
203 )
Marc Kupietza47d1502023-04-18 15:26:47 +0200204 log_info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
Marc Kupietza96537f2019-11-09 23:07:44 +0100205 "")
206 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietza4675722022-02-23 23:55:15 +0100207 if (is.null(res)) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200208 log_info(verbose, " [failed]\n")
Marc Kupietza4675722022-02-23 23:55:15 +0100209 message("API call failed.")
210 totalResults <- 0
211 } else {
Marc Kupietz41d4e352024-03-11 21:48:55 +0100212 totalResults <-as.integer(res$meta$totalResults)
Marc Kupietza47d1502023-04-18 15:26:47 +0200213 log_info(verbose, ": ", totalResults, " hits")
Marc Kupietza4675722022-02-23 23:55:15 +0100214 if(!is.null(res$meta$cached))
Marc Kupietza47d1502023-04-18 15:26:47 +0200215 log_info(verbose, " [cached]\n")
Marc Kupietza4675722022-02-23 23:55:15 +0100216 else
Marc Kupietza47d1502023-04-18 15:26:47 +0200217 log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
Marc Kupietza4675722022-02-23 23:55:15 +0100218 }
Marc Kupietza96537f2019-11-09 23:07:44 +0100219 if (as.df)
220 data.frame(
221 query = query,
Marc Kupietza4675722022-02-23 23:55:15 +0100222 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100223 vc = vc,
224 webUIRequestUrl = webUIRequestUrl,
225 stringsAsFactors = FALSE
226 )
227 else
228 KorAPQuery(
229 korapConnection = kco,
230 nextStartIndex = 0,
231 fields = fields,
232 requestUrl = requestUrl,
233 request = request,
Marc Kupietza4675722022-02-23 23:55:15 +0100234 totalResults = totalResults,
Marc Kupietza96537f2019-11-09 23:07:44 +0100235 vc = vc,
236 apiResponse = res,
237 webUIRequestUrl = webUIRequestUrl,
Marc Kupietza4675722022-02-23 23:55:15 +0100238 hasMoreMatches = (totalResults > 0),
Marc Kupietza96537f2019-11-09 23:07:44 +0100239 )
240 }
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200241 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200242
Marc Kupietz05a60792024-12-07 16:23:31 +0100243#' @importFrom purrr map
244repair_data_strcuture <- function(x) {
245 if (is.list(x))
246 as.character (purrr::map(x, ~ if (length(.x) > 1) {
247 paste(.x, collapse = " ")
248 } else {
249 .x
250 }))
251 else
252 ifelse(is.na(x), "", x)
253}
254
Marc Kupietz62da2b52019-09-12 17:43:34 +0200255#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200256#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200257#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200258#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200259#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200260#' @param offset start offset for query results to fetch
261#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200262#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200263#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
264#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200265#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100266#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200267#' \dontrun{
268#'
269#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100270#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100271#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100272#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200273#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200274#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200275#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200276#' @aliases fetchNext
277#' @rdname KorAPQuery-class
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200278#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietzf4881122024-12-17 14:55:39 +0100279#' @importFrom tibble enframe add_column
280#' @importFrom stringr word
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200281#' @importFrom tidyr unnest unchop pivot_wider
282#' @importFrom purrr map
Marc Kupietz632cbd42019-09-06 16:04:51 +0200283#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200284setMethod("fetchNext", "KorAPQuery", function(kqo,
285 offset = kqo@nextStartIndex,
286 maxFetch = maxResultsPerPage,
287 verbose = kqo@korapConnection@verbose,
288 randomizePageOrder = FALSE) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200289 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
290 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200291 }
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200292 use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz705488d2021-06-30 18:26:36 +0200293 page <- kqo@nextStartIndex / maxResultsPerPage + 1
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200294 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200295 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200296 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200297
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200298 if (randomizePageOrder) {
299 pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
300 }
301
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200302 if(is.null(collectedMatches)) {
303 collectedMatches <- data.frame()
304 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200305 repeat {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200306 page = nrow(collectedMatches) %/% maxResultsPerPage + 1
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200307 currentOffset = ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
308 query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', currentOffset, '&cutoff=true')
Marc Kupietz68170952021-06-30 09:37:21 +0200309 res <- apiCall(kqo@korapConnection, query)
310 if (length(res$matches) == 0) {
311 break
312 }
313
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200314 if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) || as.numeric(use_korap_api) >= 1.0)) {
315 if (verbose) cat("Using fields API: ")
Marc Kupietz05a60792024-12-07 16:23:31 +0100316 currentMatches <- res$matches$fields %>%
317 purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
318 tibble::enframe() %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200319 tidyr::unnest(cols = value) %>%
320 tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200321 dplyr::select(-name)
322 if("snippet" %in% colnames(res$matches)) {
323 currentMatches$snippet <- res$matches$snippet
324 }
325 } else {
326 currentMatches <- res$matches
327 }
328
Marc Kupietze95108e2019-09-18 13:23:58 +0200329 for (field in kqo@fields) {
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200330 if (!field %in% colnames(currentMatches)) {
331 currentMatches[, field] <- NA
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200332 }
333 }
Marc Kupietzf4881122024-12-17 14:55:39 +0100334 currentMatches <- currentMatches %>%
335 select(kqo@fields) %>%
336 mutate(
337 tmp_positions = gsub(".*-p(\\d+)-(\\d+)", "\\1 \\2", res$matches$matchID),
338 matchStart = as.integer(stringr::word(tmp_positions, 1)),
339 matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
340 ) %>%
341 select(-tmp_positions)
342
Marc Kupietz62da2b52019-09-12 17:43:34 +0200343 if (!is.list(collectedMatches)) {
344 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200345 } else {
Marc Kupietz2078bde2023-08-27 16:46:15 +0200346 collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200347 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200348 if (verbose) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200349 cat(paste0(
350 "Retrieved page ",
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200351 ceiling(nrow(collectedMatches) / res$meta$itemsPerPage),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200352 "/",
353 if (!is.na(maxFetch) && maxFetch < kqo@totalResults)
354 sprintf("%d (%d)", ceiling(maxFetch / res$meta$itemsPerPage), ceiling(kqo@totalResults / res$meta$itemsPerPage))
355 else
356 sprintf("%d", ceiling(kqo@totalResults / res$meta$itemsPerPage)),
357 ' in ',
358 res$meta$benchmark,
359 '\n'
360 ))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200361 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200362 page <- page + 1
363 results <- results + res$meta$itemsPerPage
Marc Kupietze8bd49b2024-06-28 07:24:44 +0200364 if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200365 break
366 }
367 }
Marc Kupietz68170952021-06-30 09:37:21 +0200368 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietze95108e2019-09-18 13:23:58 +0200369 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200370 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200371 fields = kqo@fields,
372 requestUrl = kqo@requestUrl,
373 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200374 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200375 vc = kqo@vc,
376 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200377 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200378 apiResponse = res,
379 collectedMatches = collectedMatches)
380})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200381
382#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200383#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200384#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100385#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200386#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200387#' \dontrun{
388#'
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200389#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200390#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100391#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200392#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200393#' @aliases fetchAll
394#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200395#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200396setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
397 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200398})
399
400#' Fetches the remaining results of a KorAP query.
401#'
402#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200403#' \dontrun{
404#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100405#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200406#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100407#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200408#'
409#' @aliases fetchRest
410#' @rdname KorAPQuery-class
411#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200412setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
413 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200414})
415
Marc Kupietz3f575282019-10-04 14:46:04 +0200416#' Query relative frequency of search term(s)
417#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200418#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
419#' [ci()] to compute a table with the relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200420#' confidence intervals of one ore multiple search terms across one or multiple
421#' virtual corpora.
422#'
423#' @aliases frequencyQuery
424#' @rdname KorAPQuery-class
425#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +0200426#' \dontrun{
427#'
Marc Kupietz3f575282019-10-04 14:46:04 +0200428#' new("KorAPConnection", verbose = TRUE) %>%
429#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100430#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200431#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200432#' @param kco [KorAPConnection()] object (obtained e.g. from `new("KorAPConnection")`
433#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
434#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
435#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz3f575282019-10-04 14:46:04 +0200436#' @export
437setMethod("frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100438 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
439 (if (as.alternatives) {
440 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
441 group_by(vc) %>%
442 mutate(total = sum(totalResults))
443 } else {
444 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
445 mutate(total = corpusStats(kco, vc=vc, as.df=TRUE)$tokens)
446 } ) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200447 ci(conf.level = conf.level)
Marc Kupietz3f575282019-10-04 14:46:04 +0200448})
449
Marc Kupietz38a9d682024-12-06 16:17:09 +0100450#' buildWebUIRequestUrlFromString
451#'
452#' @rdname KorAPQuery-class
453#' @importFrom urltools url_encode
454#' @export
455buildWebUIRequestUrlFromString <- function(KorAPUrl,
456 query,
457 vc = "",
458 ql = "poliqarp"
459) {
460 if ("KorAPConnection" %in% class(KorAPUrl)) {
461 KorAPUrl <- KorAPUrl@KorAPUrl
462 }
463
464 request <-
465 paste0(
466 '?q=',
467 urltools::url_encode(enc2utf8(as.character(query))),
468 ifelse(vc != '',
469 paste0('&cq=', urltools::url_encode(enc2utf8(vc))),
470 ''),
471 '&ql=',
472 ql
473 )
474 paste0(KorAPUrl, request)
475}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200476
477#' buildWebUIRequestUrl
478#'
479#' @rdname KorAPQuery-class
Marc Kupietz38a9d682024-12-06 16:17:09 +0100480#' @importFrom httr parse_url
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200481#' @export
482buildWebUIRequestUrl <- function(kco,
483 query = if (missing(KorAPUrl))
484 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
485 else
486 httr::parse_url(KorAPUrl)$query$q,
487 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
488 KorAPUrl,
Marc Kupietz38a9d682024-12-06 16:17:09 +0100489 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql) {
490
491 buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200492}
493
Marc Kupietze95108e2019-09-18 13:23:58 +0200494#´ format()
495#' @rdname KorAPQuery-class
496#' @param x KorAPQuery object
497#' @param ... further arguments passed to or from other methods
498#' @export
499format.KorAPQuery <- function(x, ...) {
500 cat("<KorAPQuery>\n")
501 q <- x
502 aurl = parse_url(q@request)
Marc Kupietz0d4c9092020-03-23 09:02:30 +0100503 cat(" Query: ", aurl$query$q, "\n")
504 if (!is.null(aurl$query$cq) && aurl$query$cq != "") {
505 cat(" Virtual corpus: ", aurl$query$cq, "\n")
Marc Kupietze95108e2019-09-18 13:23:58 +0200506 }
507 if (!is.null(q@collectedMatches)) {
508 cat("==============================================================================================================", "\n")
509 print(summary(q@collectedMatches))
510 cat("==============================================================================================================", "\n")
511 }
512 cat(" Total results: ", q@totalResults, "\n")
513 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200514}
515
Marc Kupietze95108e2019-09-18 13:23:58 +0200516#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200517#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200518#' @rdname KorAPQuery-class
519#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200520#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200521setMethod("show", "KorAPQuery", function(object) {
522 format(object)
523})
Marc Kupietz006b47c2021-01-13 17:00:59 +0100524