blob: e6f240efbd81fa8e609d382588edae9e50a672b6 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' This class provides methods to perform different kinds of queries on the KorAP API server.
Marc Kupietz67edcb52021-09-20 21:54:24 +02004#' `KorAPQuery` objects, which are typically created by the [corpusQuery()] method,
Marc Kupietza6e4ee62021-03-05 09:00:15 +01005#' represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02006#'
7#' @include KorAPConnection.R
Marc Kupietze95108e2019-09-18 13:23:58 +02008#' @import httr
9#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietze95108e2019-09-18 13:23:58 +020027#' Method initialize
28#'
29#' @rdname KorAPQuery-class
30#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020042#'
43#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020044#' @export
45setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020046 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietze95108e2019-09-18 13:23:58 +020047 "availability", "textClass", "snippet"),
48 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
49 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020050 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020051 .Object@request = request
52 .Object@vc = vc
53 .Object@totalResults = totalResults
54 .Object@nextStartIndex = nextStartIndex
55 .Object@fields = fields
56 .Object@requestUrl = requestUrl
57 .Object@webUIRequestUrl = webUIRequestUrl
58 .Object@apiResponse = apiResponse
59 .Object@hasMoreMatches = hasMoreMatches
60 .Object@collectedMatches = collectedMatches
61 .Object
62 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020063
Marc Kupietze95108e2019-09-18 13:23:58 +020064setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
65setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
66setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
67setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
Marc Kupietz3f575282019-10-04 14:46:04 +020068setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
Marc Kupietze95108e2019-09-18 13:23:58 +020069
70maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020071
Marc Kupietz4de53ec2019-10-04 09:12:00 +020072## quiets concerns of R CMD check re: the .'s that appear in pipelines
73if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020074
Marc Kupietzdbd431a2021-08-29 12:17:45 +020075#' Corpus query
76#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020077#' **`corpusQuery`** performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020078#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020079#' @rdname KorAPQuery-class
80#' @aliases corpusQuery
81#'
82#' @importFrom urltools url_encode
83#' @importFrom purrr pmap
84#' @importFrom dplyr bind_rows
85#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020086#' @param kco [KorAPConnection()] object (obtained e.g. from `new("KorAPConnection")`
87#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz632cbd42019-09-06 16:04:51 +020088#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz67edcb52021-09-20 21:54:24 +020089#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz7776dec2019-09-27 16:59:02 +020090#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz67edcb52021-09-20 21:54:24 +020091#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020092#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +010093#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020094#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +020095#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz67edcb52021-09-20 21:54:24 +020096#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations
97#' @return Depending on the `as.df` parameter, a table or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
98#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
99#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200100#'
101#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +0200102#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz657d8e72020-02-25 18:31:50 +0100103#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200104#' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100105#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200106#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200107#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
108#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200109#'
110#' new("KorAPConnection", verbose = TRUE) %>%
111#' corpusQuery(KorAPUrl =
112#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +0200113#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200114#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz05b22772020-02-18 21:58:42 +0100115#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200116#' new("KorAPConnection", verbose=TRUE) %>%
117#' { . ->> kco } %>%
118#' corpusQuery("Ameisenplage") %>%
119#' fetchAll() %>%
120#' slot("collectedMatches") %>%
121#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200122#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200123#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200124#' summarise(Count = dplyr::n()) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200125#' mutate(Freq = mapply(function(f, y)
126#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200127#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200128#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
129#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100130#' }
Marc Kupietz67edcb52021-09-20 21:54:24 +0200131#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz632cbd42019-09-06 16:04:51 +0200132#'
133#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200134#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz632cbd42019-09-06 16:04:51 +0200135#'
136#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200137setMethod("corpusQuery", "KorAPConnection",
Marc Kupietza96537f2019-11-09 23:07:44 +0100138 function(kco,
139 query = if (missing(KorAPUrl))
140 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
141 else
142 httr::parse_url(KorAPUrl)$query$q,
143 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
144 KorAPUrl,
145 metadataOnly = TRUE,
146 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
147 fields = c(
148 "corpusSigle",
149 "textSigle",
150 "pubDate",
151 "pubPlace",
152 "availability",
153 "textClass",
154 "snippet"
155 ),
156 accessRewriteFatal = TRUE,
157 verbose = kco@verbose,
158 expand = length(vc) != length(query),
159 as.df = FALSE) {
160 if (length(query) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200161 grid <- if (expand) expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc)
162 purrr::pmap(grid, function(query, vc, ...)
163 corpusQuery(kco, query=query, vc=vc, ql=ql, verbose=verbose, as.df = TRUE)) %>%
164 bind_rows()
165 } else {
Marc Kupietza96537f2019-11-09 23:07:44 +0100166 contentFields <- c("snippet")
167 if (metadataOnly) {
168 fields <- fields[!fields %in% contentFields]
169 }
170 request <-
171 paste0('?q=',
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200172 url_encode(enc2utf8(query)),
173 ifelse (vc != '', paste0('&cq=', url_encode(enc2utf8(vc))), ''), '&ql=', ql)
Marc Kupietza96537f2019-11-09 23:07:44 +0100174 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
175 requestUrl <- paste0(
176 kco@apiUrl,
177 'search',
178 request,
179 '&fields=',
180 paste(fields, collapse = ","),
181 if (metadataOnly) '&access-rewrite-disabled=true' else ''
182 )
183 log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
184 "")
185 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200186 log.info(verbose, ": ", res$meta$totalResults, " hits")
Marc Kupietzf5769b62019-12-13 09:19:45 +0100187 if(!is.null(res$meta$cached))
188 log.info(verbose, " [cached]\n")
189 else
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200190 log.info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
Marc Kupietza96537f2019-11-09 23:07:44 +0100191 if (as.df)
192 data.frame(
193 query = query,
194 totalResults = res$meta$totalResults,
195 vc = vc,
196 webUIRequestUrl = webUIRequestUrl,
197 stringsAsFactors = FALSE
198 )
199 else
200 KorAPQuery(
201 korapConnection = kco,
202 nextStartIndex = 0,
203 fields = fields,
204 requestUrl = requestUrl,
205 request = request,
206 totalResults = res$meta$totalResults,
207 vc = vc,
208 apiResponse = res,
209 webUIRequestUrl = webUIRequestUrl,
210 hasMoreMatches = (res$meta$totalResults > 0),
211 )
212 }
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200213 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200214
Marc Kupietz62da2b52019-09-12 17:43:34 +0200215#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200216#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200217#' **`fetchNext`** fetches the next bunch of results of a KorAP query.
Marc Kupietz3f575282019-10-04 14:46:04 +0200218#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200219#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz62da2b52019-09-12 17:43:34 +0200220#' @param offset start offset for query results to fetch
221#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200222#' @param verbose print progress information if true
Marc Kupietz67edcb52021-09-20 21:54:24 +0200223#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
224#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz62da2b52019-09-12 17:43:34 +0200225#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100226#' @examples
Marc Kupietz657d8e72020-02-25 18:31:50 +0100227#' \donttest{q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100228#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100229#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100230#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200231#' @references
Marc Kupietz67edcb52021-09-20 21:54:24 +0200232#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz62da2b52019-09-12 17:43:34 +0200233#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200234#' @aliases fetchNext
235#' @rdname KorAPQuery-class
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200236#' @importFrom dplyr rowwise bind_rows select summarise n
Marc Kupietz632cbd42019-09-06 16:04:51 +0200237#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200238setMethod("fetchNext", "KorAPQuery", function(kqo,
239 offset = kqo@nextStartIndex,
240 maxFetch = maxResultsPerPage,
241 verbose = kqo@korapConnection@verbose,
242 randomizePageOrder = FALSE) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200243 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
244 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200245 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200246
Marc Kupietz705488d2021-06-30 18:26:36 +0200247 page <- kqo@nextStartIndex / maxResultsPerPage + 1
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200248 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200249 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200250 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200251
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200252 if (randomizePageOrder) {
253 pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
254 }
255
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200256 repeat {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200257 page = length(collectedMatches[,1]) %/% maxResultsPerPage + 1
258 currentOffset = ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
259 query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', currentOffset, '&cutoff=true')
Marc Kupietz68170952021-06-30 09:37:21 +0200260 res <- apiCall(kqo@korapConnection, query)
261 if (length(res$matches) == 0) {
262 break
263 }
264
Marc Kupietze95108e2019-09-18 13:23:58 +0200265 for (field in kqo@fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200266 if (!field %in% colnames(res$matches)) {
267 res$matches[, field] <- NA
268 }
269 }
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200270 currentMatches <-
Marc Kupietzf75ab0b2020-06-02 12:31:18 +0200271 res$matches %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200272 dplyr::select(kqo@fields)
Marc Kupietz36d12d92019-09-27 18:13:27 +0200273 if ("pubDate" %in% kqo@fields) {
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200274 currentMatches$pubDate <- currentMatches$pubDate %>% as.Date(format = "%Y-%m-%d")
275 factorCols <- currentMatches %>% select(-pubDate) %>% colnames()
Marc Kupietz36d12d92019-09-27 18:13:27 +0200276 } else {
277 factorCols <- colnames(currentMatches)
278 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200279 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200280 if (!is.list(collectedMatches)) {
281 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200282 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200283 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200284 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200285 if (verbose) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200286 cat(paste0(
287 "Retrieved page ",
288 ceiling(length(collectedMatches[, 1]) / res$meta$itemsPerPage),
289 "/",
290 if (!is.na(maxFetch) && maxFetch < kqo@totalResults)
291 sprintf("%d (%d)", ceiling(maxFetch / res$meta$itemsPerPage), ceiling(kqo@totalResults / res$meta$itemsPerPage))
292 else
293 sprintf("%d", ceiling(kqo@totalResults / res$meta$itemsPerPage)),
294 ' in ',
295 res$meta$benchmark,
296 '\n'
297 ))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200298 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200299 page <- page + 1
300 results <- results + res$meta$itemsPerPage
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200301 if (length(collectedMatches[,1]) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200302 break
303 }
304 }
Marc Kupietz68170952021-06-30 09:37:21 +0200305 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietze95108e2019-09-18 13:23:58 +0200306 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200307 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200308 fields = kqo@fields,
309 requestUrl = kqo@requestUrl,
310 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200311 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200312 vc = kqo@vc,
313 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200314 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200315 apiResponse = res,
316 collectedMatches = collectedMatches)
317})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200318
319#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200320#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200321#' **`fetchAll`** fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100322#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200323#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100324#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200325#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200326#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100327#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200328#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200329#' @aliases fetchAll
330#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200331#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200332setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
333 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200334})
335
336#' Fetches the remaining results of a KorAP query.
337#'
338#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100339#' \donttest{
340#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200341#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100342#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200343#'
344#' @aliases fetchRest
345#' @rdname KorAPQuery-class
346#' @export
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200347setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
348 return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietze95108e2019-09-18 13:23:58 +0200349})
350
Marc Kupietz3f575282019-10-04 14:46:04 +0200351#' Query relative frequency of search term(s)
352#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200353#' **`frequencyQuery`** combines [corpusQuery()], [corpusStats()] and
354#' [ci()] to compute a table with the relative frequencies and
Marc Kupietz3f575282019-10-04 14:46:04 +0200355#' confidence intervals of one ore multiple search terms across one or multiple
356#' virtual corpora.
357#'
358#' @aliases frequencyQuery
359#' @rdname KorAPQuery-class
360#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100361#' \donttest{
Marc Kupietz3f575282019-10-04 14:46:04 +0200362#' new("KorAPConnection", verbose = TRUE) %>%
363#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100364#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200365#'
Marc Kupietz67edcb52021-09-20 21:54:24 +0200366#' @param kco [KorAPConnection()] object (obtained e.g. from `new("KorAPConnection")`
367#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
368#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
369#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz3f575282019-10-04 14:46:04 +0200370#' @export
371setMethod("frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100372 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
373 (if (as.alternatives) {
374 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
375 group_by(vc) %>%
376 mutate(total = sum(totalResults))
377 } else {
378 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
379 mutate(total = corpusStats(kco, vc=vc, as.df=TRUE)$tokens)
380 } ) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200381 ci(conf.level = conf.level)
Marc Kupietz3f575282019-10-04 14:46:04 +0200382})
383
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200384
385#' buildWebUIRequestUrl
386#'
387#' @rdname KorAPQuery-class
388#' @importFrom urltools url_encode
389#' @export
390buildWebUIRequestUrl <- function(kco,
391 query = if (missing(KorAPUrl))
392 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
393 else
394 httr::parse_url(KorAPUrl)$query$q,
395 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
396 KorAPUrl,
397 metadataOnly = TRUE,
398 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
399 fields = c(
400 "corpusSigle",
401 "textSigle",
402 "pubDate",
403 "pubPlace",
404 "availability",
405 "textClass",
406 "snippet"
407 ),
408 accessRewriteFatal = TRUE) {
409 request <-
410 paste0(
411 '?q=',
412 urltools::url_encode(enc2utf8(as.character(query))),
413 ifelse(vc != '',
414 paste0('&cq=', urltools::url_encode(enc2utf8(vc))),
415 ''),
416 '&ql=',
417 ql
418 )
419 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
420 requestUrl <- paste0(
421 kco@apiUrl,
422 'search',
423 request,
424 '&fields=',
425 paste(fields, collapse = ","),
426 if (metadataOnly)
427 '&access-rewrite-disabled=true'
428 else
429 ''
430 )
431 webUIRequestUrl
432}
433
Marc Kupietze95108e2019-09-18 13:23:58 +0200434#´ format()
435#' @rdname KorAPQuery-class
436#' @param x KorAPQuery object
437#' @param ... further arguments passed to or from other methods
438#' @export
439format.KorAPQuery <- function(x, ...) {
440 cat("<KorAPQuery>\n")
441 q <- x
442 aurl = parse_url(q@request)
Marc Kupietz0d4c9092020-03-23 09:02:30 +0100443 cat(" Query: ", aurl$query$q, "\n")
444 if (!is.null(aurl$query$cq) && aurl$query$cq != "") {
445 cat(" Virtual corpus: ", aurl$query$cq, "\n")
Marc Kupietze95108e2019-09-18 13:23:58 +0200446 }
447 if (!is.null(q@collectedMatches)) {
448 cat("==============================================================================================================", "\n")
449 print(summary(q@collectedMatches))
450 cat("==============================================================================================================", "\n")
451 }
452 cat(" Total results: ", q@totalResults, "\n")
453 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200454}
455
Marc Kupietze95108e2019-09-18 13:23:58 +0200456#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200457#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200458#' @rdname KorAPQuery-class
459#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200460#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200461setMethod("show", "KorAPQuery", function(object) {
462 format(object)
463})
Marc Kupietz006b47c2021-01-13 17:00:59 +0100464