blob: f71a4f39b62c8867ae6ce227a1250943182e5a81 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
#' \code{KorAPQuery} objects represent the current state of a query to a KorAP server.
4#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
5#'
6#' @include KorAPConnection.R
Marc Kupietz4de53ec2019-10-04 09:12:00 +02007#' @import jsonlite
Marc Kupietz69cc54a2019-09-30 12:06:54 +02008#' @import tidyr
9#' @import dplyr
Marc Kupietze95108e2019-09-18 13:23:58 +020010#' @import httr
11#'
Marc Kupietz4de53ec2019-10-04 09:12:00 +020012#' @include RKorAPClient.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020013
Marc Kupietze95108e2019-09-18 13:23:58 +020014#' @export
KorAPQuery <- setClass("KorAPQuery", slots = c(
  # All slots are untyped ("ANY"); they are filled in by the initialize method.
  korapConnection  = "ANY", # connection object the query was issued on
  request          = "ANY", # query part of the request URL
  vc               = "ANY", # definition of the virtual corpus
  totalResults     = "ANY", # number of hits the query has yielded
  nextStartIndex   = "ANY", # index at which the next fetch starts
  fields           = "ANY", # (meta)data fields collected per match
  requestUrl       = "ANY", # complete URL of the API request
  webUIRequestUrl  = "ANY", # corresponding web frontend URL
  apiResponse      = "ANY", # last response of the API
  collectedMatches = "ANY", # matches fetched so far
  hasMoreMatches   = "ANY"  # can more results be fetched?
))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020028
Marc Kupietze95108e2019-09-18 13:23:58 +020029#' Method initialize
30#'
31#' @rdname KorAPQuery-class
32#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020033#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020034#' @param request query part of the request URL
35#' @param vc definition of a virtual corpus
36#' @param totalResults number of hits the query has yielded
37#' @param nextStartIndex at what index to start the next fetch of query results
38#' @param fields what data / metadata fields should be collected
39#' @param requestUrl complete URL of the API request
40#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
41#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020042#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @param collectedMatches matches already fetched from the KorAP-API-server
44#' @export
setMethod("initialize", "KorAPQuery",
  function(.Object,
           korapConnection = NULL,
           request = NULL,
           vc = "",
           totalResults = 0,
           nextStartIndex = 0,
           fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace",
                      "availability", "textClass", "snippet"),
           requestUrl = "",
           webUIRequestUrl = "",
           apiResponse = NULL,
           hasMoreMatches = FALSE,
           collectedMatches = NULL) {
    # Let the default initializer run first, then store every argument
    # (or its default) in the slot of the same name.
    .Object <- callNextMethod()
    .Object@korapConnection <- korapConnection
    .Object@request <- request
    .Object@vc <- vc
    .Object@totalResults <- totalResults
    .Object@nextStartIndex <- nextStartIndex
    .Object@fields <- fields
    .Object@requestUrl <- requestUrl
    .Object@webUIRequestUrl <- webUIRequestUrl
    .Object@apiResponse <- apiResponse
    .Object@hasMoreMatches <- hasMoreMatches
    .Object@collectedMatches <- collectedMatches
    .Object
  })
Marc Kupietz632cbd42019-09-06 16:04:51 +020063
# S4 generics; the methods are defined further below in this file.
setGeneric("corpusQuery", function(kco, ...) {
  standardGeneric("corpusQuery")
})
setGeneric("fetchAll", function(kqo, ...) {
  standardGeneric("fetchAll")
})
setGeneric("fetchNext", function(kqo, ...) {
  standardGeneric("fetchNext")
})
setGeneric("fetchRest", function(kqo, ...) {
  standardGeneric("fetchRest")
})

# Upper bound for the number of results requested from the API per call.
maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020070
# Extract the raw (still URL-encoded) value of a query parameter from a URL.
#
# url:       URL string to inspect
# parameter: name of the query parameter; it is matched literally
#
# Returns the text between "<parameter>=" and the next "&" (or the end of
# the string), or "" if the parameter does not occur. If the parameter
# occurs several times, the last occurrence wins (greedy leading ".*").
QueryParameterFromUrl <- function(url, parameter) {
  # \Q...\E quotes the name so that regex metacharacters in it (e.g. ".")
  # are not interpreted as a pattern; this requires the PCRE engine, which
  # is now used for both the test and the extraction.
  regex <- paste0(".*[?&]\\Q", parameter, "\\E=([^&]*).*")
  if (!grepl(regex, url, perl = TRUE)) {
    return("")
  }
  gsub(regex, "\\1", url, perl = TRUE)
}
79
## Register "." as a known global variable: this quiets concerns of
## R CMD check about the .'s that appear in pipelines.
if (getRversion() >= "2.15.1") {
  utils::globalVariables(".")
}
Marc Kupietz632cbd42019-09-06 16:04:51 +020082
Marc Kupietze95108e2019-09-18 13:23:58 +020083#' Method corpusQuery
84#'
85#' Perform a corpus query via a connection to a KorAP-API-server.
86#'
87#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
Akron5e135462019-09-27 16:31:38 +020088#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz632cbd42019-09-06 16:04:51 +020089#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
90#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietz7776dec2019-09-27 16:59:02 +020091#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020092#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020093#' @param fields (meta)data fields that will be fetched for every match.
#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020095#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +020096#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietze95108e2019-09-18 13:23:58 +020097#' @return A \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
98#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
#' Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +0200100#'
101#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +0200102#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200103#' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz3c531f62019-09-13 12:17:24 +0200104#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200105#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
106#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200107#'
108#' new("KorAPConnection", verbose = TRUE) %>%
109#' corpusQuery(KorAPUrl =
110#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +0200111#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200112#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200113#' new("KorAPConnection", verbose=TRUE) %>%
114#' { . ->> kco } %>%
115#' corpusQuery("Ameisenplage") %>%
116#' fetchAll() %>%
117#' slot("collectedMatches") %>%
118#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200119#' select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200120#' group_by(year) %>%
121#' summarise(Count = n()) %>%
122#' mutate(Freq = mapply(function(f, y)
123#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200124#' select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200125#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
126#' plot(type = "l")
Marc Kupietz37b8ef12019-09-16 18:37:49 +0200127#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200128#' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +0200129#'
130#' @references
131#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
132#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200133#' @aliases corpusQuery
Marc Kupietz632cbd42019-09-06 16:04:51 +0200134#' @export
setMethod("corpusQuery", "KorAPConnection",
  function(kco,
           query = if (missing(KorAPUrl)) {
             stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
           } else {
             httr::parse_url(KorAPUrl)$query[["q"]]
           },
           vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query[["cq"]],
           KorAPUrl,
           metadataOnly = TRUE,
           ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query[["ql"]],
           fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace",
                      "availability", "textClass", "snippet"),
           accessRewriteFatal = TRUE,
           verbose = kco@verbose,
           as.df = FALSE) {
    # The URL-derived defaults are NULL when the parameter is absent from
    # KorAPUrl ([[ ]] matches names exactly, unlike $). Fall back to the
    # same values that are used when no KorAPUrl is given at all.
    if (is.null(query)) {
      stop("The given KorAPUrl does not contain a query (parameter q).", call. = FALSE)
    }
    if (is.null(vc)) {
      vc <- ""
    }
    if (is.null(ql)) {
      ql <- "poliqarp"
    }

    # Several queries and/or virtual corpora: run one query per pair and
    # stack the one-row data frames. This branch always returns a data frame.
    if (length(query) > 1 || length(vc) > 1) {
      pairCount <- max(length(query), length(vc))
      queries <- rep(query, length.out = pairCount)
      vcs <- rep(vc, length.out = pairCount)
      return(
        do.call(rbind,
                Map(function(q, cq) corpusQuery(kco, query = q, vc = cq,
                                                metadataOnly = metadataOnly,
                                                ql = ql,
                                                fields = fields,
                                                accessRewriteFatal = accessRewriteFatal,
                                                verbose = verbose,
                                                as.df = TRUE),
                    queries, vcs))
      )
    }

    # Content fields are never requested here.
    contentFields <- c("snippet")
    fields <- fields[!fields %in% contentFields]

    cqPart <- if (isTRUE(vc != "")) {
      paste0("&cq=", URLencode(vc, reserved = TRUE))
    } else {
      ""
    }
    request <- paste0("?q=", URLencode(query, reserved = TRUE), cqPart, "&ql=", ql)
    webUIRequestUrl <- paste0(kco@KorAPUrl, request)
    requestUrl <- paste0(kco@apiUrl, "search", request,
                         "&fields=", paste(fields, collapse = ","),
                         if (metadataOnly) "&access-rewrite-disabled=true" else "")

    log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep="")
    # count=0: only the result metadata (e.g. totalResults) is needed here;
    # the matches themselves are retrieved later by the fetch* methods.
    res <- apiCall(kco, paste0(requestUrl, "&count=0"))
    log.info(verbose, " took ", res$meta$benchmark, "\n", sep="")

    if (as.df) {
      return(data.frame(query = query,
                        totalResults = res$meta$totalResults,
                        vc = vc,
                        webUIRequestUrl = webUIRequestUrl,
                        stringsAsFactors = FALSE))
    }
    KorAPQuery(
      korapConnection = kco,
      nextStartIndex = 0,
      fields = fields,
      requestUrl = requestUrl,
      request = request,
      totalResults = res$meta$totalResults,
      vc = vc,
      apiResponse = res,
      webUIRequestUrl = webUIRequestUrl,
      hasMoreMatches = (res$meta$totalResults > 0)
    )
  })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200185
Marc Kupietz62da2b52019-09-12 17:43:34 +0200186#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200187#'
188#' @param kqo object obtained from \code{\link{corpusQuery}}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200189#' @param offset start offset for query results to fetch
190#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200191#' @param verbose print progress information if true
Marc Kupietze95108e2019-09-18 13:23:58 +0200192#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200193#'
194#' @references
195#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
196#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200197#' @aliases fetchNext
198#' @rdname KorAPQuery-class
Marc Kupietz632cbd42019-09-06 16:04:51 +0200199#' @export
setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
  # Nothing to fetch for an empty result set or an offset past the last hit.
  if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
    return(kqo)
  }

  pageNo <- 1
  fetched <- 0
  collectedMatches <- kqo@collectedMatches

  repeat {
    # Request at most maxResultsPerPage items per call; with a finite
    # maxFetch, only as many as are still wanted.
    wanted <- if (!is.na(maxFetch)) maxFetch - fetched else maxResultsPerPage
    pageUrl <- paste0(kqo@requestUrl,
                      "&count=", min(wanted, maxResultsPerPage),
                      "&offset=", offset + fetched)
    res <- apiCall(kqo@korapConnection, pageUrl)
    if (res$meta$totalResults == 0) {
      return(kqo)
    }

    # Add requested fields that are absent from the response as NA columns.
    for (absentField in setdiff(kqo@fields, colnames(res$matches))) {
      res$matches[, absentField] <- NA
    }

    # Zero-row template with one logical column per requested field, bound
    # with the response and restricted to the requested fields.
    template <- map_dfr(kqo@fields, ~tibble(!!.x := logical()))
    currentMatches <- select(bind_rows(template, res$matches), kqo@fields)

    # pubDate is parsed as a Date; every other column becomes a factor.
    factorCols <- colnames(currentMatches)
    if ("pubDate" %in% kqo@fields) {
      currentMatches$pubDate <- as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
      factorCols <- setdiff(factorCols, "pubDate")
    }
    currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)

    collectedMatches <- if (is.list(collectedMatches)) {
      rbind(collectedMatches, currentMatches)
    } else {
      currentMatches
    }

    if (verbose) {
      pageTotal <- ceiling((res$meta$totalResults) / res$meta$itemsPerPage)
      cat(paste0("Retrieved page ", pageNo, "/", pageTotal, " in ", res$meta$benchmark, "\n"))
    }
    pageNo <- pageNo + 1
    fetched <- fetched + res$meta$itemsPerPage

    # Stop at the end of the result set or once maxFetch is reached.
    if (offset + fetched >= res$meta$totalResults || (!is.na(maxFetch) && fetched >= maxFetch)) {
      break
    }
  }

  nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, res$meta$totalResults)
  KorAPQuery(korapConnection = kqo@korapConnection,
             request = kqo@request,
             vc = kqo@vc,
             totalResults = res$meta$totalResults,
             nextStartIndex = nextStartIndex,
             fields = kqo@fields,
             requestUrl = kqo@requestUrl,
             webUIRequestUrl = kqo@webUIRequestUrl,
             apiResponse = res,
             hasMoreMatches = (res$meta$totalResults > nextStartIndex),
             collectedMatches = collectedMatches)
})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200257
258#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200259#'
260#' @examples
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200261#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200262#' q@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200263#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200264#' @aliases fetchAll
265#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200266#' @export
setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
  # fetchNext() appends to the matches already stored in the object. As
  # fetching restarts at offset 0 here, previously collected matches are
  # dropped first; otherwise they would end up in the result twice.
  kqo@collectedMatches <- NULL
  fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose)
})
270
271#' Fetches the remaining results of a KorAP query.
272#'
273#' @examples
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200274#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200275#' q@collectedMatches
276#'
277#' @aliases fetchRest
278#' @rdname KorAPQuery-class
279#' @export
setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
  # maxFetch = NA removes the fetch limit; the offset keeps fetchNext's
  # default, i.e. the object's nextStartIndex.
  fetchNext(kqo, maxFetch = NA, verbose = verbose)
})
283
#' format()
285#' @rdname KorAPQuery-class
286#' @param x KorAPQuery object
287#' @param ... further arguments passed to or from other methods
288#' @export
format.KorAPQuery <- function(x, ...) {
  # Prints a human-readable overview of the query object with cat()/print():
  # the query, the virtual corpus (if any), a summary of the matches
  # collected so far, and the result counters.
  cat("<KorAPQuery>\n")
  aurl <- httr::parse_url(x@request)
  cat(" Query: ", aurl$query[["q"]], "\n")
  # corpusQuery() puts the virtual corpus into the "cq" URL parameter, so
  # that is the name to look up here.
  cq <- aurl$query[["cq"]]
  if (!is.null(cq) && cq != "") {
    cat("Virtual corpus: ", cq, "\n")
  }
  if (!is.null(x@collectedMatches)) {
    cat("==============================================================================================================", "\n")
    print(summary(x@collectedMatches))
    cat("==============================================================================================================", "\n")
  }
  cat(" Total results: ", x@totalResults, "\n")
  cat(" Fetched results: ", x@nextStartIndex, "\n")
}
305
Marc Kupietze95108e2019-09-18 13:23:58 +0200306#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200307#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200308#' @rdname KorAPQuery-class
309#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200310#' @export
# Displaying a KorAPQuery object delegates to format().
setMethod("show", "KorAPQuery", function(object) format(object))