blob: 3cc0b376b570ff0b9485f18f1fd8fff9212f092e [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
3#' \code{KorAPQuery} objetcs represent the current state of a query to a KorAP server.
4#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
5#'
6#' @include KorAPConnection.R
Marc Kupietz69cc54a2019-09-30 12:06:54 +02007#' @import tidyr
8#' @import dplyr
Marc Kupietze95108e2019-09-18 13:23:58 +02009#' @import httr
10#'
11#'
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020012
Marc Kupietze95108e2019-09-18 13:23:58 +020013#' @export
14KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020015 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020016 "request",
17 "vc",
18 "totalResults",
19 "nextStartIndex",
20 "fields",
21 "requestUrl",
22 "webUIRequestUrl",
23 "apiResponse",
24 "collectedMatches",
25 "hasMoreMatches"
26))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020027
Marc Kupietze95108e2019-09-18 13:23:58 +020028#' Method initialize
29#'
30#' @rdname KorAPQuery-class
31#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020032#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020033#' @param request query part of the request URL
34#' @param vc definition of a virtual corpus
35#' @param totalResults number of hits the query has yielded
36#' @param nextStartIndex at what index to start the next fetch of query results
37#' @param fields what data / metadata fields should be collected
38#' @param requestUrl complete URL of the API request
39#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
40#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020041#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020042#' @param collectedMatches matches already fetched from the KorAP-API-server
43#' @export
44setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020045 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietze95108e2019-09-18 13:23:58 +020046 "availability", "textClass", "snippet"),
47 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
48 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020049 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020050 .Object@request = request
51 .Object@vc = vc
52 .Object@totalResults = totalResults
53 .Object@nextStartIndex = nextStartIndex
54 .Object@fields = fields
55 .Object@requestUrl = requestUrl
56 .Object@webUIRequestUrl = webUIRequestUrl
57 .Object@apiResponse = apiResponse
58 .Object@hasMoreMatches = hasMoreMatches
59 .Object@collectedMatches = collectedMatches
60 .Object
61 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020062
Marc Kupietze95108e2019-09-18 13:23:58 +020063setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
64setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
65setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
66setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
67
68maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020069
Marc Kupietz632cbd42019-09-06 16:04:51 +020070QueryParameterFromUrl <- function(url, parameter) {
71 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
72 if (grepl(regex, url)) {
73 return(gsub(regex, '\\1', url, perl = TRUE))
74 } else {
75 return("")
76 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020077}
78
Marc Kupietz632cbd42019-09-06 16:04:51 +020079KorAPQueryStringFromUrl <- function(KorAPUrl) {
80 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
81}
82
Marc Kupietze95108e2019-09-18 13:23:58 +020083#' Method corpusQuery
84#'
85#' Perform a corpus query via a connection to a KorAP-API-server.
86#'
87#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
Akron5e135462019-09-27 16:31:38 +020088#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz632cbd42019-09-06 16:04:51 +020089#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
90#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietz7776dec2019-09-27 16:59:02 +020091#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020092#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020093#' @param fields (meta)data fields that will be fetched for every match.
94#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficent rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020095#' @param verbose print some info
Marc Kupietze95108e2019-09-18 13:23:58 +020096#' @return A \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
97#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
Marc Kupietz62da2b52019-09-12 17:43:34 +020098#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020099#'
100#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +0200101#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200102#' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz3c531f62019-09-13 12:17:24 +0200103#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200104#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
105#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200106#'
107#' new("KorAPConnection", verbose = TRUE) %>%
108#' corpusQuery(KorAPUrl =
109#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +0200110#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200111#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200112#' new("KorAPConnection", verbose=TRUE) %>%
113#' { . ->> kco } %>%
114#' corpusQuery("Ameisenplage") %>%
115#' fetchAll() %>%
116#' slot("collectedMatches") %>%
117#' mutate(year = lubridate::year(pubDate)) %>%
118#' dplyr::select(year) %>%
119#' group_by(year) %>%
120#' summarise(Count = n()) %>%
121#' mutate(Freq = mapply(function(f, y)
122#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
123#' dplyr::select(-Count) %>%
124#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
125#' plot(type = "l")
Marc Kupietz37b8ef12019-09-16 18:37:49 +0200126#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200127#' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +0200128#'
129#' @references
130#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
131#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200132#' @aliases corpusQuery
Marc Kupietz632cbd42019-09-06 16:04:51 +0200133#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200134setMethod("corpusQuery", "KorAPConnection",
135 function(kco, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
Marc Kupietz5a519822019-09-20 21:43:52 +0200136 accessRewriteFatal = TRUE, verbose = kco@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200137 defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
138 "availability", "textClass", "snippet")
139 contentFields <- c("snippet")
Marc Kupietz36d12d92019-09-27 18:13:27 +0200140 fields <- fields[!fields %in% contentFields]
Marc Kupietze95108e2019-09-18 13:23:58 +0200141
142 if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
143 stop("Exactly one of the parameters query and KorAPUrl must be specified.")
144 }
145 if (missing(query)) {
146 query <- QueryParameterFromUrl(KorAPUrl, "q")
Marc Kupietz468cab82019-09-30 11:16:51 +0200147 vc <- QueryParameterFromUrl(KorAPUrl, "cq")
Marc Kupietze95108e2019-09-18 13:23:58 +0200148 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
149 }
150 request <- paste0('?q=', URLencode(query, reserved=TRUE),
151 ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
152 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
153 requestUrl <- paste0(kco@apiUrl, 'search', request,
Marc Kupietz36d12d92019-09-27 18:13:27 +0200154 '&fields=', paste(fields, collapse = ","),
Marc Kupietze95108e2019-09-18 13:23:58 +0200155 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
156 if (verbose) {
Marc Kupietz56456c62019-09-18 21:45:14 +0200157 cat("Searching \"", query, "\" in \"", vc, "\"", sep="")
Marc Kupietze95108e2019-09-18 13:23:58 +0200158 }
Marc Kupietzdb9ab042019-09-26 12:26:36 +0200159 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietz56456c62019-09-18 21:45:14 +0200160 if (verbose) {
161 cat(" took ", res$meta$benchmark, "\n", sep="")
162 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200163 KorAPQuery(
Marc Kupietzb8972182019-09-20 21:33:46 +0200164 korapConnection = kco,
Marc Kupietze95108e2019-09-18 13:23:58 +0200165 nextStartIndex = 0,
Marc Kupietz36d12d92019-09-27 18:13:27 +0200166 fields = fields,
Marc Kupietze95108e2019-09-18 13:23:58 +0200167 requestUrl = requestUrl,
168 request = request,
169 totalResults = res$meta$totalResults,
170 vc = vc,
171 apiResponse = res,
172 webUIRequestUrl = webUIRequestUrl,
173 hasMoreMatches = (res$meta$totalResults > 0),
174 )
175 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200176
Marc Kupietz62da2b52019-09-12 17:43:34 +0200177#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200178#'
179#' @param kqo object obtained from \code{\link{corpusQuery}}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200180#' @param offset start offset for query results to fetch
181#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200182#' @param verbose print progress information if true
Marc Kupietze95108e2019-09-18 13:23:58 +0200183#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200184#'
185#' @references
186#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
187#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200188#' @aliases fetchNext
189#' @rdname KorAPQuery-class
Marc Kupietz632cbd42019-09-06 16:04:51 +0200190#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200191setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200192 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
193 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200194 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200195
196 page <- 1
197 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200198 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200199 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200200
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200201 repeat {
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200202 res <- apiCall(kqo@korapConnection, paste0(kqo@requestUrl, '&count=', min(ifelse(!is.na(maxFetch), maxFetch - results, maxResultsPerPage), maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietze95108e2019-09-18 13:23:58 +0200203 if (res$meta$totalResults == 0) { return(kqo) }
204 for (field in kqo@fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200205 if (!field %in% colnames(res$matches)) {
206 res$matches[, field] <- NA
207 }
208 }
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200209 currentMatches <-
210 kqo@fields %>%
211 map_dfr( ~tibble(!!.x := logical() ) ) %>%
212 bind_rows(res$matches) %>%
213 select(kqo@fields)
Marc Kupietz36d12d92019-09-27 18:13:27 +0200214 if ("pubDate" %in% kqo@fields) {
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200215 currentMatches$pubDate <- currentMatches$pubDate %>% as.Date(format = "%Y-%m-%d")
216 factorCols <- currentMatches %>% select(-pubDate) %>% colnames()
Marc Kupietz36d12d92019-09-27 18:13:27 +0200217 } else {
218 factorCols <- colnames(currentMatches)
219 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200220 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200221 if (!is.list(collectedMatches)) {
222 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200223 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200224 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200225 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200226 if (verbose) {
Marc Kupietzf6f71312019-09-23 18:35:27 +0200227 cat(paste0("Retrieved page ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200228 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200229 page <- page + 1
230 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200231 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200232 break
233 }
234 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200235 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, res$meta$totalResults)
236 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200237 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200238 fields = kqo@fields,
239 requestUrl = kqo@requestUrl,
240 request = kqo@request,
241 totalResults = res$meta$totalResults,
242 vc = kqo@vc,
243 webUIRequestUrl = kqo@webUIRequestUrl,
244 hasMoreMatches = (res$meta$totalResults > nextStartIndex),
245 apiResponse = res,
246 collectedMatches = collectedMatches)
247})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200248
249#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200250#'
251#' @examples
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200252#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200253#' q@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200254#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200255#' @aliases fetchAll
256#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200257#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200258setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200259 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
260})
261
262#' Fetches the remaining results of a KorAP query.
263#'
264#' @examples
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200265#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200266#' q@collectedMatches
267#'
268#' @aliases fetchRest
269#' @rdname KorAPQuery-class
270#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200271setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200272 return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
273})
274
275#´ format()
276#' @rdname KorAPQuery-class
277#' @param x KorAPQuery object
278#' @param ... further arguments passed to or from other methods
279#' @export
280format.KorAPQuery <- function(x, ...) {
281 cat("<KorAPQuery>\n")
282 q <- x
283 aurl = parse_url(q@request)
284 cat(" Query: ", aurl$query$q, "\n")
285 if (!is.null(aurl$query$vc) && aurl$query$vc != "") {
286 cat("Virtual corpus: ", aurl$query$vc, "\n")
287 }
288 if (!is.null(q@collectedMatches)) {
289 cat("==============================================================================================================", "\n")
290 print(summary(q@collectedMatches))
291 cat("==============================================================================================================", "\n")
292 }
293 cat(" Total results: ", q@totalResults, "\n")
294 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200295}
296
Marc Kupietze95108e2019-09-18 13:23:58 +0200297#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200298#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200299#' @rdname KorAPQuery-class
300#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200301#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200302setMethod("show", "KorAPQuery", function(object) {
303 format(object)
304})