blob: 723ef3e14aa3429f0f4cd7f9d7f9f3665058a198 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
3#' \code{KorAPQuery} objetcs represent the current state of a query to a KorAP server.
4#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
5#'
6#' @include KorAPConnection.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02007#' @import jsonlite
Marc Kupietze95108e2019-09-18 13:23:58 +02008#' @import httr
9#'
10#'
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietze95108e2019-09-18 13:23:58 +020027#' Method initialize
28#'
29#' @rdname KorAPQuery-class
30#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
42#' @export
43setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020044 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietze95108e2019-09-18 13:23:58 +020045 "availability", "textClass", "snippet"),
46 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
47 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020048 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020049 .Object@request = request
50 .Object@vc = vc
51 .Object@totalResults = totalResults
52 .Object@nextStartIndex = nextStartIndex
53 .Object@fields = fields
54 .Object@requestUrl = requestUrl
55 .Object@webUIRequestUrl = webUIRequestUrl
56 .Object@apiResponse = apiResponse
57 .Object@hasMoreMatches = hasMoreMatches
58 .Object@collectedMatches = collectedMatches
59 .Object
60 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020061
Marc Kupietze95108e2019-09-18 13:23:58 +020062setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
63setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
64setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
65setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
66
67maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020068
Marc Kupietz632cbd42019-09-06 16:04:51 +020069QueryParameterFromUrl <- function(url, parameter) {
70 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
71 if (grepl(regex, url)) {
72 return(gsub(regex, '\\1', url, perl = TRUE))
73 } else {
74 return("")
75 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020076}
77
Marc Kupietz632cbd42019-09-06 16:04:51 +020078KorAPQueryStringFromUrl <- function(KorAPUrl) {
79 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
80}
81
Marc Kupietze95108e2019-09-18 13:23:58 +020082#' Method corpusQuery
83#'
84#' Perform a corpus query via a connection to a KorAP-API-server.
85#'
86#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
Akron5e135462019-09-27 16:31:38 +020087#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz632cbd42019-09-06 16:04:51 +020088#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
89#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietz7776dec2019-09-27 16:59:02 +020090#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020091#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020092#' @param fields (meta)data fields that will be fetched for every match.
93#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficent rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020094#' @param verbose print some info
Marc Kupietze95108e2019-09-18 13:23:58 +020095#' @return A \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
96#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
Marc Kupietz62da2b52019-09-12 17:43:34 +020097#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020098#'
99#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +0200100#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietze95108e2019-09-18 13:23:58 +0200101#' kco <- new("KorAPConnection")
102#' kqo <- corpusQuery(kco, "Ameisenplage")
103#' kqo <- fetchAll(kqo)
104#' kqo
Marc Kupietz3c531f62019-09-13 12:17:24 +0200105#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200106#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
107#' # and show the number of query hits (but don't fetch them).
108#' kco <- new("KorAPConnection")
Marc Kupietze95108e2019-09-18 13:23:58 +0200109#' kqo <- corpusQuery(kco,
Marc Kupietz37b8ef12019-09-16 18:37:49 +0200110#' KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietze95108e2019-09-18 13:23:58 +0200111#' kqo
Marc Kupietz3c531f62019-09-13 12:17:24 +0200112#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200113#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz24169052019-09-29 17:52:10 +0200114#' kco <- new("KorAPConnection", verbose=TRUE)
115#' q <- fetchAll(corpusQuery(kco, "Ameisenplage"))
Marc Kupietze95108e2019-09-18 13:23:58 +0200116#' df <- as.data.frame(table(as.numeric(format(q@collectedMatches$pubDate,"%Y")), dnn="year"),
Marc Kupietz37b8ef12019-09-16 18:37:49 +0200117#' stringsAsFactors = FALSE)
Marc Kupietz24169052019-09-29 17:52:10 +0200118#' df$Freq <- mapply(function(f, y) f / corpusStats(kco, paste("pubDate in", y))@tokens,
119#' df$Freq, df$year)
120#' df <- merge(data.frame(year=min(df$year):max(df$year)), df, all = TRUE)
121#' df[is.na(df$Freq),]$Freq <- 0
122#' plot(df, type="l")
Marc Kupietz37b8ef12019-09-16 18:37:49 +0200123#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200124#' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +0200125#'
126#' @references
127#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
128#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200129#' @aliases corpusQuery
Marc Kupietz632cbd42019-09-06 16:04:51 +0200130#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200131setMethod("corpusQuery", "KorAPConnection",
132 function(kco, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
Marc Kupietz5a519822019-09-20 21:43:52 +0200133 accessRewriteFatal = TRUE, verbose = kco@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200134 defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
135 "availability", "textClass", "snippet")
136 contentFields <- c("snippet")
Marc Kupietz36d12d92019-09-27 18:13:27 +0200137 fields <- fields[!fields %in% contentFields]
Marc Kupietze95108e2019-09-18 13:23:58 +0200138
139 if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
140 stop("Exactly one of the parameters query and KorAPUrl must be specified.")
141 }
142 if (missing(query)) {
143 query <- QueryParameterFromUrl(KorAPUrl, "q")
144 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
145 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
146 }
147 request <- paste0('?q=', URLencode(query, reserved=TRUE),
148 ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
149 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
150 requestUrl <- paste0(kco@apiUrl, 'search', request,
Marc Kupietz36d12d92019-09-27 18:13:27 +0200151 '&fields=', paste(fields, collapse = ","),
Marc Kupietze95108e2019-09-18 13:23:58 +0200152 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
153 if (verbose) {
Marc Kupietz56456c62019-09-18 21:45:14 +0200154 cat("Searching \"", query, "\" in \"", vc, "\"", sep="")
Marc Kupietze95108e2019-09-18 13:23:58 +0200155 }
Marc Kupietzdb9ab042019-09-26 12:26:36 +0200156 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietz56456c62019-09-18 21:45:14 +0200157 if (verbose) {
158 cat(" took ", res$meta$benchmark, "\n", sep="")
159 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200160 KorAPQuery(
Marc Kupietzb8972182019-09-20 21:33:46 +0200161 korapConnection = kco,
Marc Kupietze95108e2019-09-18 13:23:58 +0200162 nextStartIndex = 0,
Marc Kupietz36d12d92019-09-27 18:13:27 +0200163 fields = fields,
Marc Kupietze95108e2019-09-18 13:23:58 +0200164 requestUrl = requestUrl,
165 request = request,
166 totalResults = res$meta$totalResults,
167 vc = vc,
168 apiResponse = res,
169 webUIRequestUrl = webUIRequestUrl,
170 hasMoreMatches = (res$meta$totalResults > 0),
171 )
172 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200173
Marc Kupietz62da2b52019-09-12 17:43:34 +0200174#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200175#'
176#' @param kqo object obtained from \code{\link{corpusQuery}}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200177#' @param offset start offset for query results to fetch
178#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200179#' @param verbose print progress information if true
Marc Kupietze95108e2019-09-18 13:23:58 +0200180#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200181#'
182#' @references
183#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
184#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200185#' @aliases fetchNext
186#' @rdname KorAPQuery-class
Marc Kupietz632cbd42019-09-06 16:04:51 +0200187#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200188setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200189 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
190 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200191 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200192
193 page <- 1
194 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200195 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200196 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200197
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200198 repeat {
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200199 res <- apiCall(kqo@korapConnection, paste0(kqo@requestUrl, '&count=', min(ifelse(!is.na(maxFetch), maxFetch - results, maxResultsPerPage), maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietze95108e2019-09-18 13:23:58 +0200200 if (res$meta$totalResults == 0) { return(kqo) }
201 for (field in kqo@fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200202 if (!field %in% colnames(res$matches)) {
203 res$matches[, field] <- NA
204 }
205 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200206 currentMatches <- res$matches[kqo@fields]
Marc Kupietz36d12d92019-09-27 18:13:27 +0200207 if ("pubDate" %in% kqo@fields) {
208 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
209 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
210 } else {
211 factorCols <- colnames(currentMatches)
212 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200213 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200214 if (!is.list(collectedMatches)) {
215 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200216 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200217 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200218 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200219 if (verbose) {
Marc Kupietzf6f71312019-09-23 18:35:27 +0200220 cat(paste0("Retrieved page ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200221 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200222 page <- page + 1
223 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200224 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200225 break
226 }
227 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200228 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, res$meta$totalResults)
229 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200230 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200231 fields = kqo@fields,
232 requestUrl = kqo@requestUrl,
233 request = kqo@request,
234 totalResults = res$meta$totalResults,
235 vc = kqo@vc,
236 webUIRequestUrl = kqo@webUIRequestUrl,
237 hasMoreMatches = (res$meta$totalResults > nextStartIndex),
238 apiResponse = res,
239 collectedMatches = collectedMatches)
240})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200241
242#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200243#'
244#' @examples
Marc Kupietze95108e2019-09-18 13:23:58 +0200245#' q <- fetchAll(corpusQuery(new("KorAPConnection"), "Ameisenplage"))
246#' q@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200247#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200248#' @aliases fetchAll
249#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200250#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200251setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200252 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
253})
254
255#' Fetches the remaining results of a KorAP query.
256#'
257#' @examples
258#' q <- fetchRest(fetchNext(corpusQuery(new("KorAPConnection"), "Ameisenplage")))
259#' q@collectedMatches
260#'
261#' @aliases fetchRest
262#' @rdname KorAPQuery-class
263#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200264setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200265 return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
266})
267
268#´ format()
269#' @rdname KorAPQuery-class
270#' @param x KorAPQuery object
271#' @param ... further arguments passed to or from other methods
272#' @export
273format.KorAPQuery <- function(x, ...) {
274 cat("<KorAPQuery>\n")
275 q <- x
276 aurl = parse_url(q@request)
277 cat(" Query: ", aurl$query$q, "\n")
278 if (!is.null(aurl$query$vc) && aurl$query$vc != "") {
279 cat("Virtual corpus: ", aurl$query$vc, "\n")
280 }
281 if (!is.null(q@collectedMatches)) {
282 cat("==============================================================================================================", "\n")
283 print(summary(q@collectedMatches))
284 cat("==============================================================================================================", "\n")
285 }
286 cat(" Total results: ", q@totalResults, "\n")
287 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200288}
289
Marc Kupietze95108e2019-09-18 13:23:58 +0200290#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200291#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200292#' @rdname KorAPQuery-class
293#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200294#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200295setMethod("show", "KorAPQuery", function(object) {
296 format(object)
297})