blob: 8081f01f09f8d4d0130d47bb9792f1ae73ca436d [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
3#' \code{KorAPQuery} objetcs represent the current state of a query to a KorAP server.
4#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
5#'
6#' @include KorAPConnection.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02007#' @import jsonlite
Marc Kupietze95108e2019-09-18 13:23:58 +02008#' @import httr
9#'
10#'
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
14 "request",
15 "vc",
16 "totalResults",
17 "nextStartIndex",
18 "fields",
19 "requestUrl",
20 "webUIRequestUrl",
21 "apiResponse",
22 "collectedMatches",
23 "hasMoreMatches"
24))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020025
Marc Kupietze95108e2019-09-18 13:23:58 +020026#' Method initialize
27#'
28#' @rdname KorAPQuery-class
29#' @param .Object …
30#' @param request query part of the request URL
31#' @param vc definition of a virtual corpus
32#' @param totalResults number of hits the query has yielded
33#' @param nextStartIndex at what index to start the next fetch of query results
34#' @param fields what data / metadata fields should be collected
35#' @param requestUrl complete URL of the API request
36#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
37#' @param apiResponse data-frame representation of the JSON response of the API request
38#' @param hasMoreMatches boolean that signals if more query results can be fetched
39#' @param collectedMatches matches already fetched from the KorAP-API-server
40#' @export
41setMethod("initialize", "KorAPQuery",
42 function(.Object, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
43 "availability", "textClass", "snippet"),
44 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
45 .Object <- callNextMethod()
46 .Object@request = request
47 .Object@vc = vc
48 .Object@totalResults = totalResults
49 .Object@nextStartIndex = nextStartIndex
50 .Object@fields = fields
51 .Object@requestUrl = requestUrl
52 .Object@webUIRequestUrl = webUIRequestUrl
53 .Object@apiResponse = apiResponse
54 .Object@hasMoreMatches = hasMoreMatches
55 .Object@collectedMatches = collectedMatches
56 .Object
57 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020058
Marc Kupietze95108e2019-09-18 13:23:58 +020059setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
60setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
61setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
62setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
63
64maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020065
Marc Kupietz632cbd42019-09-06 16:04:51 +020066QueryParameterFromUrl <- function(url, parameter) {
67 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
68 if (grepl(regex, url)) {
69 return(gsub(regex, '\\1', url, perl = TRUE))
70 } else {
71 return("")
72 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020073}
74
Marc Kupietz632cbd42019-09-06 16:04:51 +020075KorAPQueryStringFromUrl <- function(KorAPUrl) {
76 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
77}
78
Marc Kupietze95108e2019-09-18 13:23:58 +020079#' Method corpusQuery
80#'
81#' Perform a corpus query via a connection to a KorAP-API-server.
82#'
83#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
Marc Kupietz632cbd42019-09-06 16:04:51 +020084#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
85#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
86#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020087#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020088#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020089#' @param fields (meta)data fields that will be fetched for every match
Marc Kupietz25aebc32019-09-16 18:40:50 +020090#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficent rights (not yet implemented)
91#' @param verbose print some info
Marc Kupietze95108e2019-09-18 13:23:58 +020092#' @return A \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
93#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
Marc Kupietz62da2b52019-09-12 17:43:34 +020094#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020095#'
96#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +020097#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietze95108e2019-09-18 13:23:58 +020098#' kco <- new("KorAPConnection")
99#' kqo <- corpusQuery(kco, "Ameisenplage")
100#' kqo <- fetchAll(kqo)
101#' kqo
Marc Kupietz3c531f62019-09-13 12:17:24 +0200102#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200103#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
104#' # and show the number of query hits (but don't fetch them).
105#' kco <- new("KorAPConnection")
Marc Kupietze95108e2019-09-18 13:23:58 +0200106#' kqo <- corpusQuery(kco,
Marc Kupietz37b8ef12019-09-16 18:37:49 +0200107#' KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietze95108e2019-09-18 13:23:58 +0200108#' kqo
Marc Kupietz3c531f62019-09-13 12:17:24 +0200109#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200110#' # Plot the time/frequency curve of "Ameisenplage"
111#' kco <- new("KorAPConnection")
Marc Kupietze95108e2019-09-18 13:23:58 +0200112#' q <- corpusQuery(kco, "Ameisenplage")
113#' q <- fetchAll(q, verbose=TRUE)
114#' tokensPerYear <- function(year) { return(corpusStats(kco, paste("pubDate in", year))@tokens) }
115#' df <- as.data.frame(table(as.numeric(format(q@collectedMatches$pubDate,"%Y")), dnn="year"),
Marc Kupietz37b8ef12019-09-16 18:37:49 +0200116#' stringsAsFactors = FALSE)
117#' df$ipm <- 1000000 * df$Freq / tokensPerYear(df$year)
118#' plot(df$year, df$ipm, type="l")
119#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200120#' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +0200121#'
122#' @references
123#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
124#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200125#' @aliases corpusQuery
Marc Kupietz632cbd42019-09-06 16:04:51 +0200126#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200127setMethod("corpusQuery", "KorAPConnection",
128 function(kco, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
129 accessRewriteFatal = TRUE, verbose=FALSE) {
130 defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
131 "availability", "textClass", "snippet")
132 contentFields <- c("snippet")
133
134 if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
135 stop("Exactly one of the parameters query and KorAPUrl must be specified.")
136 }
137 if (missing(query)) {
138 query <- QueryParameterFromUrl(KorAPUrl, "q")
139 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
140 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
141 }
142 request <- paste0('?q=', URLencode(query, reserved=TRUE),
143 ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
144 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
145 requestUrl <- paste0(kco@apiUrl, 'search', request,
146 '&fields=', paste(defaultFields, collapse = ","),
147 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
148 if (verbose) {
Marc Kupietz56456c62019-09-18 21:45:14 +0200149 cat("Searching \"", query, "\" in \"", vc, "\"", sep="")
Marc Kupietze95108e2019-09-18 13:23:58 +0200150 }
151 res = fromJSON(paste0(requestUrl, '&count=1'))
Marc Kupietz56456c62019-09-18 21:45:14 +0200152 if (verbose) {
153 cat(" took ", res$meta$benchmark, "\n", sep="")
154 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200155 KorAPQuery(
156 nextStartIndex = 0,
157 fields = fields[!fields %in% contentFields],
158 requestUrl = requestUrl,
159 request = request,
160 totalResults = res$meta$totalResults,
161 vc = vc,
162 apiResponse = res,
163 webUIRequestUrl = webUIRequestUrl,
164 hasMoreMatches = (res$meta$totalResults > 0),
165 )
166 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200167
Marc Kupietz62da2b52019-09-12 17:43:34 +0200168#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200169#'
170#' @param kqo object obtained from \code{\link{corpusQuery}}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200171#' @param offset start offset for query results to fetch
172#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200173#' @param verbose print progress information if true
Marc Kupietze95108e2019-09-18 13:23:58 +0200174#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200175#'
176#' @references
177#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
178#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200179#' @aliases fetchNext
180#' @rdname KorAPQuery-class
Marc Kupietz632cbd42019-09-06 16:04:51 +0200181#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200182setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = FALSE) {
183 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
184 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200185 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200186
187 page <- 1
188 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200189 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200190 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200191
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200192 repeat {
Marc Kupietze95108e2019-09-18 13:23:58 +0200193 res <- fromJSON(paste0(kqo@requestUrl, '&count=', min(ifelse(!is.na(maxFetch), maxFetch - results, maxResultsPerPage), maxResultsPerPage) ,'&offset=', offset + results))
194 if (res$meta$totalResults == 0) { return(kqo) }
195 for (field in kqo@fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200196 if (!field %in% colnames(res$matches)) {
197 res$matches[, field] <- NA
198 }
199 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200200 currentMatches <- res$matches[kqo@fields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200201 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
202 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
203 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200204 if (!is.list(collectedMatches)) {
205 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200206 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200207 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200208 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200209 if (verbose) {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200210 cat(paste0("Retrieved page: ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200211 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200212 page <- page + 1
213 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200214 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200215 break
216 }
217 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200218 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, res$meta$totalResults)
219 KorAPQuery(nextStartIndex = nextStartIndex,
220 fields = kqo@fields,
221 requestUrl = kqo@requestUrl,
222 request = kqo@request,
223 totalResults = res$meta$totalResults,
224 vc = kqo@vc,
225 webUIRequestUrl = kqo@webUIRequestUrl,
226 hasMoreMatches = (res$meta$totalResults > nextStartIndex),
227 apiResponse = res,
228 collectedMatches = collectedMatches)
229})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200230
231#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200232#'
233#' @examples
Marc Kupietze95108e2019-09-18 13:23:58 +0200234#' q <- fetchAll(corpusQuery(new("KorAPConnection"), "Ameisenplage"))
235#' q@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200236#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200237#' @aliases fetchAll
238#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200239#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200240setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = FALSE) {
241 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
242})
243
244#' Fetches the remaining results of a KorAP query.
245#'
246#' @examples
247#' q <- fetchRest(fetchNext(corpusQuery(new("KorAPConnection"), "Ameisenplage")))
248#' q@collectedMatches
249#'
250#' @aliases fetchRest
251#' @rdname KorAPQuery-class
252#' @export
253setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = FALSE) {
254 return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
255})
256
257#´ format()
258#' @rdname KorAPQuery-class
259#' @param x KorAPQuery object
260#' @param ... further arguments passed to or from other methods
261#' @export
262format.KorAPQuery <- function(x, ...) {
263 cat("<KorAPQuery>\n")
264 q <- x
265 aurl = parse_url(q@request)
266 cat(" Query: ", aurl$query$q, "\n")
267 if (!is.null(aurl$query$vc) && aurl$query$vc != "") {
268 cat("Virtual corpus: ", aurl$query$vc, "\n")
269 }
270 if (!is.null(q@collectedMatches)) {
271 cat("==============================================================================================================", "\n")
272 print(summary(q@collectedMatches))
273 cat("==============================================================================================================", "\n")
274 }
275 cat(" Total results: ", q@totalResults, "\n")
276 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200277}
278
Marc Kupietze95108e2019-09-18 13:23:58 +0200279#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200280#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200281#' @rdname KorAPQuery-class
282#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200283#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200284setMethod("show", "KorAPQuery", function(object) {
285 format(object)
286})