blob: b70a2e266bb012dcb0a61aaedb6c644f30043a63 [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
Marc Kupietzd235a712019-09-16 18:31:33 +02003#' @import utils
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02004
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02005defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz632cbd42019-09-06 16:04:51 +02006 "availability", "textClass", "snippet")
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02007
Marc Kupietz632cbd42019-09-06 16:04:51 +02008contentFields <- c("snippet")
9
Marc Kupietz62da2b52019-09-12 17:43:34 +020010maxResultsPerPage <- 50;
11
Marc Kupietz632cbd42019-09-06 16:04:51 +020012QueryParameterFromUrl <- function(url, parameter) {
13 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
14 if (grepl(regex, url)) {
15 return(gsub(regex, '\\1', url, perl = TRUE))
16 } else {
17 return("")
18 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020019}
20
Marc Kupietz632cbd42019-09-06 16:04:51 +020021KorAPQueryStringFromUrl <- function(KorAPUrl) {
22 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
23}
24
Marc Kupietz62da2b52019-09-12 17:43:34 +020025#' Send a query to a KorAP connection.
Marc Kupietz632cbd42019-09-06 16:04:51 +020026#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
27#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
28#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
29#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020030#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020031#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020032#' @param fields (meta)data fields that will be fetched for every match
Marc Kupietz62da2b52019-09-12 17:43:34 +020033#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}).
34#' A correspunding URL to be used within a web browser is contained in \code{$webUIRequestUrl}
35#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020036#'
37#' @examples
Marc Kupietz3c531f62019-09-13 12:17:24 +020038#' con <- KorAPConnection()
Marc Kupietz632cbd42019-09-06 16:04:51 +020039#' q <- KorAPQuery(con, "Ameisenplage")
Marc Kupietz3c531f62019-09-13 12:17:24 +020040#' q <- KorAPFetchAll(q)
41#' summary(q$collectedMatches)
42#'
Marc Kupietz632cbd42019-09-06 16:04:51 +020043#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
Marc Kupietz3c531f62019-09-13 12:17:24 +020044#' q$meta$totalResults
45#'
46#' @seealso \code{\link{KorAPConnection}}, \code{\link{KorAPFetchNext}}, \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}, \code{\link{KorAPCorpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +020047#'
48#' @references
49#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
50#'
51#' @export
Marc Kupietzf568f3f2019-09-16 17:03:46 +020052KorAPQuery <- function(con, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
Marc Kupietz62da2b52019-09-12 17:43:34 +020053 accessRewriteFatal = TRUE) {
Marc Kupietzf568f3f2019-09-16 17:03:46 +020054 if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
Marc Kupietz62da2b52019-09-12 17:43:34 +020055 stop("Exactly one of the parameters query and KorAPUrl must be specified.")
Marc Kupietz632cbd42019-09-06 16:04:51 +020056 }
Marc Kupietzf568f3f2019-09-16 17:03:46 +020057 if (missing(query)) {
Marc Kupietz632cbd42019-09-06 16:04:51 +020058 query <- QueryParameterFromUrl(KorAPUrl, "q")
59 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
60 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
61 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020062 request <- paste0('?q=', URLencode(query, reserved=TRUE),
Marc Kupietz632cbd42019-09-06 16:04:51 +020063 ifelse(vc != '', paste0('&vc=', URLencode(vc, reserved=TRUE)), ''),
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020064 '&ql=', ql);
65 webUIRequestUrl <- paste0(con$KorAPUrl, request)
66 requestUrl <- paste0(con$apiUrl, 'search', request,
67 '&fields=', paste(defaultFields, collapse = ","),
Marc Kupietz632cbd42019-09-06 16:04:51 +020068 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020069 result <- fromJSON(paste0(requestUrl, '&count=1'))
Marc Kupietz62da2b52019-09-12 17:43:34 +020070 result$fields <- fields[!fields %in% contentFields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020071 result$requestUrl <- requestUrl
72 result$request <- request
Marc Kupietz632cbd42019-09-06 16:04:51 +020073 result$vc <- vc
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020074 result$webUIRequestUrl <- webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +020075 result$nextStartIndex <- 0
76 result$hasMoreMatches <- (result$meta$totalResults > 0)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020077 return(result)
78}
79
Marc Kupietz62da2b52019-09-12 17:43:34 +020080#' Fetch the next bunch of results of a KorAP query.
81#' @param queryObject object obtained from \code{\link{KorAPQuery}}
82#' @param offset start offset for query results to fetch
83#' @param maxFetch maximum number of query results to fetch
84#' @param verbose
85#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, , \code{$hasMoreMatches}
86#'
87#' @examples
88#' q <- KorapFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage"))
89#'
90#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}
91#'
92#' @references
93#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
94#'
Marc Kupietz632cbd42019-09-06 16:04:51 +020095#' @export
Marc Kupietz62da2b52019-09-12 17:43:34 +020096KorAPFetchNext <- function(queryObject, offset = queryObject$nextStartIndex, maxFetch = maxResultsPerPage, verbose = FALSE) {
97 if (queryObject$meta$totalResults == 0 || offset >= queryObject$meta$totalResults) {
98 return(queryObject)
99 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200100
101 page <- 1
102 results <- 0
103
Marc Kupietz62da2b52019-09-12 17:43:34 +0200104 collectedMatches <- queryObject$collectedMatches
105
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200106 repeat {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200107 res <- fromJSON(paste0(queryObject$requestUrl, '&count=', min(ifelse(!is.na(maxFetch), maxFetch - results, maxResultsPerPage), maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200108 if (res$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietzb3065522019-09-09 11:34:19 +0200109 for (field in queryObject$fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200110 if (!field %in% colnames(res$matches)) {
111 res$matches[, field] <- NA
112 }
113 }
Marc Kupietzb3065522019-09-09 11:34:19 +0200114 currentMatches <- res$matches[queryObject$fields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200115 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
116 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
117 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200118 if (!is.list(collectedMatches)) {
119 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200120 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200121 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200122 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200123 if (verbose) {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200124 cat(paste0("Retrieved page: ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200125 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200126 page <- page + 1
127 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200128 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200129 break
130 }
131 }
Marc Kupietzcb725f82019-08-30 18:04:57 +0200132 res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
Marc Kupietzb3065522019-09-09 11:34:19 +0200133 res$fields <- queryObject$fields
134 res$requestUrl <- queryObject$requestUrl
135 res$request <- queryObject$request
136 res$webUIRequestUrl <- queryObject$webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +0200137 res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200138 res$collectedMatches <- collectedMatches
Marc Kupietzcb725f82019-08-30 18:04:57 +0200139 return(res)
140}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200141
142#' Fetch all results of a KorAP query.
143#' @param queryObject object obtained from \code{\link{KorAPQuery}}
144#' @param verbose
145#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
146#'
147#' @examples
148#' q <- KorAPFetchAll(KorAPQuery(KorAPConnection(), "Ameisenplage"))
149#' q$collectedMatches
150#'
151#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchNext}}
152#'
153#' @references
154#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
155#'
156#' @export
157KorAPFetchAll <- function(queryObject, verbose = FALSE) {
158 return(KorAPFetchNext(queryObject, offset = 0, maxFetch = NA, verbose = verbose))
159}
160
161#' Fetches all remaining results of a KorAP query.
162#' @param queryObject object obtained from \code{\link{KorAPQuery}}
163#' @param verbose
164#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
165#'
166#' @examples
167#' q <- KorAPFetchRest(KorAPQueryNext(KorAPQuery(KorAPConnection(), "Ameisenplage")))
168#' q$collectedMatches
169#'
170#' @seealso \code{\link{KorAPFetchAll}}, \code{\link{KorAPFetchNext}}
171#'
172#' @references
173#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
174#'
175#' @export
176KorAPFetchRest <- function(queryObject, verbose = FALSE) {
177 return(KorAPFetchNext(queryObject, maxFetch = NA, verbose = verbose))
178}