blob: 79b589cf58148c6cde1a7a340301463a94224a82 [file] [log] [blame]
#' @import jsonlite
#' @import curl
3
# Metadata fields requested for every match unless the caller passes its own
# `fields` to KorAPQuery().
defaultFields <- c(
  "corpusSigle", "textSigle", "pubDate", "pubPlace",
  "availability", "textClass", "snippet"
)

# Fields that KorAPQuery() removes from the list of columns kept per match.
contentFields <- "snippet"

# Upper bound for the `count` parameter of a single search request.
maxResultsPerPage <- 50
10
# Extract the raw (still URL-encoded) value of one query parameter from a URL.
#
# url:       the URL string to inspect
# parameter: the parameter name; it is matched literally
#
# Returns the text between "<parameter>=" and the next "&" (or the end of the
# string), or "" when the parameter does not occur in the URL.
QueryParameterFromUrl <- function(url, parameter) {
  # Escape regex metacharacters so that e.g. "q.l" cannot match "qxl=".
  literalName <- gsub("([.|()\\\\^{}+$*?\\[\\]])", "\\\\\\1", parameter, perl = TRUE)
  regex <- paste0(".*[?&]", literalName, "=([^&]*).*")
  # Use the same regex engine for the test and for the extraction.
  if (!grepl(regex, url, perl = TRUE)) {
    return("")
  }
  sub(regex, "\\1", url, perl = TRUE)
}
19
# Extract and decode the value of the "q" parameter from a KorAP URL.
#
# KorAPUrl: a KorAP query URL, e.g. copied from the browser
#
# Returns the decoded query string, or "" when the URL has no "q" parameter
# (instead of echoing back the whole URL).
KorAPQueryStringFromUrl <- function(KorAPUrl) {
  regex <- ".*[?&]q=([^&]*).*"
  if (!grepl(regex, KorAPUrl, perl = TRUE)) {
    return("")
  }
  rawQuery <- sub(regex, "\\1", KorAPUrl, perl = TRUE)
  # Query strings are form-encoded: "+" stands for a space; a literal plus
  # sign arrives as "%2B" and is restored by URLdecode() afterwards.
  URLdecode(gsub("+", " ", rawQuery, fixed = TRUE))
}
23
#' Send a query to a KorAP connection.
#'
#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Exactly one of \code{query} and \code{KorAPUrl} must be provided.
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query. The query (\code{q}), the virtual corpus (\code{vc}, or \code{cq} as used in browser URLs) and the query language (\code{ql}) are taken from the URL; if the URL has no \code{ql} parameter, the \code{ql} argument is used.
#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values).
#' @param fields (meta)data fields that will be fetched for every match
#' @param accessRewriteFatal currently not evaluated by this function
#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}).
#' A corresponding URL to be used within a web browser is contained in \code{$webUIRequestUrl}.
#' Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
#'
#' @examples
#' con <- KorAPConnection()
#' q <- KorAPQuery(con, "Ameisenplage")
#' q <- KorAPFetchAll(q)
#' summary(q$collectedMatches)
#'
#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
#' q$meta$totalResults
#'
#' @seealso \code{\link{KorAPConnection}}, \code{\link{KorAPFetchNext}}, \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}, \code{\link{KorAPCorpusStats}}
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPQuery <- function(con, query, vc = "", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
                       accessRewriteFatal = TRUE) {
  # Both missing or both given is an error: exactly one source for the query.
  if (missing(query) == missing(KorAPUrl)) {
    stop("Exactly one of the parameters query and KorAPUrl must be specified.")
  }
  if (missing(query)) {
    # Values copied from a browser URL are form-encoded ("+" for space,
    # "%XX" escapes); decode them so they are encoded exactly once below.
    urlParameter <- function(name) {
      URLdecode(gsub("+", " ", QueryParameterFromUrl(KorAPUrl, name), fixed = TRUE))
    }
    query <- urlParameter("q")
    vc <- urlParameter("vc")
    if (!nzchar(vc)) {
      # Browser URLs carry the corpus query as "cq" (see the example above).
      vc <- urlParameter("cq")
    }
    urlQl <- urlParameter("ql")
    if (nzchar(urlQl)) {
      # Only override the default when the URL actually names a language.
      ql <- urlQl
    }
  }

  vcPart <- if (nzchar(vc)) paste0("&vc=", URLencode(vc, reserved = TRUE)) else ""
  request <- paste0("?q=", URLencode(query, reserved = TRUE), vcPart, "&ql=", ql)
  webUIRequestUrl <- paste0(con$KorAPUrl, request)

  rewritePart <- if (metadataOnly) "&access-rewrite-disabled=true" else ""
  # Request the fields the caller asked for, not unconditionally the defaults.
  requestUrl <- paste0(
    con$apiUrl, "search", request,
    "&fields=", paste(fields, collapse = ","),
    rewritePart
  )

  # A single-result probe is enough to learn the total number of results.
  result <- fromJSON(paste0(requestUrl, "&count=1"))
  result$fields <- fields[!fields %in% contentFields]
  result$requestUrl <- requestUrl
  result$request <- request
  result$vc <- vc
  result$webUIRequestUrl <- webUIRequestUrl
  result$nextStartIndex <- 0
  result$hasMoreMatches <- (result$meta$totalResults > 0)
  result
}
78
#' Fetch the next bunch of results of a KorAP query.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}}
#' @param offset start offset for query results to fetch
#' @param maxFetch maximum number of query results to fetch; \code{NA} means no limit
#' @param verbose if \code{TRUE}, print one progress line per retrieved page
#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
#'
#' @examples
#' q <- KorAPFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage"))
#'
#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPFetchNext <- function(queryObject, offset = queryObject$nextStartIndex, maxFetch = maxResultsPerPage, verbose = FALSE) {
  if (queryObject$meta$totalResults == 0 || offset >= queryObject$meta$totalResults) {
    return(queryObject)
  }

  # paste0() would render large numbers such as 100000 as "1e+05".
  plainNumber <- function(x) format(x, scientific = FALSE, trim = TRUE)

  page <- 1
  results <- 0

  # Collect pages in a list and bind them once after the loop.
  pages <- list()
  if (is.list(queryObject$collectedMatches)) {
    pages[[1L]] <- queryObject$collectedMatches
  }

  repeat {
    remaining <- if (is.na(maxFetch)) maxResultsPerPage else maxFetch - results
    pageSize <- min(remaining, maxResultsPerPage)
    res <- fromJSON(paste0(
      queryObject$requestUrl,
      "&count=", plainNumber(pageSize),
      "&offset=", plainNumber(offset + results)
    ))
    if (res$meta$totalResults == 0) {
      # Keep the documented return type instead of an empty data frame.
      return(queryObject)
    }

    # Make sure every expected column exists, even if the response lacks it.
    for (field in queryObject$fields) {
      if (!field %in% colnames(res$matches)) {
        res$matches[, field] <- NA
      }
    }
    currentMatches <- res$matches[queryObject$fields]

    # Everything but pubDate becomes a factor; pubDate (if requested at all)
    # becomes a Date.
    factorCols <- setdiff(colnames(currentMatches), "pubDate")
    if (length(factorCols) > 0) {
      currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
    }
    if ("pubDate" %in% colnames(currentMatches)) {
      currentMatches$pubDate <- as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
    }
    pages[[length(pages) + 1L]] <- currentMatches

    if (verbose) {
      cat(paste0("Retrieved page: ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
    }
    page <- page + 1
    results <- results + res$meta$itemsPerPage
    if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
      break
    }
  }

  # The last response becomes the new query object; carry over the fields
  # that KorAPQuery() attached to the original one.
  res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
  res$fields <- queryObject$fields
  res$requestUrl <- queryObject$requestUrl
  res$request <- queryObject$request
  res$vc <- queryObject$vc
  res$webUIRequestUrl <- queryObject$webUIRequestUrl
  res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
  res$collectedMatches <- do.call(rbind, pages)
  res
}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200140
#' Fetch all results of a KorAP query.
#'
#' Fetching always starts at the first result. Matches collected by earlier
#' fetches on the same query object are discarded first, so that no result
#' appears twice in \code{$collectedMatches}.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}}
#' @param verbose passed on to \code{\link{KorAPFetchNext}}
#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
#'
#' @examples
#' q <- KorAPFetchAll(KorAPQuery(KorAPConnection(), "Ameisenplage"))
#' q$collectedMatches
#'
#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchNext}}
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPFetchAll <- function(queryObject, verbose = FALSE) {
  # KorAPFetchNext() appends to $collectedMatches; since we restart at
  # offset 0, previously collected pages would otherwise be duplicated.
  queryObject$collectedMatches <- NULL
  KorAPFetchNext(queryObject, offset = 0, maxFetch = NA, verbose = verbose)
}
159
#' Fetch all remaining results of a KorAP query.
#'
#' Continues at \code{queryObject$nextStartIndex} (the default offset of
#' \code{\link{KorAPFetchNext}}) without a limit on the number of results.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}}
#' @param verbose passed on to \code{\link{KorAPFetchNext}}
#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
#'
#' @examples
#' q <- KorAPFetchRest(KorAPFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage")))
#' q$collectedMatches
#'
#' @seealso \code{\link{KorAPFetchAll}}, \code{\link{KorAPFetchNext}}
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPFetchRest <- function(queryObject, verbose = FALSE) {
  # maxFetch = NA lifts the per-call limit; the offset is left at its default.
  KorAPFetchNext(queryObject, maxFetch = NA, verbose = verbose)
}