blob: 3875bc24bde1f6f51781913230b366d34f4f5c3a [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
3
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02004defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz632cbd42019-09-06 16:04:51 +02005 "availability", "textClass", "snippet")
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02006
Marc Kupietz632cbd42019-09-06 16:04:51 +02007contentFields <- c("snippet")
8
Marc Kupietz62da2b52019-09-12 17:43:34 +02009maxResultsPerPage <- 50;
10
Marc Kupietz632cbd42019-09-06 16:04:51 +020011QueryParameterFromUrl <- function(url, parameter) {
12 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
13 if (grepl(regex, url)) {
14 return(gsub(regex, '\\1', url, perl = TRUE))
15 } else {
16 return("")
17 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020018}
19
Marc Kupietz632cbd42019-09-06 16:04:51 +020020KorAPQueryStringFromUrl <- function(KorAPUrl) {
21 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
22}
23
Marc Kupietz62da2b52019-09-12 17:43:34 +020024#' Send a query to a KorAP connection.
Marc Kupietz632cbd42019-09-06 16:04:51 +020025#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
26#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
27#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
28#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020029#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020030#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020031#' @param fields (meta)data fields that will be fetched for every match
Marc Kupietz62da2b52019-09-12 17:43:34 +020032#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}).
33#' A correspunding URL to be used within a web browser is contained in \code{$webUIRequestUrl}
34#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020035#'
36#' @examples
Marc Kupietz3c531f62019-09-13 12:17:24 +020037#' con <- KorAPConnection()
Marc Kupietz632cbd42019-09-06 16:04:51 +020038#' q <- KorAPQuery(con, "Ameisenplage")
Marc Kupietz3c531f62019-09-13 12:17:24 +020039#' q <- KorAPFetchAll(q)
40#' summary(q$collectedMatches)
41#'
Marc Kupietz632cbd42019-09-06 16:04:51 +020042#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
Marc Kupietz3c531f62019-09-13 12:17:24 +020043#' q$meta$totalResults
44#'
45#' @seealso \code{\link{KorAPConnection}}, \code{\link{KorAPFetchNext}}, \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}, \code{\link{KorAPCorpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +020046#'
47#' @references
48#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
49#'
50#' @export
Marc Kupietz9d97eb92019-09-13 12:03:52 +020051KorAPQuery <- function(con, query = NA, vc = NA, KorAPUrl = NA, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
Marc Kupietz62da2b52019-09-12 17:43:34 +020052 accessRewriteFatal = TRUE) {
Marc Kupietz632cbd42019-09-06 16:04:51 +020053 if (is.na(query) && is.na(KorAPUrl) || ! (is.na(query) || is.na(KorAPUrl))) {
Marc Kupietz62da2b52019-09-12 17:43:34 +020054 stop("Exactly one of the parameters query and KorAPUrl must be specified.")
Marc Kupietz632cbd42019-09-06 16:04:51 +020055 }
56 if (is.na(query)) {
57 query <- QueryParameterFromUrl(KorAPUrl, "q")
58 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
59 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
60 }
Marc Kupietz7d88e2e2019-09-07 21:07:40 +020061 if (is.na(vc)) {
62 vc <- ""
63 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020064 request <- paste0('?q=', URLencode(query, reserved=TRUE),
Marc Kupietz632cbd42019-09-06 16:04:51 +020065 ifelse(vc != '', paste0('&vc=', URLencode(vc, reserved=TRUE)), ''),
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020066 '&ql=', ql);
67 webUIRequestUrl <- paste0(con$KorAPUrl, request)
68 requestUrl <- paste0(con$apiUrl, 'search', request,
69 '&fields=', paste(defaultFields, collapse = ","),
Marc Kupietz632cbd42019-09-06 16:04:51 +020070 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020071 result <- fromJSON(paste0(requestUrl, '&count=1'))
Marc Kupietz62da2b52019-09-12 17:43:34 +020072 result$fields <- fields[!fields %in% contentFields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020073 result$requestUrl <- requestUrl
74 result$request <- request
Marc Kupietz632cbd42019-09-06 16:04:51 +020075 result$vc <- vc
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020076 result$webUIRequestUrl <- webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +020077 result$nextStartIndex <- 0
78 result$hasMoreMatches <- (result$meta$totalResults > 0)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020079 return(result)
80}
81
Marc Kupietz62da2b52019-09-12 17:43:34 +020082#' Fetch the next bunch of results of a KorAP query.
83#' @param queryObject object obtained from \code{\link{KorAPQuery}}
84#' @param offset start offset for query results to fetch
85#' @param maxFetch maximum number of query results to fetch
86#' @param verbose
87#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, , \code{$hasMoreMatches}
88#'
89#' @examples
90#' q <- KorapFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage"))
91#'
92#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}
93#'
94#' @references
95#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
96#'
Marc Kupietz632cbd42019-09-06 16:04:51 +020097#' @export
Marc Kupietz62da2b52019-09-12 17:43:34 +020098KorAPFetchNext <- function(queryObject, offset = queryObject$nextStartIndex, maxFetch = maxResultsPerPage, verbose = FALSE) {
99 if (queryObject$meta$totalResults == 0 || offset >= queryObject$meta$totalResults) {
100 return(queryObject)
101 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200102
103 page <- 1
104 results <- 0
105
Marc Kupietz62da2b52019-09-12 17:43:34 +0200106 collectedMatches <- queryObject$collectedMatches
107
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200108 repeat {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200109 res <- fromJSON(paste0(queryObject$requestUrl, '&count=', min(ifelse(!is.na(maxFetch), maxFetch - results, maxResultsPerPage), maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200110 if (res$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietzb3065522019-09-09 11:34:19 +0200111 for (field in queryObject$fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200112 if (!field %in% colnames(res$matches)) {
113 res$matches[, field] <- NA
114 }
115 }
Marc Kupietzb3065522019-09-09 11:34:19 +0200116 currentMatches <- res$matches[queryObject$fields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200117 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
118 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
119 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200120 if (!is.list(collectedMatches)) {
121 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200122 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200123 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200124 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200125 if (verbose) {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200126 cat(paste0("Retrieved page: ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200127 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200128 page <- page + 1
129 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200130 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200131 break
132 }
133 }
Marc Kupietzcb725f82019-08-30 18:04:57 +0200134 res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
Marc Kupietzb3065522019-09-09 11:34:19 +0200135 res$fields <- queryObject$fields
136 res$requestUrl <- queryObject$requestUrl
137 res$request <- queryObject$request
138 res$webUIRequestUrl <- queryObject$webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +0200139 res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200140 res$collectedMatches <- collectedMatches
Marc Kupietzcb725f82019-08-30 18:04:57 +0200141 return(res)
142}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200143
144#' Fetch all results of a KorAP query.
145#' @param queryObject object obtained from \code{\link{KorAPQuery}}
146#' @param verbose
147#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
148#'
149#' @examples
150#' q <- KorAPFetchAll(KorAPQuery(KorAPConnection(), "Ameisenplage"))
151#' q$collectedMatches
152#'
153#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchNext}}
154#'
155#' @references
156#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
157#'
158#' @export
159KorAPFetchAll <- function(queryObject, verbose = FALSE) {
160 return(KorAPFetchNext(queryObject, offset = 0, maxFetch = NA, verbose = verbose))
161}
162
163#' Fetches all remaining results of a KorAP query.
164#' @param queryObject object obtained from \code{\link{KorAPQuery}}
165#' @param verbose
166#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
167#'
168#' @examples
169#' q <- KorAPFetchRest(KorAPQueryNext(KorAPQuery(KorAPConnection(), "Ameisenplage")))
170#' q$collectedMatches
171#'
172#' @seealso \code{\link{KorAPFetchAll}}, \code{\link{KorAPFetchNext}}
173#'
174#' @references
175#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
176#'
177#' @export
178KorAPFetchRest <- function(queryObject, verbose = FALSE) {
179 return(KorAPFetchNext(queryObject, maxFetch = NA, verbose = verbose))
180}