blob: 98039f397da846002e549ab64d817ff1035779d5 [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
3
# Metadata fields requested for every match unless the caller of KorAPQuery
# passes its own selection.
defaultFields <- c(
  "corpusSigle",
  "textSigle",
  "pubDate",
  "pubPlace",
  "availability",
  "textClass",
  "snippet"
)

# Fields that KorAPQuery removes from the list of columns kept in the
# collected matches.
contentFields <- "snippet"

# Upper bound for the `count` parameter of a single request in KorAPFetchNext.
maxResultsPerPage <- 50
10
# Extract the raw (still percent-encoded) value of a query parameter from a URL.
#
# url:       a single URL string
# parameter: name of the query parameter, matched literally
#
# Returns the value of the last occurrence of `parameter` in the query part of
# `url`, or "" if the parameter is not present. A trailing "#fragment" is never
# part of the returned value.
QueryParameterFromUrl <- function(url, parameter) {
  # \Q...\E makes regex metacharacters in the parameter name match literally;
  # [^#]* keeps the match inside the query part and [^&#]* ends the value at
  # the next parameter or at the fragment.
  pattern <- paste0("^[^#]*[?&]\\Q", parameter, "\\E=([^&#]*).*$")
  if (grepl(pattern, url, perl = TRUE)) {
    sub(pattern, "\\1", url, perl = TRUE)
  } else {
    ""
  }
}
19
# Extract the decoded query string (parameter `q`) from a KorAP web URL.
#
# KorAPUrl: a single KorAP URL as copied from the browser
#
# Returns the decoded value of `q`, or "" if the URL has no `q` parameter
# (instead of handing back the whole URL as if it were a query).
KorAPQueryStringFromUrl <- function(KorAPUrl) {
  pattern <- ".*[?&]q=([^&]*).*"
  if (!grepl(pattern, KorAPUrl, perl = TRUE)) {
    return("")
  }
  encoded <- sub(pattern, "\\1", KorAPUrl, perl = TRUE)
  # "+" stands for a blank in query strings; translate it before
  # percent-decoding so that an encoded "%2B" still yields a literal plus.
  URLdecode(gsub("+", " ", encoded, fixed = TRUE))
}
23
#' Send a query to a KorAP connection.
#'
#' Builds the request either from \code{query} (plus \code{vc} and \code{ql})
#' or from a KorAP URL copied from the browser, and fetches a single match
#' (\code{count=1}) to obtain the query's metadata.
#'
#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Exactly one of \code{query} and \code{KorAPUrl} must be provided.
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query. The URL parameters \code{q}, \code{vc} (or, if absent, \code{cq}) and \code{ql} are evaluated; if the URL has no \code{ql} parameter, the \code{ql} argument is used.
#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
#' @param ql string to choose the query language
#' @param fields (meta)data fields that will be fetched for every match
#' @param accessRewriteFatal currently not evaluated by this function
#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}).
#' A corresponding URL to be used within a web browser is contained in \code{$webUIRequestUrl}.
#' Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
#'
#' @examples
#' con <- KorAPConnection()
#' q <- KorAPQuery(con, "Ameisenplage")
#' q <- KorAPQuery(KorAPConnection(), "Ameisenplage")
#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPQuery <- function(con, query = NA, vc = NA, KorAPUrl = NA, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
                       accessRewriteFatal = TRUE) {
  # Exactly one of the two has to be given: both missing or both present is
  # an error.
  if (is.na(query) == is.na(KorAPUrl)) {
    stop("Exactly one of the parameters query and KorAPUrl must be specified.")
  }
  if (is.na(query)) {
    # Values taken from a browser URL are percent-encoded and use "+" for
    # blanks. Decode them here, otherwise URLencode() below would encode them
    # a second time.
    decodedParameter <- function(parameter) {
      raw <- QueryParameterFromUrl(KorAPUrl, parameter)
      URLdecode(gsub("+", " ", raw, fixed = TRUE))
    }
    query <- decodedParameter("q")
    vc <- decodedParameter("vc")
    if (vc == "") {
      # The example URL in the documentation above carries the corpus query
      # as "cq"; accept that spelling as well.
      vc <- decodedParameter("cq")
    }
    qlFromUrl <- decodedParameter("ql")
    if (qlFromUrl != "") {
      ql <- qlFromUrl
    }
  }
  if (is.na(vc)) {
    vc <- ""
  }

  vcPart <- if (vc != "") paste0("&vc=", URLencode(vc, reserved = TRUE)) else ""
  request <- paste0("?q=", URLencode(query, reserved = TRUE), vcPart, "&ql=", ql)
  webUIRequestUrl <- paste0(con$KorAPUrl, request)
  # Request the fields the caller asked for, not unconditionally the defaults.
  requestUrl <- paste0(con$apiUrl, "search", request,
                       "&fields=", paste(fields, collapse = ","),
                       if (metadataOnly) "&access-rewrite-disabled=true" else "")

  # A single match is enough to learn the total number of results.
  result <- fromJSON(paste0(requestUrl, "&count=1"))
  result$fields <- fields[!fields %in% contentFields]
  result$requestUrl <- requestUrl
  result$request <- request
  result$vc <- vc
  result$webUIRequestUrl <- webUIRequestUrl
  result$nextStartIndex <- 0
  result$hasMoreMatches <- (result$meta$totalResults > 0)
  return(result)
}
75
#' Fetch the next bunch of results of a KorAP query.
#'
#' Requests result pages of at most \code{maxResultsPerPage} matches each,
#' starting at \code{offset}, until \code{maxFetch} matches have been
#' requested or the end of the result list is reached.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}}
#' @param offset start offset for query results to fetch
#' @param maxFetch maximum number of query results to fetch; \code{NA} means no limit
#' @param verbose if \code{TRUE}, a progress line is printed for every retrieved page
#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}. If there is nothing (more) to fetch, \code{queryObject} is returned unchanged.
#'
#' @examples
#' q <- KorAPFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage"))
#'
#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPFetchNext <- function(queryObject, offset = queryObject$nextStartIndex, maxFetch = maxResultsPerPage, verbose = FALSE) {
  if (queryObject$meta$totalResults == 0 || offset >= queryObject$meta$totalResults) {
    return(queryObject)
  }

  page <- 1
  results <- 0
  # Pages are gathered in a list and bound once after the loop instead of
  # growing a data frame with rbind() on every iteration.
  pages <- list()

  repeat {
    count <- if (is.na(maxFetch)) {
      maxResultsPerPage
    } else {
      min(maxFetch - results, maxResultsPerPage)
    }
    res <- fromJSON(paste0(queryObject$requestUrl, '&count=', count, '&offset=', offset + results))
    if (res$meta$totalResults == 0 || NROW(res$matches) == 0) {
      # Nothing usable came back. Keep the documented return type rather than
      # handing back an empty data frame.
      if (length(pages) == 0) {
        return(queryObject)
      }
      break
    }
    # Fields the server did not deliver still get a column so that all pages
    # have the same layout.
    for (field in queryObject$fields) {
      if (!field %in% colnames(res$matches)) {
        res$matches[, field] <- NA
      }
    }
    currentMatches <- res$matches[queryObject$fields]
    # Everything except the publication date becomes a factor; pubDate is only
    # converted if it was requested at all.
    factorCols <- setdiff(colnames(currentMatches), "pubDate")
    if (length(factorCols) > 0) {
      currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
    }
    if ("pubDate" %in% colnames(currentMatches)) {
      currentMatches$pubDate <- as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
    }
    pages[[length(pages) + 1]] <- currentMatches
    if (verbose) {
      cat(paste0("Retrieved page: ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
    }
    page <- page + 1
    results <- results + res$meta$itemsPerPage
    if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
      break
    }
  }

  # The last server response is the basis of the returned object; carry over
  # the bookkeeping fields of the incoming query object.
  res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
  res$fields <- queryObject$fields
  res$requestUrl <- queryObject$requestUrl
  res$request <- queryObject$request
  res$webUIRequestUrl <- queryObject$webUIRequestUrl
  res$vc <- queryObject$vc
  res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)

  previousMatches <- queryObject$collectedMatches
  if (is.list(previousMatches)) {
    pages <- c(list(previousMatches), pages)
  }
  res$collectedMatches <- do.call(rbind, pages)
  return(res)
}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200137
#' Fetch all results of a KorAP query.
#'
#' Fetching always starts at the first result. Matches that were already
#' collected in \code{queryObject} are discarded first, so that they do not
#' show up twice in \code{$collectedMatches}.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}}
#' @param verbose passed on to \code{\link{KorAPFetchNext}}
#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
#'
#' @examples
#' q <- KorAPFetchAll(KorAPQuery(KorAPConnection(), "Ameisenplage"))
#' q$collectedMatches
#'
#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchNext}}
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPFetchAll <- function(queryObject, verbose = FALSE) {
  # KorAPFetchNext appends to the matches already stored in the query object;
  # as everything is fetched again from offset 0, start with a clean slate.
  queryObject$collectedMatches <- NULL
  return(KorAPFetchNext(queryObject, offset = 0, maxFetch = NA, verbose = verbose))
}
156
#' Fetch all remaining results of a KorAP query.
#'
#' Continues at \code{queryObject$nextStartIndex} and fetches without an upper
#' limit on the number of results.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}}
#' @param verbose passed on to \code{\link{KorAPFetchNext}}
#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
#'
#' @examples
#' q <- KorAPFetchRest(KorAPFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage")))
#' q$collectedMatches
#'
#' @seealso \code{\link{KorAPFetchAll}}, \code{\link{KorAPFetchNext}}
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPFetchRest <- function(queryObject, verbose = FALSE) {
  # The offset is left at KorAPFetchNext's default (the query object's
  # nextStartIndex); maxFetch = NA lifts the limit on the number of results.
  KorAPFetchNext(
    queryObject,
    verbose = verbose,
    maxFetch = NA
  )
}