blob: a204390306dc8340596f32c54d3db9c82e7bb46b [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
Marc Kupietzd235a712019-09-16 18:31:33 +02003#' @import utils
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02004
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02005defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz632cbd42019-09-06 16:04:51 +02006 "availability", "textClass", "snippet")
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02007
Marc Kupietz632cbd42019-09-06 16:04:51 +02008contentFields <- c("snippet")
9
Marc Kupietz62da2b52019-09-12 17:43:34 +020010maxResultsPerPage <- 50;
11
Marc Kupietz632cbd42019-09-06 16:04:51 +020012QueryParameterFromUrl <- function(url, parameter) {
13 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
14 if (grepl(regex, url)) {
15 return(gsub(regex, '\\1', url, perl = TRUE))
16 } else {
17 return("")
18 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020019}
20
Marc Kupietz632cbd42019-09-06 16:04:51 +020021KorAPQueryStringFromUrl <- function(KorAPUrl) {
22 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
23}
24
Marc Kupietz62da2b52019-09-12 17:43:34 +020025#' Send a query to a KorAP connection.
Marc Kupietz632cbd42019-09-06 16:04:51 +020026#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
27#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
28#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
29#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020030#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020031#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020032#' @param fields (meta)data fields that will be fetched for every match
Marc Kupietz62da2b52019-09-12 17:43:34 +020033#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}).
34#' A correspunding URL to be used within a web browser is contained in \code{$webUIRequestUrl}
35#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020036#'
37#' @examples
Marc Kupietz3c531f62019-09-13 12:17:24 +020038#' con <- KorAPConnection()
Marc Kupietz632cbd42019-09-06 16:04:51 +020039#' q <- KorAPQuery(con, "Ameisenplage")
Marc Kupietz3c531f62019-09-13 12:17:24 +020040#' q <- KorAPFetchAll(q)
41#' summary(q$collectedMatches)
42#'
Marc Kupietz37b8ef12019-09-16 18:37:49 +020043#' q <- KorAPQuery(con,
44#' KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +020045#' q$meta$totalResults
46#'
Marc Kupietz37b8ef12019-09-16 18:37:49 +020047#' q <- KorAPQuery(con, "Ameisenplage")
48#' q <- KorAPFetchAll(q, verbose=TRUE)
49#' tokensPerYear <- function(year) { return(KorAPCorpusStats(con, paste("pubDate in", year))$tokens) }
50#' df <- as.data.frame(table(as.numeric(format(q$collectedMatches$pubDate,"%Y")), dnn="year"),
51#' stringsAsFactors = FALSE)
52#' df$ipm <- 1000000 * df$Freq / tokensPerYear(df$year)
53#' plot(df$year, df$ipm, type="l")
54#'
Marc Kupietz3c531f62019-09-13 12:17:24 +020055#' @seealso \code{\link{KorAPConnection}}, \code{\link{KorAPFetchNext}}, \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}, \code{\link{KorAPCorpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +020056#'
57#' @references
58#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
59#'
60#' @export
Marc Kupietzf568f3f2019-09-16 17:03:46 +020061KorAPQuery <- function(con, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
Marc Kupietz62da2b52019-09-12 17:43:34 +020062 accessRewriteFatal = TRUE) {
Marc Kupietzf568f3f2019-09-16 17:03:46 +020063 if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
Marc Kupietz62da2b52019-09-12 17:43:34 +020064 stop("Exactly one of the parameters query and KorAPUrl must be specified.")
Marc Kupietz632cbd42019-09-06 16:04:51 +020065 }
Marc Kupietzf568f3f2019-09-16 17:03:46 +020066 if (missing(query)) {
Marc Kupietz632cbd42019-09-06 16:04:51 +020067 query <- QueryParameterFromUrl(KorAPUrl, "q")
68 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
69 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
70 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020071 request <- paste0('?q=', URLencode(query, reserved=TRUE),
Marc Kupietz37b8ef12019-09-16 18:37:49 +020072 ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020073 webUIRequestUrl <- paste0(con$KorAPUrl, request)
74 requestUrl <- paste0(con$apiUrl, 'search', request,
75 '&fields=', paste(defaultFields, collapse = ","),
Marc Kupietz632cbd42019-09-06 16:04:51 +020076 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020077 result <- fromJSON(paste0(requestUrl, '&count=1'))
Marc Kupietz62da2b52019-09-12 17:43:34 +020078 result$fields <- fields[!fields %in% contentFields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020079 result$requestUrl <- requestUrl
80 result$request <- request
Marc Kupietz632cbd42019-09-06 16:04:51 +020081 result$vc <- vc
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020082 result$webUIRequestUrl <- webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +020083 result$nextStartIndex <- 0
84 result$hasMoreMatches <- (result$meta$totalResults > 0)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020085 return(result)
86}
87
Marc Kupietz62da2b52019-09-12 17:43:34 +020088#' Fetch the next bunch of results of a KorAP query.
89#' @param queryObject object obtained from \code{\link{KorAPQuery}}
90#' @param offset start offset for query results to fetch
91#' @param maxFetch maximum number of query results to fetch
92#' @param verbose
93#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, , \code{$hasMoreMatches}
94#'
95#' @examples
96#' q <- KorapFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage"))
97#'
98#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}
99#'
100#' @references
101#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
102#'
Marc Kupietz632cbd42019-09-06 16:04:51 +0200103#' @export
Marc Kupietz62da2b52019-09-12 17:43:34 +0200104KorAPFetchNext <- function(queryObject, offset = queryObject$nextStartIndex, maxFetch = maxResultsPerPage, verbose = FALSE) {
105 if (queryObject$meta$totalResults == 0 || offset >= queryObject$meta$totalResults) {
106 return(queryObject)
107 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200108
109 page <- 1
110 results <- 0
111
Marc Kupietz62da2b52019-09-12 17:43:34 +0200112 collectedMatches <- queryObject$collectedMatches
113
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200114 repeat {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200115 res <- fromJSON(paste0(queryObject$requestUrl, '&count=', min(ifelse(!is.na(maxFetch), maxFetch - results, maxResultsPerPage), maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200116 if (res$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietzb3065522019-09-09 11:34:19 +0200117 for (field in queryObject$fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200118 if (!field %in% colnames(res$matches)) {
119 res$matches[, field] <- NA
120 }
121 }
Marc Kupietzb3065522019-09-09 11:34:19 +0200122 currentMatches <- res$matches[queryObject$fields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200123 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
124 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
125 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200126 if (!is.list(collectedMatches)) {
127 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200128 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200129 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200130 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200131 if (verbose) {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200132 cat(paste0("Retrieved page: ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200133 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200134 page <- page + 1
135 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200136 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200137 break
138 }
139 }
Marc Kupietzcb725f82019-08-30 18:04:57 +0200140 res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
Marc Kupietzb3065522019-09-09 11:34:19 +0200141 res$fields <- queryObject$fields
142 res$requestUrl <- queryObject$requestUrl
143 res$request <- queryObject$request
144 res$webUIRequestUrl <- queryObject$webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +0200145 res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200146 res$collectedMatches <- collectedMatches
Marc Kupietzcb725f82019-08-30 18:04:57 +0200147 return(res)
148}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200149
150#' Fetch all results of a KorAP query.
151#' @param queryObject object obtained from \code{\link{KorAPQuery}}
152#' @param verbose
153#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
154#'
155#' @examples
156#' q <- KorAPFetchAll(KorAPQuery(KorAPConnection(), "Ameisenplage"))
157#' q$collectedMatches
158#'
159#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchNext}}
160#'
161#' @references
162#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
163#'
164#' @export
165KorAPFetchAll <- function(queryObject, verbose = FALSE) {
166 return(KorAPFetchNext(queryObject, offset = 0, maxFetch = NA, verbose = verbose))
167}
168
169#' Fetches all remaining results of a KorAP query.
170#' @param queryObject object obtained from \code{\link{KorAPQuery}}
171#' @param verbose
172#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
173#'
174#' @examples
175#' q <- KorAPFetchRest(KorAPQueryNext(KorAPQuery(KorAPConnection(), "Ameisenplage")))
176#' q$collectedMatches
177#'
178#' @seealso \code{\link{KorAPFetchAll}}, \code{\link{KorAPFetchNext}}
179#'
180#' @references
181#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
182#'
183#' @export
184KorAPFetchRest <- function(queryObject, verbose = FALSE) {
185 return(KorAPFetchNext(queryObject, maxFetch = NA, verbose = verbose))
186}