blob: 1d36e8d0cf389a4c3972d9ced1d07660fde8e325 [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
Marc Kupietzd235a712019-09-16 18:31:33 +02003#' @import utils
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02004
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02005defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz632cbd42019-09-06 16:04:51 +02006 "availability", "textClass", "snippet")
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02007
Marc Kupietz632cbd42019-09-06 16:04:51 +02008contentFields <- c("snippet")
9
Marc Kupietz62da2b52019-09-12 17:43:34 +020010maxResultsPerPage <- 50;
11
Marc Kupietz632cbd42019-09-06 16:04:51 +020012QueryParameterFromUrl <- function(url, parameter) {
13 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
14 if (grepl(regex, url)) {
15 return(gsub(regex, '\\1', url, perl = TRUE))
16 } else {
17 return("")
18 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020019}
20
Marc Kupietz632cbd42019-09-06 16:04:51 +020021KorAPQueryStringFromUrl <- function(KorAPUrl) {
22 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
23}
24
Marc Kupietz62da2b52019-09-12 17:43:34 +020025#' Send a query to a KorAP connection.
Marc Kupietz632cbd42019-09-06 16:04:51 +020026#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
27#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
28#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
29#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020030#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020031#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020032#' @param fields (meta)data fields that will be fetched for every match
Marc Kupietz25aebc32019-09-16 18:40:50 +020033#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficent rights (not yet implemented)
34#' @param verbose print some info
Marc Kupietz62da2b52019-09-12 17:43:34 +020035#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}).
36#' A correspunding URL to be used within a web browser is contained in \code{$webUIRequestUrl}
37#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020038#'
39#' @examples
Marc Kupietz3c531f62019-09-13 12:17:24 +020040#' con <- KorAPConnection()
Marc Kupietz632cbd42019-09-06 16:04:51 +020041#' q <- KorAPQuery(con, "Ameisenplage")
Marc Kupietz3c531f62019-09-13 12:17:24 +020042#' q <- KorAPFetchAll(q)
43#' summary(q$collectedMatches)
44#'
Marc Kupietz37b8ef12019-09-16 18:37:49 +020045#' q <- KorAPQuery(con,
46#' KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +020047#' q$meta$totalResults
48#'
Marc Kupietz37b8ef12019-09-16 18:37:49 +020049#' q <- KorAPQuery(con, "Ameisenplage")
50#' q <- KorAPFetchAll(q, verbose=TRUE)
51#' tokensPerYear <- function(year) { return(KorAPCorpusStats(con, paste("pubDate in", year))$tokens) }
52#' df <- as.data.frame(table(as.numeric(format(q$collectedMatches$pubDate,"%Y")), dnn="year"),
53#' stringsAsFactors = FALSE)
54#' df$ipm <- 1000000 * df$Freq / tokensPerYear(df$year)
55#' plot(df$year, df$ipm, type="l")
56#'
Marc Kupietz3c531f62019-09-13 12:17:24 +020057#' @seealso \code{\link{KorAPConnection}}, \code{\link{KorAPFetchNext}}, \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}, \code{\link{KorAPCorpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +020058#'
59#' @references
60#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
61#'
62#' @export
Marc Kupietzf568f3f2019-09-16 17:03:46 +020063KorAPQuery <- function(con, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
Marc Kupietz25aebc32019-09-16 18:40:50 +020064 accessRewriteFatal = TRUE, verbose=FALSE) {
Marc Kupietzf568f3f2019-09-16 17:03:46 +020065 if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
Marc Kupietz62da2b52019-09-12 17:43:34 +020066 stop("Exactly one of the parameters query and KorAPUrl must be specified.")
Marc Kupietz632cbd42019-09-06 16:04:51 +020067 }
Marc Kupietzf568f3f2019-09-16 17:03:46 +020068 if (missing(query)) {
Marc Kupietz632cbd42019-09-06 16:04:51 +020069 query <- QueryParameterFromUrl(KorAPUrl, "q")
70 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
71 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
72 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020073 request <- paste0('?q=', URLencode(query, reserved=TRUE),
Marc Kupietz37b8ef12019-09-16 18:37:49 +020074 ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020075 webUIRequestUrl <- paste0(con$KorAPUrl, request)
76 requestUrl <- paste0(con$apiUrl, 'search', request,
77 '&fields=', paste(defaultFields, collapse = ","),
Marc Kupietz632cbd42019-09-06 16:04:51 +020078 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
Marc Kupietz25aebc32019-09-16 18:40:50 +020079 if (verbose) {
80 cat(paste0(webUIRequestUrl, "\n"))
81 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020082 result <- fromJSON(paste0(requestUrl, '&count=1'))
Marc Kupietz62da2b52019-09-12 17:43:34 +020083 result$fields <- fields[!fields %in% contentFields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020084 result$requestUrl <- requestUrl
85 result$request <- request
Marc Kupietz632cbd42019-09-06 16:04:51 +020086 result$vc <- vc
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020087 result$webUIRequestUrl <- webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +020088 result$nextStartIndex <- 0
89 result$hasMoreMatches <- (result$meta$totalResults > 0)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020090 return(result)
91}
92
Marc Kupietz62da2b52019-09-12 17:43:34 +020093#' Fetch the next bunch of results of a KorAP query.
94#' @param queryObject object obtained from \code{\link{KorAPQuery}}
95#' @param offset start offset for query results to fetch
96#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +020097#' @param verbose print progress information if true
Marc Kupietz62da2b52019-09-12 17:43:34 +020098#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, , \code{$hasMoreMatches}
99#'
100#' @examples
Marc Kupietz25aebc32019-09-16 18:40:50 +0200101#' q <- KorAPFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage"))
Marc Kupietz62da2b52019-09-12 17:43:34 +0200102#'
103#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchAll}}
104#'
105#' @references
106#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
107#'
Marc Kupietz632cbd42019-09-06 16:04:51 +0200108#' @export
Marc Kupietz62da2b52019-09-12 17:43:34 +0200109KorAPFetchNext <- function(queryObject, offset = queryObject$nextStartIndex, maxFetch = maxResultsPerPage, verbose = FALSE) {
110 if (queryObject$meta$totalResults == 0 || offset >= queryObject$meta$totalResults) {
111 return(queryObject)
112 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200113
114 page <- 1
115 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200116 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietz62da2b52019-09-12 17:43:34 +0200117 collectedMatches <- queryObject$collectedMatches
118
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200119 repeat {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200120 res <- fromJSON(paste0(queryObject$requestUrl, '&count=', min(ifelse(!is.na(maxFetch), maxFetch - results, maxResultsPerPage), maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200121 if (res$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietzb3065522019-09-09 11:34:19 +0200122 for (field in queryObject$fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200123 if (!field %in% colnames(res$matches)) {
124 res$matches[, field] <- NA
125 }
126 }
Marc Kupietzb3065522019-09-09 11:34:19 +0200127 currentMatches <- res$matches[queryObject$fields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200128 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
129 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
130 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200131 if (!is.list(collectedMatches)) {
132 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200133 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200134 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200135 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200136 if (verbose) {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200137 cat(paste0("Retrieved page: ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200138 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200139 page <- page + 1
140 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200141 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200142 break
143 }
144 }
Marc Kupietzcb725f82019-08-30 18:04:57 +0200145 res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
Marc Kupietzb3065522019-09-09 11:34:19 +0200146 res$fields <- queryObject$fields
147 res$requestUrl <- queryObject$requestUrl
148 res$request <- queryObject$request
149 res$webUIRequestUrl <- queryObject$webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +0200150 res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200151 res$collectedMatches <- collectedMatches
Marc Kupietzcb725f82019-08-30 18:04:57 +0200152 return(res)
153}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200154
155#' Fetch all results of a KorAP query.
156#' @param queryObject object obtained from \code{\link{KorAPQuery}}
Marc Kupietz25aebc32019-09-16 18:40:50 +0200157#' @param verbose print progress information if true
Marc Kupietz62da2b52019-09-12 17:43:34 +0200158#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
159#'
160#' @examples
161#' q <- KorAPFetchAll(KorAPQuery(KorAPConnection(), "Ameisenplage"))
162#' q$collectedMatches
163#'
164#' @seealso \code{\link{KorAPFetchRest}}, \code{\link{KorAPFetchNext}}
165#'
166#' @references
167#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
168#'
169#' @export
170KorAPFetchAll <- function(queryObject, verbose = FALSE) {
171 return(KorAPFetchNext(queryObject, offset = 0, maxFetch = NA, verbose = verbose))
172}
173
174#' Fetches all remaining results of a KorAP query.
175#' @param queryObject object obtained from \code{\link{KorAPQuery}}
Marc Kupietz25aebc32019-09-16 18:40:50 +0200176#' @param verbose print progress information if true
Marc Kupietz62da2b52019-09-12 17:43:34 +0200177#' @return The \code{queryObject} input parameter with updated fields \code{$collectedMatches}, \code{$matches} (latest bunch only), \code{$nextStartIndex}, \code{$hasMoreMatches}
178#'
179#' @examples
Marc Kupietz25aebc32019-09-16 18:40:50 +0200180#' q <- KorAPFetchRest(KorAPFetchNext(KorAPQuery(KorAPConnection(), "Ameisenplage")))
Marc Kupietz62da2b52019-09-12 17:43:34 +0200181#' q$collectedMatches
182#'
183#' @seealso \code{\link{KorAPFetchAll}}, \code{\link{KorAPFetchNext}}
184#'
185#' @references
186#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
187#'
188#' @export
189KorAPFetchRest <- function(queryObject, verbose = FALSE) {
190 return(KorAPFetchNext(queryObject, maxFetch = NA, verbose = verbose))
191}