blob: fcb9d8751f2960d7a5fc843b7c11a4d2929edfe2 [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
3
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02004defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz632cbd42019-09-06 16:04:51 +02005 "availability", "textClass", "snippet")
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02006
Marc Kupietz632cbd42019-09-06 16:04:51 +02007contentFields <- c("snippet")
8
9QueryParameterFromUrl <- function(url, parameter) {
10 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
11 if (grepl(regex, url)) {
12 return(gsub(regex, '\\1', url, perl = TRUE))
13 } else {
14 return("")
15 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020016}
17
Marc Kupietz632cbd42019-09-06 16:04:51 +020018KorAPQueryStringFromUrl <- function(KorAPUrl) {
19 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
20}
21
22#' \code{KorAPQuery} perform a query on the KorAP server.
23#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
24#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
25#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
26#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietzb125bdd2019-09-09 12:05:59 +020027#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz632cbd42019-09-06 16:04:51 +020028#' @param ql string to choose the query language
Marc Kupietzb125bdd2019-09-09 12:05:59 +020029#' @param fields (meta)data fields that will be fetched for every match
Marc Kupietz7bce47d2019-09-09 11:53:11 +020030#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}). Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020031#'
32#' @examples
33#' q <- KorAPQuery(con, "Ameisenplage")
34#' q <- KorAPQuery(KorAPConnection(), "Ameisenplage")
35#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
36#'
37#' @references
38#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
39#'
40#' @export
Marc Kupietzb125bdd2019-09-09 12:05:59 +020041KorAPQuery <- function(con, query, vc = NA, KorAPUrl = NA, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields) {
Marc Kupietz632cbd42019-09-06 16:04:51 +020042 if (is.na(query) && is.na(KorAPUrl) || ! (is.na(query) || is.na(KorAPUrl))) {
43 stop("Exaclty one of the parameters query and KorAPUrl must be specified.")
44 }
45 if (is.na(query)) {
46 query <- QueryParameterFromUrl(KorAPUrl, "q")
47 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
48 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
49 }
Marc Kupietz7d88e2e2019-09-07 21:07:40 +020050 if (is.na(vc)) {
51 vc <- ""
52 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020053 request <- paste0('?q=', URLencode(query, reserved=TRUE),
Marc Kupietz632cbd42019-09-06 16:04:51 +020054 ifelse(vc != '', paste0('&vc=', URLencode(vc, reserved=TRUE)), ''),
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020055 '&ql=', ql);
56 webUIRequestUrl <- paste0(con$KorAPUrl, request)
57 requestUrl <- paste0(con$apiUrl, 'search', request,
58 '&fields=', paste(defaultFields, collapse = ","),
Marc Kupietz632cbd42019-09-06 16:04:51 +020059 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020060 result <- fromJSON(paste0(requestUrl, '&count=1'))
Marc Kupietz632cbd42019-09-06 16:04:51 +020061 result$fields <- fields[!metadataOnly || !fields %in% contentFields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020062 result$requestUrl <- requestUrl
63 result$request <- request
Marc Kupietz632cbd42019-09-06 16:04:51 +020064 result$vc <- vc
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020065 result$webUIRequestUrl <- webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +020066 result$nextStartIndex <- 0
67 result$hasMoreMatches <- (result$meta$totalResults > 0)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020068 return(result)
69}
70
Marc Kupietz632cbd42019-09-06 16:04:51 +020071#' @export
Marc Kupietzb3065522019-09-09 11:34:19 +020072KorAPFetchAll <- function(queryObject, verbose=FALSE) {
73 if (queryObject$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020074
75 page <- 1
76 results <- 0
77
78 repeat {
Marc Kupietzb3065522019-09-09 11:34:19 +020079 res <- fromJSON(paste0(queryObject$requestUrl, '&count=50&offset=', results))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020080 if (res$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietzb3065522019-09-09 11:34:19 +020081 for (field in queryObject$fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020082 if (!field %in% colnames(res$matches)) {
83 res$matches[, field] <- NA
84 }
85 }
Marc Kupietzb3065522019-09-09 11:34:19 +020086 currentMatches <- res$matches[queryObject$fields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020087 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
88 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
89 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz9392d5d2019-08-30 16:48:50 +020090 if (results == 0) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020091 allMatches <- currentMatches
92 expectedResults <- res$meta$totalResults
93 } else {
94 allMatches <- rbind(allMatches, currentMatches)
95 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +020096 if (verbose) {
97 cat(paste0("Retrieved page: ", page, "/", ceiling(expectedResults / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
98 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020099 page <- page + 1
100 results <- results + res$meta$itemsPerPage
101 if (results >= expectedResults) {
102 break
103 }
104 }
105 return(allMatches)
106}
Marc Kupietzcb725f82019-08-30 18:04:57 +0200107
Marc Kupietz632cbd42019-09-06 16:04:51 +0200108#' @export
Marc Kupietzb3065522019-09-09 11:34:19 +0200109KorAPFetchNext <- function(queryObject, offset=queryObject$nextStartIndex, verbose=FALSE) {
110 if (queryObject$nextStartIndex >= queryObject$meta$totalResults) {
111 queryObject$hasMoreMatches <- FALSE
112 return(queryObject)
Marc Kupietzcb725f82019-08-30 18:04:57 +0200113 }
114
Marc Kupietzb3065522019-09-09 11:34:19 +0200115 res <- fromJSON(paste0(queryObject$requestUrl, '&count=50&offset=', offset))
116 for (field in queryObject$fields) {
Marc Kupietzcb725f82019-08-30 18:04:57 +0200117 if (!field %in% colnames(res$matches)) {
118 res$matches[, field] <- NA
119 }
120 }
Marc Kupietzb3065522019-09-09 11:34:19 +0200121 currentMatches <- res$matches[queryObject$fields]
Marc Kupietzcb725f82019-08-30 18:04:57 +0200122 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
123 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
124 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
125 if (offset == 0) {
126 res$collectedMatches <- currentMatches
127 } else {
Marc Kupietzb3065522019-09-09 11:34:19 +0200128 res$collectedMatches <- rbind(queryObject$collectedMatches, currentMatches)
Marc Kupietzcb725f82019-08-30 18:04:57 +0200129 }
130 if (verbose) {
131 cat(paste0("Retrieved page in ", res$meta$benchmark, '\n'))
132 }
133 res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
Marc Kupietzb3065522019-09-09 11:34:19 +0200134 res$fields <- queryObject$fields
135 res$requestUrl <- queryObject$requestUrl
136 res$request <- queryObject$request
137 res$webUIRequestUrl <- queryObject$webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +0200138 res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
139
140 return(res)
141}