blob: d0dbf4859879169591f08728cb3fdce0899752d1 [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
3
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02004defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz632cbd42019-09-06 16:04:51 +02005 "availability", "textClass", "snippet")
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02006
Marc Kupietz632cbd42019-09-06 16:04:51 +02007contentFields <- c("snippet")
8
9QueryParameterFromUrl <- function(url, parameter) {
10 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
11 if (grepl(regex, url)) {
12 return(gsub(regex, '\\1', url, perl = TRUE))
13 } else {
14 return("")
15 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020016}
17
Marc Kupietz632cbd42019-09-06 16:04:51 +020018KorAPQueryStringFromUrl <- function(KorAPUrl) {
19 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
20}
21
22#' \code{KorAPQuery} perform a query on the KorAP server.
23#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
24#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
25#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
26#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietzb3065522019-09-09 11:34:19 +020027#' @param metadataOnly boolean that determines wether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites.
Marc Kupietz632cbd42019-09-06 16:04:51 +020028#' @param ql string to choose the query language
29#' @param fields (meta)data fields that will be fetch for every matcch
Marc Kupietz7bce47d2019-09-09 11:53:11 +020030#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}). Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020031#'
32#' @examples
33#' q <- KorAPQuery(con, "Ameisenplage")
34#' q <- KorAPQuery(KorAPConnection(), "Ameisenplage")
35#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
36#'
37#' @references
38#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
39#'
40#' @export
Marc Kupietz7d88e2e2019-09-07 21:07:40 +020041KorAPQuery <- function(con, query, vc = NA, KorAPUrl = NA, metadataOnly=FALSE, ql="poliqarp", fields=defaultFields) {
Marc Kupietz632cbd42019-09-06 16:04:51 +020042 if (is.na(query) && is.na(KorAPUrl) || ! (is.na(query) || is.na(KorAPUrl))) {
43 stop("Exaclty one of the parameters query and KorAPUrl must be specified.")
44 }
45 if (is.na(query)) {
46 query <- QueryParameterFromUrl(KorAPUrl, "q")
47 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
48 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
49 }
Marc Kupietz7d88e2e2019-09-07 21:07:40 +020050 if (is.na(vc)) {
51 vc <- ""
52 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020053 request <- paste0('?q=', URLencode(query, reserved=TRUE),
Marc Kupietz632cbd42019-09-06 16:04:51 +020054 ifelse(vc != '', paste0('&vc=', URLencode(vc, reserved=TRUE)), ''),
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020055 '&ql=', ql);
56 webUIRequestUrl <- paste0(con$KorAPUrl, request)
57 requestUrl <- paste0(con$apiUrl, 'search', request,
58 '&fields=', paste(defaultFields, collapse = ","),
Marc Kupietz632cbd42019-09-06 16:04:51 +020059 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020060 result <- fromJSON(paste0(requestUrl, '&count=1'))
61
Marc Kupietz632cbd42019-09-06 16:04:51 +020062 result$fields <- fields[!metadataOnly || !fields %in% contentFields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020063 result$requestUrl <- requestUrl
64 result$request <- request
Marc Kupietz632cbd42019-09-06 16:04:51 +020065 result$vc <- vc
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020066 result$webUIRequestUrl <- webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +020067 result$nextStartIndex <- 0
68 result$hasMoreMatches <- (result$meta$totalResults > 0)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020069 return(result)
70}
71
Marc Kupietz632cbd42019-09-06 16:04:51 +020072#' @export
Marc Kupietzb3065522019-09-09 11:34:19 +020073KorAPFetchAll <- function(queryObject, verbose=FALSE) {
74 if (queryObject$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020075
76 page <- 1
77 results <- 0
78
79 repeat {
Marc Kupietzb3065522019-09-09 11:34:19 +020080 res <- fromJSON(paste0(queryObject$requestUrl, '&count=50&offset=', results))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020081 if (res$meta$totalResults == 0) { return(data.frame()) }
Marc Kupietzb3065522019-09-09 11:34:19 +020082 for (field in queryObject$fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020083 if (!field %in% colnames(res$matches)) {
84 res$matches[, field] <- NA
85 }
86 }
Marc Kupietzb3065522019-09-09 11:34:19 +020087 currentMatches <- res$matches[queryObject$fields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020088 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
89 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
90 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz9392d5d2019-08-30 16:48:50 +020091 if (results == 0) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020092 allMatches <- currentMatches
93 expectedResults <- res$meta$totalResults
94 } else {
95 allMatches <- rbind(allMatches, currentMatches)
96 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +020097 if (verbose) {
98 cat(paste0("Retrieved page: ", page, "/", ceiling(expectedResults / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
99 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200100 page <- page + 1
101 results <- results + res$meta$itemsPerPage
102 if (results >= expectedResults) {
103 break
104 }
105 }
106 return(allMatches)
107}
Marc Kupietzcb725f82019-08-30 18:04:57 +0200108
Marc Kupietz632cbd42019-09-06 16:04:51 +0200109#' @export
Marc Kupietzb3065522019-09-09 11:34:19 +0200110KorAPFetchNext <- function(queryObject, offset=queryObject$nextStartIndex, verbose=FALSE) {
111 if (queryObject$nextStartIndex >= queryObject$meta$totalResults) {
112 queryObject$hasMoreMatches <- FALSE
113 return(queryObject)
Marc Kupietzcb725f82019-08-30 18:04:57 +0200114 }
115
Marc Kupietzb3065522019-09-09 11:34:19 +0200116 res <- fromJSON(paste0(queryObject$requestUrl, '&count=50&offset=', offset))
117 for (field in queryObject$fields) {
Marc Kupietzcb725f82019-08-30 18:04:57 +0200118 if (!field %in% colnames(res$matches)) {
119 res$matches[, field] <- NA
120 }
121 }
Marc Kupietzb3065522019-09-09 11:34:19 +0200122 currentMatches <- res$matches[queryObject$fields]
Marc Kupietzcb725f82019-08-30 18:04:57 +0200123 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
124 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
125 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
126 if (offset == 0) {
127 res$collectedMatches <- currentMatches
128 } else {
Marc Kupietzb3065522019-09-09 11:34:19 +0200129 res$collectedMatches <- rbind(queryObject$collectedMatches, currentMatches)
Marc Kupietzcb725f82019-08-30 18:04:57 +0200130 }
131 if (verbose) {
132 cat(paste0("Retrieved page in ", res$meta$benchmark, '\n'))
133 }
134 res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
Marc Kupietzb3065522019-09-09 11:34:19 +0200135 res$fields <- queryObject$fields
136 res$requestUrl <- queryObject$requestUrl
137 res$request <- queryObject$request
138 res$webUIRequestUrl <- queryObject$webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +0200139 res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
140
141 return(res)
142}