blob: 8bc822f284ee65fbcc871aa1a35e1a358fab1e7d [file] [log] [blame]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02001#' @import jsonlite
2#' @import curl
3
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02004defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietz632cbd42019-09-06 16:04:51 +02005 "availability", "textClass", "snippet")
Marc Kupietz5bbc9db2019-08-30 16:30:45 +02006
Marc Kupietz632cbd42019-09-06 16:04:51 +02007contentFields <- c("snippet")
8
9QueryParameterFromUrl <- function(url, parameter) {
10 regex <- paste0(".*[?&]", parameter, "=([^&]*).*")
11 if (grepl(regex, url)) {
12 return(gsub(regex, '\\1', url, perl = TRUE))
13 } else {
14 return("")
15 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020016}
17
Marc Kupietz632cbd42019-09-06 16:04:51 +020018KorAPQueryStringFromUrl <- function(KorAPUrl) {
19 return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
20}
21
22#' \code{KorAPQuery} perform a query on the KorAP server.
23#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
24#' @param query string that contains the corpus query. The query langauge depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
25#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
26#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
27#' @param metaDataOnly boolean that determines wether queries should return only metadata without any snippets. This can also be useful to prevent query rewrites.
28#' @param ql string to choose the query language
29#' @param fields (meta)data fields that will be fetch for every matcch
30#'
31#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all (\code{\link{KorAPFetchAll}) or the next page of results (\code{\link{KorAPFetchNext}}). Please make sure to check \code{$collection$rewrites} to see if any unforseen rewrites of the query had to be performed.
32#'
33#' @examples
34#' q <- KorAPQuery(con, "Ameisenplage")
35#' q <- KorAPQuery(KorAPConnection(), "Ameisenplage")
36#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
37#'
38#' @references
39#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
40#'
41#' @export
42KorAPQuery <- function(con, query = NA, vc=NA, KorAPUrl = NA, metadataOnly=FALSE, ql="poliqarp", fields=defaultFields) {
43 if (is.na(query) && is.na(KorAPUrl) || ! (is.na(query) || is.na(KorAPUrl))) {
44 stop("Exaclty one of the parameters query and KorAPUrl must be specified.")
45 }
46 if (is.na(query)) {
47 query <- QueryParameterFromUrl(KorAPUrl, "q")
48 vc <- QueryParameterFromUrl(KorAPUrl, "vc")
49 ql <- QueryParameterFromUrl(KorAPUrl, "ql")
50 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020051 request <- paste0('?q=', URLencode(query, reserved=TRUE),
Marc Kupietz632cbd42019-09-06 16:04:51 +020052 ifelse(vc != '', paste0('&vc=', URLencode(vc, reserved=TRUE)), ''),
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020053 '&ql=', ql);
54 webUIRequestUrl <- paste0(con$KorAPUrl, request)
Marc Kupietz632cbd42019-09-06 16:04:51 +020055 if (is.na(vc)) {
56 vc <-""
57 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020058 requestUrl <- paste0(con$apiUrl, 'search', request,
59 '&fields=', paste(defaultFields, collapse = ","),
Marc Kupietz632cbd42019-09-06 16:04:51 +020060 ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020061 result <- fromJSON(paste0(requestUrl, '&count=1'))
62
Marc Kupietz632cbd42019-09-06 16:04:51 +020063 result$fields <- fields[!metadataOnly || !fields %in% contentFields]
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020064 result$requestUrl <- requestUrl
65 result$request <- request
Marc Kupietz632cbd42019-09-06 16:04:51 +020066 result$vc <- vc
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020067 result$webUIRequestUrl <- webUIRequestUrl
Marc Kupietzcb725f82019-08-30 18:04:57 +020068 result$nextStartIndex <- 0
69 result$hasMoreMatches <- (result$meta$totalResults > 0)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020070 return(result)
71}
72
Marc Kupietz632cbd42019-09-06 16:04:51 +020073#' @export
Marc Kupietz31569912019-08-30 16:53:04 +020074KorAPFetchAll <- function(query, verbose=FALSE) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020075 if (query$meta$totalResults == 0) { return(data.frame()) }
76
77 page <- 1
78 results <- 0
79
80 repeat {
Marc Kupietz9392d5d2019-08-30 16:48:50 +020081 res <- fromJSON(paste0(query$requestUrl, '&count=50&offset=', results))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020082 if (res$meta$totalResults == 0) { return(data.frame()) }
83 for (field in query$fields) {
84 if (!field %in% colnames(res$matches)) {
85 res$matches[, field] <- NA
86 }
87 }
88 currentMatches <- res$matches[query$fields]
89 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
90 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
91 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
Marc Kupietz9392d5d2019-08-30 16:48:50 +020092 if (results == 0) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020093 allMatches <- currentMatches
94 expectedResults <- res$meta$totalResults
95 } else {
96 allMatches <- rbind(allMatches, currentMatches)
97 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +020098 if (verbose) {
99 cat(paste0("Retrieved page: ", page, "/", ceiling(expectedResults / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
100 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200101 page <- page + 1
102 results <- results + res$meta$itemsPerPage
103 if (results >= expectedResults) {
104 break
105 }
106 }
107 return(allMatches)
108}
Marc Kupietzcb725f82019-08-30 18:04:57 +0200109
Marc Kupietz632cbd42019-09-06 16:04:51 +0200110#' @export
Marc Kupietzcb725f82019-08-30 18:04:57 +0200111KorAPFetchNext <- function(query, offset=query$nextStartIndex, verbose=FALSE) {
112 if (query$nextStartIndex >= query$meta$totalResults) {
113 query$hasMoreMatches <- FALSE
114 return(query)
115 }
116
117 res <- fromJSON(paste0(query$requestUrl, '&count=50&offset=', offset))
118 for (field in query$fields) {
119 if (!field %in% colnames(res$matches)) {
120 res$matches[, field] <- NA
121 }
122 }
123 currentMatches <- res$matches[query$fields]
124 factorCols <- colnames(subset(currentMatches, select=-c(pubDate)))
125 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
126 currentMatches$pubDate = as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
127 if (offset == 0) {
128 res$collectedMatches <- currentMatches
129 } else {
130 res$collectedMatches <- rbind(query$collectedMatches, currentMatches)
131 }
132 if (verbose) {
133 cat(paste0("Retrieved page in ", res$meta$benchmark, '\n'))
134 }
135 res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
136 res$fields <- query$fields
137 res$requestUrl <- query$requestUrl
138 res$request <- query$request
139 res$webUIRequestUrl <- query$webUIRequestUrl
140 res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)
141
142 return(res)
143}