Marc Kupietz | 5bbc9db | 2019-08-30 16:30:45 +0200 | [diff] [blame] | 1 | #' @import jsonlite |
| 2 | #' @import curl |
| 3 | |
# Fields used as the default value of the `fields` argument of KorAPQuery.
defaultFields <- c(
  "corpusSigle",
  "textSigle",
  "pubDate",
  "pubPlace",
  "availability",
  "textClass",
  "snippet"
)

# Fields that KorAPQuery leaves out of a query's field list when only
# metadata is requested.
contentFields <- "snippet"
| 8 | |
# Extract the raw (still URL-encoded) value of one query parameter from a URL.
#
# url:       the URL to inspect
# parameter: name of the query parameter; it is pasted into a regular
#            expression as-is
#
# Returns the text between "<parameter>=" and the next "&" (or the end of
# the string), or "" when the URL does not contain the parameter.
QueryParameterFromUrl <- function(url, parameter) {
  pattern <- paste0(".*[?&]", parameter, "=([^&]*).*")
  # Guard clause: nothing to extract when the parameter is absent.
  if (!grepl(pattern, url)) {
    return("")
  }
  gsub(pattern, '\\1', url, perl = TRUE)
}
| 17 | |
# Extract the decoded value of the "q" parameter from a KorAP URL.
#
# KorAPUrl: a KorAP query URL, e.g. copied from the browser
#
# Returns the URL-decoded query string, or "" when the URL has no "q"
# parameter (previously the complete URL was returned in that case because
# the substitution pattern simply did not match).
KorAPQueryStringFromUrl <- function(KorAPUrl) {
  pattern <- ".*[?&]q=([^&]*).*"
  value <- gsub(pattern, "\\1", KorAPUrl, perl = TRUE)
  # Without a match gsub() hands back its input unchanged; report "no query".
  value[!grepl(pattern, KorAPUrl, perl = TRUE)] <- ""
  # In a URL query component "+" stands for a space; a literal plus sign is
  # transmitted as %2B, so translate "+" before percent-decoding.
  URLdecode(gsub("+", " ", value, fixed = TRUE))
}
| 21 | |
#' Perform a query on the KorAP server.
#'
#' \code{KorAPQuery} sends the query with \code{count=1} and returns the
#' parsed response, extended by the bookkeeping entries that
#' \code{\link{KorAPFetchAll}} and \code{\link{KorAPFetchNext}} read.
#'
#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
#' @param ql string to choose the query language. When \code{KorAPUrl} is used and contains a \code{ql} parameter, that value takes precedence.
#' @param fields (meta)data fields that will be fetched for every match
#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all query results (with \code{\link{KorAPFetchAll}}) or the next page of results (with \code{\link{KorAPFetchNext}}). Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
#'
#' @examples
#' \dontrun{
#' con <- KorAPConnection()
#' q <- KorAPQuery(con, "Ameisenplage")
#' q <- KorAPQuery(KorAPConnection(), "Ameisenplage")
#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
#' }
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPQuery <- function(con, query, vc = NA, KorAPUrl = NA, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields) {
  # `query` has no default value: treat an omitted argument like an explicit
  # NA so that KorAPQuery(con, KorAPUrl = ...) works instead of failing with
  # "argument is missing" inside is.na().
  if (missing(query)) {
    query <- NA
  }
  hasQuery <- !is.na(query)
  hasUrl <- !is.na(KorAPUrl)
  if (hasQuery == hasUrl) {
    stop("Exactly one of the parameters query and KorAPUrl must be specified.")
  }

  if (!hasQuery) {
    # Values taken from a URL are still encoded. Decode them here because
    # they are encoded again below; "+" stands for a space in a URL query.
    decodedParameter <- function(parameter) {
      raw <- QueryParameterFromUrl(KorAPUrl, parameter)
      URLdecode(gsub("+", " ", raw, fixed = TRUE))
    }
    query <- decodedParameter("q")
    vc <- decodedParameter("vc")
    if (vc == "") {
      # The example URL in the documentation above carries the virtual
      # corpus in "cq", so accept that spelling as well.
      vc <- decodedParameter("cq")
    }
    urlQl <- decodedParameter("ql")
    # Keep the `ql` argument when the URL does not specify a query language.
    if (urlQl != "") {
      ql <- urlQl
    }
  }
  if (is.na(vc)) {
    vc <- ""
  }

  # NOTE(review): the virtual corpus is sent as "vc" although browser URLs
  # (see the example above) use "cq" - confirm which name the API expects.
  vcPart <- if (vc != "") paste0("&vc=", URLencode(vc, reserved = TRUE)) else ""
  request <- paste0("?q=", URLencode(query, reserved = TRUE), vcPart, "&ql=", ql)
  webUIRequestUrl <- paste0(con$KorAPUrl, request)
  rewritePart <- if (metadataOnly) "&access-rewrite-disabled=true" else ""
  # Request the fields the caller asked for (defaultFields was hard-coded
  # here before, which silently ignored the `fields` argument).
  requestUrl <- paste0(con$apiUrl, "search", request,
                       "&fields=", paste(fields, collapse = ","),
                       rewritePart)

  result <- fromJSON(paste0(requestUrl, "&count=1"))
  # Element-wise filter: drop content fields when only metadata is wanted.
  # The former scalar `||` looked at the first field only (and is an error
  # for vectors since R 4.3), so "snippet" was never removed.
  result$fields <- if (metadataOnly) fields[!fields %in% contentFields] else fields
  result$requestUrl <- requestUrl
  result$request <- request
  result$vc <- vc
  result$webUIRequestUrl <- webUIRequestUrl
  result$nextStartIndex <- 0
  result$hasMoreMatches <- (result$meta$totalResults > 0)
  result
}
| 70 | |
#' Fetch all results of a KorAP query.
#'
#' Requests the result pages of \code{queryObject$requestUrl} one after the
#' other (50 matches per request) until the number of results reported by
#' the server has been reached.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}}
#' @param verbose if TRUE, print a progress line for every retrieved page
#' @return data frame with one row per match and one column per entry of
#'   \code{queryObject$fields}; fields missing in the response are filled
#'   with NA. \code{pubDate} (if requested) is converted to \code{Date}, all
#'   other columns to factors. An empty data frame if there are no results.
#'
#' @export
KorAPFetchAll <- function(queryObject, verbose = FALSE) {
  if (queryObject$meta$totalResults == 0) {
    return(data.frame())
  }

  page <- 1
  results <- 0
  # Pages are collected in a list and bound once at the end instead of
  # growing a data frame with rbind() on every iteration.
  pages <- list()

  repeat {
    res <- fromJSON(paste0(queryObject$requestUrl, "&count=50&offset=", results))
    if (res$meta$totalResults == 0) {
      return(data.frame())
    }
    # Make sure every requested field exists as a column.
    for (field in queryObject$fields) {
      if (!field %in% colnames(res$matches)) {
        res$matches[, field] <- NA
      }
    }
    currentMatches <- res$matches[queryObject$fields]
    # pubDate is only present when it was requested; do not rely on it.
    factorCols <- setdiff(colnames(currentMatches), "pubDate")
    currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
    if ("pubDate" %in% colnames(currentMatches)) {
      currentMatches$pubDate <- as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
    }
    if (results == 0) {
      # The total reported with the first page decides when to stop.
      expectedResults <- res$meta$totalResults
    }
    pages[[page]] <- currentMatches
    if (verbose) {
      cat(paste0("Retrieved page: ", page, "/", ceiling(expectedResults / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
    }
    page <- page + 1
    results <- results + res$meta$itemsPerPage
    if (results >= expectedResults) {
      break
    }
  }
  do.call(rbind, pages)
}
Marc Kupietz | cb725f8 | 2019-08-30 18:04:57 +0200 | [diff] [blame] | 107 | |
#' Fetch the next page of results of a KorAP query.
#'
#' Requests up to 50 matches starting at \code{offset} and appends them to
#' the matches collected so far.
#'
#' @param queryObject object obtained from \code{\link{KorAPQuery}} or from a
#'   previous call of \code{KorAPFetchNext}
#' @param offset index of the first match to fetch; defaults to
#'   \code{queryObject$nextStartIndex}
#' @param verbose if TRUE, print the benchmark reported for the request
#' @return The parsed response, extended by \code{$collectedMatches} (all
#'   matches collected so far; restarted when \code{offset} is 0),
#'   \code{$nextStartIndex}, \code{$hasMoreMatches} and the request
#'   information copied from \code{queryObject}. If
#'   \code{queryObject$nextStartIndex} has already reached the total number
#'   of results, \code{queryObject} is returned with \code{$hasMoreMatches}
#'   set to FALSE and no request is made.
#'
#' @export
KorAPFetchNext <- function(queryObject, offset = queryObject$nextStartIndex, verbose = FALSE) {
  if (queryObject$nextStartIndex >= queryObject$meta$totalResults) {
    queryObject$hasMoreMatches <- FALSE
    return(queryObject)
  }

  res <- fromJSON(paste0(queryObject$requestUrl, "&count=50&offset=", offset))
  # Make sure every requested field exists as a column.
  for (field in queryObject$fields) {
    if (!field %in% colnames(res$matches)) {
      res$matches[, field] <- NA
    }
  }
  currentMatches <- res$matches[queryObject$fields]
  # pubDate is only present when it was requested; do not rely on it.
  factorCols <- setdiff(colnames(currentMatches), "pubDate")
  currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
  if ("pubDate" %in% colnames(currentMatches)) {
    currentMatches$pubDate <- as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
  }
  if (offset == 0) {
    res$collectedMatches <- currentMatches
  } else {
    res$collectedMatches <- rbind(queryObject$collectedMatches, currentMatches)
  }
  if (verbose) {
    cat(paste0("Retrieved page in ", res$meta$benchmark, '\n'))
  }
  res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
  # Carry the bookkeeping entries of the query object over to the new page,
  # including the virtual corpus, which was dropped before.
  res$fields <- queryObject$fields
  res$requestUrl <- queryObject$requestUrl
  res$request <- queryObject$request
  res$vc <- queryObject$vc
  res$webUIRequestUrl <- queryObject$webUIRequestUrl
  res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)

  res
}