Marc Kupietz | 5bbc9db | 2019-08-30 16:30:45 +0200 | [diff] [blame] | 1 | #' @import jsonlite |
| 2 | #' @import curl |
| 3 | |
# Fields requested for every match; also the default value of the `fields`
# argument of KorAPQuery().
defaultFields <- c(
  "corpusSigle",
  "textSigle",
  "pubDate",
  "pubPlace",
  "availability",
  "textClass",
  "snippet"
)

# Fields that KorAPQuery() removes from the result's field list when it is
# called with metadataOnly = TRUE.
contentFields <- "snippet"
| 8 | |
# Extract the raw value of a single query-string parameter from a URL.
#
# The value is returned exactly as it appears in the URL, i.e. it is NOT
# percent-decoded. If the parameter occurs more than once, the last
# occurrence wins (the leading `.*` is greedy).
#
# url:       URL string to inspect.
# parameter: name of the query parameter; matched literally.
#
# Returns the parameter's value, or "" if the parameter is not present.
QueryParameterFromUrl <- function(url, parameter) {
  # A fragment ("#...") is never part of the query string; without removing
  # it, "?q=foo#bar" would yield "foo#bar".
  url <- sub("#.*$", "", url, perl = TRUE)
  # \Q...\E makes PCRE treat the parameter name literally, so names that
  # contain regex metacharacters (".", "[", ...) cannot match other names.
  regex <- paste0(".*[?&]\\Q", parameter, "\\E=([^&]*).*")
  if (!grepl(regex, url, perl = TRUE)) {
    return("")
  }
  sub(regex, "\\1", url, perl = TRUE)
}
| 17 | |
# Extract the decoded query string (the `q` parameter) from a KorAP URL.
#
# KorAPUrl: URL as copied from the KorAP web UI.
#
# Returns the value of `q` with "+" turned into spaces and percent-escapes
# decoded, or "" if the URL carries no `q` parameter.
KorAPQueryStringFromUrl <- function(KorAPUrl) {
  pattern <- ".*[?&]q=([^&]*).*"
  rawQuery <- sub(pattern, "\\1", KorAPUrl, perl = TRUE)
  # If the pattern does not match, sub() returns its input unchanged; without
  # this reset the complete URL would be reported as the query.
  rawQuery[!grepl(pattern, KorAPUrl, perl = TRUE)] <- ""
  # In query strings "+" encodes a space. It has to be replaced before
  # percent-decoding so that an escaped plus ("%2B") survives as "+".
  utils::URLdecode(gsub("+", " ", rawQuery, fixed = TRUE))
}
| 21 | |
#' Perform a query on the KorAP server
#'
#' \code{KorAPQuery} performs a query on the KorAP server.
#'
#' @param con object obtained from \code{\link{KorAPConnection}}, that contains all necessary connection information
#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Exactly one of \code{query} and \code{KorAPUrl} must be provided.
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query. The query, the virtual corpus and (if present) the query language are then taken from the URL.
#' @param metadataOnly boolean that determines whether queries should return only metadata without any snippets. This can also be useful to prevent query rewrites.
#' @param ql string to choose the query language
#' @param fields (meta)data fields that will be fetched for every match
#'
#' @return A KorAP query object that, among other information, contains the total number of results in \code{$meta$totalResults}. The resulting object can be used to fetch all (\code{\link{KorAPFetchAll}}) or the next page of results (\code{\link{KorAPFetchNext}}). Please make sure to check \code{$collection$rewrites} to see if any unforeseen rewrites of the query had to be performed.
#'
#' @examples
#' \dontrun{
#' con <- KorAPConnection()
#' q <- KorAPQuery(con, "Ameisenplage")
#' q <- KorAPQuery(KorAPConnection(), "Ameisenplage")
#' q <- KorAPQuery(con, KorAPUrl = "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp&cutoff=1")
#' }
#'
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
#' @export
KorAPQuery <- function(con, query, vc = NA, KorAPUrl = NA, metadataOnly=FALSE, ql="poliqarp", fields=defaultFields) {
  # `query` has no default value, so touching it when the caller supplied only
  # KorAPUrl would raise "argument is missing"; treat it as NA instead.
  if (missing(query)) {
    query <- NA
  }
  isGiven <- function(x) length(x) > 0 && !is.na(x[[1]])
  hasQuery <- isGiven(query)
  hasUrl <- isGiven(KorAPUrl)
  if (hasQuery == hasUrl) {
    stop("Exactly one of the parameters query and KorAPUrl must be specified.")
  }

  if (!hasQuery) {
    # Values taken from a URL are still encoded ("+" for space, %XX escapes).
    # Decode them here, because they are encoded again when the request is
    # assembled below.
    decodeValue <- function(x) {
      utils::URLdecode(gsub("+", " ", x, fixed = TRUE))
    }
    query <- decodeValue(QueryParameterFromUrl(KorAPUrl, "q"))
    # The documented web UI URL carries the virtual corpus as `cq`;
    # `vc` is still accepted as a fallback.
    urlVc <- QueryParameterFromUrl(KorAPUrl, "cq")
    if (urlVc == "") {
      urlVc <- QueryParameterFromUrl(KorAPUrl, "vc")
    }
    vc <- decodeValue(urlVc)
    # Keep the `ql` argument when the URL does not name a query language.
    urlQl <- QueryParameterFromUrl(KorAPUrl, "ql")
    if (urlQl != "") {
      ql <- decodeValue(urlQl)
    }
  }
  if (is.na(vc)) {
    vc <- ""
  }

  # NOTE(review): the virtual corpus is sent as `vc`, while the web UI URL in
  # the examples uses `cq` -- confirm which name the server expects.
  vcPart <- if (vc != "") {
    paste0("&vc=", utils::URLencode(vc, reserved = TRUE))
  } else {
    ""
  }
  request <- paste0("?q=", utils::URLencode(query, reserved = TRUE),
                    vcPart,
                    "&ql=", ql)
  webUIRequestUrl <- paste0(con$KorAPUrl, request)
  # Request the fields the caller asked for (not unconditionally the defaults).
  requestUrl <- paste0(con$apiUrl, "search", request,
                       "&fields=", paste(fields, collapse = ","),
                       if (metadataOnly) "&access-rewrite-disabled=true" else "")
  # A single match is enough here: only the meta information is needed.
  result <- fromJSON(paste0(requestUrl, "&count=1"))

  # Element-wise `|` is required: `fields` is a vector, and the scalar `||`
  # would only look at its first element (and is an error in R >= 4.3).
  keepField <- !metadataOnly | !(fields %in% contentFields)
  result$fields <- fields[keepField]
  result$requestUrl <- requestUrl
  result$request <- request
  result$vc <- vc
  result$webUIRequestUrl <- webUIRequestUrl
  result$nextStartIndex <- 0
  result$hasMoreMatches <- (result$meta$totalResults > 0)
  result
}
| 72 | |
#' Fetch all results of a KorAP query
#'
#' Requests the matches of a query page by page (50 matches per request)
#' until the number of results reported by the server has been reached.
#'
#' @param query object returned by \code{\link{KorAPQuery}}
#' @param verbose if \code{TRUE}, print a progress line for every retrieved page
#'
#' @return A data frame with one row per match and one column per entry of
#'   \code{query$fields}. Fields missing in the server response are filled
#'   with \code{NA}. \code{pubDate} (if requested) is converted to
#'   \code{Date}, all other columns to factors. An empty data frame is
#'   returned if there are no results.
#'
#' @export
KorAPFetchAll <- function(query, verbose=FALSE) {
  if (query$meta$totalResults == 0) {
    return(data.frame())
  }

  pages <- list()
  offset <- 0
  expectedResults <- NULL

  repeat {
    res <- fromJSON(paste0(query$requestUrl, '&count=50&offset=', offset))
    if (res$meta$totalResults == 0) {
      return(data.frame())
    }
    if (is.null(expectedResults)) {
      # The total reported with the first page determines when to stop.
      expectedResults <- res$meta$totalResults
    }

    matches <- res$matches
    # An empty page means there is nothing more to collect, even if fewer
    # matches than announced have been delivered.
    if (!is.data.frame(matches) || nrow(matches) == 0) {
      break
    }
    for (field in setdiff(query$fields, colnames(matches))) {
      matches[[field]] <- NA
    }
    # Collect the pages in a list and bind them once after the loop instead
    # of growing a data frame with rbind() in every iteration.
    pages[[length(pages) + 1]] <- matches[query$fields]

    if (verbose) {
      cat(paste0("Retrieved page: ", length(pages), "/", ceiling(expectedResults / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
    }

    pageSize <- res$meta$itemsPerPage
    # Without a positive page size the offset would never advance.
    if (is.null(pageSize) || pageSize <= 0) {
      break
    }
    offset <- offset + pageSize
    if (offset >= expectedResults) {
      break
    }
  }

  if (length(pages) == 0) {
    return(data.frame())
  }
  allMatches <- do.call(rbind, pages)

  # pubDate is only present if it was requested, so select the factor
  # columns by name instead of assuming a pubDate column exists.
  factorCols <- setdiff(colnames(allMatches), "pubDate")
  if (length(factorCols) > 0) {
    allMatches[factorCols] <- lapply(allMatches[factorCols], factor)
  }
  if ("pubDate" %in% colnames(allMatches)) {
    allMatches$pubDate <- as.Date(allMatches$pubDate, format = "%Y-%m-%d")
  }
  allMatches
}
Marc Kupietz | cb725f8 | 2019-08-30 18:04:57 +0200 | [diff] [blame] | 109 | |
#' Fetch the next page of results of a KorAP query
#'
#' Requests one further page (up to 50 matches) and appends it to the matches
#' collected so far.
#'
#' @param query object returned by \code{\link{KorAPQuery}} or by a previous
#'   call of \code{KorAPFetchNext}
#' @param offset index of the first match to fetch; defaults to
#'   \code{query$nextStartIndex}. An offset of 0 restarts the collection.
#' @param verbose if \code{TRUE}, print the time the server reported for the page
#'
#' @return The query object, updated with the server response. The matches
#'   retrieved so far are in \code{$collectedMatches}; \code{$nextStartIndex}
#'   and \code{$hasMoreMatches} tell where and whether to continue. If there
#'   is nothing left to fetch, \code{query} is returned with
#'   \code{$hasMoreMatches} set to \code{FALSE}.
#'
#' @export
KorAPFetchNext <- function(query, offset=query$nextStartIndex, verbose=FALSE) {
  # Check the offset that is actually going to be requested; by default this
  # is query$nextStartIndex.
  if (offset >= query$meta$totalResults) {
    query$hasMoreMatches <- FALSE
    return(query)
  }

  res <- fromJSON(paste0(query$requestUrl, '&count=50&offset=', offset))
  matches <- res$matches
  # An empty page means the server has no further matches to deliver.
  if (!is.data.frame(matches) || nrow(matches) == 0) {
    query$hasMoreMatches <- FALSE
    return(query)
  }

  for (field in setdiff(query$fields, colnames(matches))) {
    matches[[field]] <- NA
  }
  currentMatches <- matches[query$fields]
  # pubDate is only present if it was requested, so select the factor
  # columns by name instead of assuming a pubDate column exists.
  factorCols <- setdiff(colnames(currentMatches), "pubDate")
  if (length(factorCols) > 0) {
    currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
  }
  if ("pubDate" %in% colnames(currentMatches)) {
    currentMatches$pubDate <- as.Date(currentMatches$pubDate, format = "%Y-%m-%d")
  }

  res$collectedMatches <- if (offset == 0) {
    currentMatches
  } else {
    rbind(query$collectedMatches, currentMatches)
  }
  if (verbose) {
    cat(paste0("Retrieved page in ", res$meta$benchmark, '\n'))
  }

  # Carry over everything KorAPQuery() stored on the query object, so the
  # result can be passed to the next call unchanged.
  res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
  res$fields <- query$fields
  res$requestUrl <- query$requestUrl
  res$request <- query$request
  res$vc <- query$vc
  res$webUIRequestUrl <- query$webUIRequestUrl
  res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)

  res
}