Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 1 | #' Class KorAPQuery |
| 2 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 3 | #' This class provides methods to perform different kinds of queries on the KorAP API server. |
| 4 | #' \code{KorAPQuery} objects, which are typically created by the \code{\link{corpusQuery}} method, |
| 5 | #' represent the current state of a query to a KorAP server. |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 6 | #' |
| 7 | #' @include KorAPConnection.R |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 8 | #' @import httr |
| 9 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 10 | #' @include RKorAPClient-package.R |
Marc Kupietz | 5bbc9db | 2019-08-30 16:30:45 +0200 | [diff] [blame] | 11 | |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 12 | #' @export |
# S4 class holding the state of a query against a KorAP server.
# All slots are untyped ("ANY"); they are filled in by the initialize method.
KorAPQuery <- setClass(
  "KorAPQuery",
  slots = c(
    korapConnection  = "ANY",
    request          = "ANY",
    vc               = "ANY",
    totalResults     = "ANY",
    nextStartIndex   = "ANY",
    fields           = "ANY",
    requestUrl       = "ANY",
    webUIRequestUrl  = "ANY",
    apiResponse      = "ANY",
    collectedMatches = "ANY",
    hasMoreMatches   = "ANY"
  )
)
Marc Kupietz | 5bbc9db | 2019-08-30 16:30:45 +0200 | [diff] [blame] | 26 | |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 27 | #' Method initialize |
| 28 | #' |
| 29 | #' @rdname KorAPQuery-class |
| 30 | #' @param .Object … |
Marc Kupietz | b897218 | 2019-09-20 21:33:46 +0200 | [diff] [blame] | 31 | #' @param korapConnection KorAPConnection object |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 32 | #' @param request query part of the request URL |
| 33 | #' @param vc definition of a virtual corpus |
| 34 | #' @param totalResults number of hits the query has yielded |
| 35 | #' @param nextStartIndex at what index to start the next fetch of query results |
| 36 | #' @param fields what data / metadata fields should be collected |
| 37 | #' @param requestUrl complete URL of the API request |
| 38 | #' @param webUIRequestUrl URL of a web frontend request corresponding to the API request |
| 39 | #' @param apiResponse data-frame representation of the JSON response of the API request |
Marc Kupietz | 7776dec | 2019-09-27 16:59:02 +0200 | [diff] [blame] | 40 | #' @param hasMoreMatches logical that signals if more query results can be fetched |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 41 | #' @param collectedMatches matches already fetched from the KorAP-API-server |
Marc Kupietz | 97a1bca | 2019-10-04 22:52:09 +0200 | [diff] [blame] | 42 | #' |
| 43 | #' @importFrom tibble tibble |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 44 | #' @export |
setMethod(
  "initialize", "KorAPQuery",
  function(.Object,
           korapConnection = NULL,
           request = NULL,
           vc = "",
           totalResults = 0,
           nextStartIndex = 0,
           fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace",
                      "availability", "textClass", "snippet"),
           requestUrl = "",
           webUIRequestUrl = "",
           apiResponse = NULL,
           hasMoreMatches = FALSE,
           collectedMatches = NULL) {
    # Let the default initializer run first, then write every slot explicitly
    # so that slots not supplied by the caller receive the defaults above.
    .Object <- callNextMethod()
    .Object@korapConnection <- korapConnection
    .Object@request <- request
    .Object@vc <- vc
    .Object@totalResults <- totalResults
    .Object@nextStartIndex <- nextStartIndex
    .Object@fields <- fields
    .Object@requestUrl <- requestUrl
    .Object@webUIRequestUrl <- webUIRequestUrl
    .Object@apiResponse <- apiResponse
    .Object@hasMoreMatches <- hasMoreMatches
    .Object@collectedMatches <- collectedMatches
    .Object
  }
)
Marc Kupietz | 632cbd4 | 2019-09-06 16:04:51 +0200 | [diff] [blame] | 63 | |
# Generics dispatched on a KorAPConnection object (kco)
setGeneric(
  "corpusQuery",
  function(kco, ...) standardGeneric("corpusQuery")
)
setGeneric(
  "frequencyQuery",
  function(kco, ...) standardGeneric("frequencyQuery")
)
setGeneric(
  "collocationScoreQuery",
  function(kco, ...) standardGeneric("collocationScoreQuery")
)

# Generics dispatched on a KorAPQuery object (kqo)
setGeneric(
  "fetchAll",
  function(kqo, ...) standardGeneric("fetchAll")
)
setGeneric(
  "fetchNext",
  function(kqo, ...) standardGeneric("fetchNext")
)
setGeneric(
  "fetchRest",
  function(kqo, ...) standardGeneric("fetchRest")
)

# Upper bound on the number of matches requested per API call
maxResultsPerPage <- 50

## quiets concerns of R CMD check re: the .'s that appear in pipelines
if (getRversion() >= "2.15.1") {
  utils::globalVariables(".")
}
Marc Kupietz | 632cbd4 | 2019-09-06 16:04:51 +0200 | [diff] [blame] | 75 | |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 76 | #' \bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 77 | #' |
| 78 | #' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}) |
Akron | 5e13546 | 2019-09-27 16:31:38 +0200 | [diff] [blame] | 79 | #' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}. |
Marc Kupietz | 632cbd4 | 2019-09-06 16:04:51 +0200 | [diff] [blame] | 80 | #' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible. |
| 81 | #' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query. |
Marc Kupietz | 7776dec | 2019-09-27 16:59:02 +0200 | [diff] [blame] | 82 | #' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible). |
Marc Kupietz | 3c531f6 | 2019-09-13 12:17:24 +0200 | [diff] [blame] | 83 | #' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values). |
Akron | 5e13546 | 2019-09-27 16:31:38 +0200 | [diff] [blame] | 84 | #' @param fields (meta)data fields that will be fetched for every match. |
Marc Kupietz | 43a6ade | 2020-02-18 17:01:44 +0100 | [diff] [blame] | 85 | #' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented). |
Marc Kupietz | 25aebc3 | 2019-09-16 18:40:50 +0200 | [diff] [blame] | 86 | #' @param verbose print some info |
Marc Kupietz | 4de53ec | 2019-10-04 09:12:00 +0200 | [diff] [blame] | 87 | #' @param as.df return result as data frame instead of as S4 object? |
Marc Kupietz | 43a6ade | 2020-02-18 17:01:44 +0100 | [diff] [blame] | 88 | #' @param expand logical that decides if \code{query} and \code{vc} parameters are expanded to all of their combinations |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 89 | #' @return Depending on the \code{as.df} parameter, a table or a \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}). |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 90 | #' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl} |
Marc Kupietz | 43a6ade | 2020-02-18 17:01:44 +0100 | [diff] [blame] | 91 | #' Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed. |
Marc Kupietz | 632cbd4 | 2019-09-06 16:04:51 +0200 | [diff] [blame] | 92 | #' |
| 93 | #' @examples |
Marc Kupietz | 603491f | 2019-09-18 14:01:02 +0200 | [diff] [blame] | 94 | #' # Fetch metadata of every query hit for "Ameisenplage" and show a summary |
Marc Kupietz | 657d8e7 | 2020-02-25 18:31:50 +0100 | [diff] [blame] | 95 | #' \donttest{ |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 96 | #' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll() |
Marc Kupietz | 657d8e7 | 2020-02-25 18:31:50 +0100 | [diff] [blame] | 97 | #' } |
Marc Kupietz | 3c531f6 | 2019-09-13 12:17:24 +0200 | [diff] [blame] | 98 | #' |
Marc Kupietz | 603491f | 2019-09-18 14:01:02 +0200 | [diff] [blame] | 99 | #' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus |
| 100 | #' # and show the number of query hits (but don't fetch them). |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 101 | #' |
| 102 | #' new("KorAPConnection", verbose = TRUE) %>% |
| 103 | #' corpusQuery(KorAPUrl = |
| 104 | #' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp") |
Marc Kupietz | 3c531f6 | 2019-09-13 12:17:24 +0200 | [diff] [blame] | 105 | #' |
Marc Kupietz | 603491f | 2019-09-18 14:01:02 +0200 | [diff] [blame] | 106 | #' # Plot the time/frequency curve of "Ameisenplage" |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 107 | #' \donttest{ |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 108 | #' new("KorAPConnection", verbose=TRUE) %>% |
| 109 | #' { . ->> kco } %>% |
| 110 | #' corpusQuery("Ameisenplage") %>% |
| 111 | #' fetchAll() %>% |
| 112 | #' slot("collectedMatches") %>% |
| 113 | #' mutate(year = lubridate::year(pubDate)) %>% |
Marc Kupietz | 19e2ebd | 2019-10-07 11:45:30 +0200 | [diff] [blame] | 114 | #' dplyr::select(year) %>% |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 115 | #' group_by(year) %>% |
Marc Kupietz | cb3c59e | 2020-06-02 10:10:43 +0200 | [diff] [blame] | 116 | #' summarise(Count = dplyr::n()) %>% |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 117 | #' mutate(Freq = mapply(function(f, y) |
| 118 | #' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>% |
Marc Kupietz | 19e2ebd | 2019-10-07 11:45:30 +0200 | [diff] [blame] | 119 | #' dplyr::select(-Count) %>% |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 120 | #' complete(year = min(year):max(year), fill = list(Freq = 0)) %>% |
| 121 | #' plot(type = "l") |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 122 | #' } |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 123 | #' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}} |
Marc Kupietz | 632cbd4 | 2019-09-06 16:04:51 +0200 | [diff] [blame] | 124 | #' |
| 125 | #' @references |
| 126 | #' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026} |
| 127 | #' |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 128 | #' @aliases corpusQuery |
Marc Kupietz | 632cbd4 | 2019-09-06 16:04:51 +0200 | [diff] [blame] | 129 | #' @export |
setMethod("corpusQuery", "KorAPConnection",
  function(kco,
           query = if (missing(KorAPUrl))
             stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
           else
             httr::parse_url(KorAPUrl)$query$q,
           vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
           KorAPUrl,
           metadataOnly = TRUE,
           ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
           fields = c(
             "corpusSigle",
             "textSigle",
             "pubDate",
             "pubPlace",
             "availability",
             "textClass",
             "snippet"
           ),
           accessRewriteFatal = TRUE,
           verbose = kco@verbose,
           expand = length(vc) != length(query),
           as.df = FALSE) {
    # A KorAPUrl that lacks the cq or ql parameter makes the respective
    # default evaluate to NULL; fall back to the documented defaults so that
    # the comparisons and URL construction below do not fail.
    if (is.null(vc)) {
      vc <- ""
    }
    if (is.null(ql)) {
      ql <- "poliqarp"
    }
    if (length(query) == 0) {
      stop("No query given: KorAPUrl does not contain a q parameter.", call. = FALSE)
    }

    # Vectorised case: run one count-only query per (query, vc) pair and
    # return the stacked per-pair data frames.
    if (length(query) > 1 || length(vc) > 1) {
      grid <- if (expand) {
        expand_grid(query = query, vc = vc)
      } else {
        tibble(query = query, vc = vc)
      }
      return(
        do.call(rbind,
                Map(function(q, cq) corpusQuery(kco, query = q, vc = cq, ql = ql,
                                                # forward the caller's settings so the
                                                # sub-queries behave like a scalar call
                                                metadataOnly = metadataOnly,
                                                fields = fields,
                                                accessRewriteFatal = accessRewriteFatal,
                                                verbose = verbose, as.df = TRUE),
                    grid$query, grid$vc)) %>%
          remove_rownames()
      )
    }

    # Scalar case: build the request and ask the server for the hit count only.
    contentFields <- c("snippet")
    if (metadataOnly) {
      fields <- fields[!fields %in% contentFields]
    }
    request <- paste0(
      '?q=',
      URLencode(query, reserved = TRUE),
      if (vc != '') paste0('&cq=', URLencode(vc, reserved = TRUE)) else '',
      '&ql=',
      ql
    )
    webUIRequestUrl <- paste0(kco@KorAPUrl, request)
    requestUrl <- paste0(
      kco@apiUrl,
      'search',
      request,
      '&fields=',
      paste(fields, collapse = ","),
      if (metadataOnly) '&access-rewrite-disabled=true' else ''
    )
    log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep = "")
    # count=0: only the meta data (total number of results) is needed here
    res <- apiCall(kco, paste0(requestUrl, '&count=0'))
    if (!is.null(res$meta$cached)) {
      log.info(verbose, " [cached]\n")
    } else {
      log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
    }

    if (as.df) {
      data.frame(
        query = query,
        totalResults = res$meta$totalResults,
        vc = vc,
        webUIRequestUrl = webUIRequestUrl,
        stringsAsFactors = FALSE
      )
    } else {
      KorAPQuery(
        korapConnection = kco,
        nextStartIndex = 0,
        fields = fields,
        requestUrl = requestUrl,
        request = request,
        totalResults = res$meta$totalResults,
        vc = vc,
        apiResponse = res,
        webUIRequestUrl = webUIRequestUrl,
        hasMoreMatches = (res$meta$totalResults > 0)
      )
    }
  })
Marc Kupietz | 5bbc9db | 2019-08-30 16:30:45 +0200 | [diff] [blame] | 212 | |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 213 | #' Fetch the next bunch of results of a KorAP query. |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 214 | #' |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 215 | #' \bold{\code{fetchNext}} fetches the next bunch of results of a KorAP query. |
| 216 | #' |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 217 | #' @param kqo object obtained from \code{\link{corpusQuery}} |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 218 | #' @param offset start offset for query results to fetch |
| 219 | #' @param maxFetch maximum number of query results to fetch |
Marc Kupietz | 25aebc3 | 2019-09-16 18:40:50 +0200 | [diff] [blame] | 220 | #' @param verbose print progress information if true |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 221 | #' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches} |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 222 | #' |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 223 | #' @examples |
Marc Kupietz | 657d8e7 | 2020-02-25 18:31:50 +0100 | [diff] [blame] | 224 | #' \donttest{q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchNext() |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 225 | #' q@collectedMatches |
Marc Kupietz | 657d8e7 | 2020-02-25 18:31:50 +0100 | [diff] [blame] | 226 | #' } |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 227 | #' |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 228 | #' @references |
| 229 | #' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026} |
| 230 | #' |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 231 | #' @aliases fetchNext |
| 232 | #' @rdname KorAPQuery-class |
Marc Kupietz | cb3c59e | 2020-06-02 10:10:43 +0200 | [diff] [blame] | 233 | #' @importFrom dplyr rowwise bind_rows select summarise n |
Marc Kupietz | 632cbd4 | 2019-09-06 16:04:51 +0200 | [diff] [blame] | 234 | #' @export |
setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
  # Nothing to do if the query has no hits or everything was already fetched.
  if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
    return(kqo)
  }

  page <- 1
  results <- 0
  pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check

  # Collect the pages in a list and bind them once after the loop instead of
  # copying the growing data frame on every iteration.
  pages <- if (is.list(kqo@collectedMatches)) list(kqo@collectedMatches) else list()

  repeat {
    # maxFetch = NA means "no limit": always request a full page then.
    count <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
    res <- apiCall(
      kqo@korapConnection,
      paste0(
        kqo@requestUrl,
        '&count=', count,
        # scientific = FALSE: otherwise e.g. 100000 is rendered as "1e+05"
        '&offset=', format(offset + results, scientific = FALSE)
      )
    )
    if (res$meta$totalResults == 0) {
      return(kqo)
    }
    # Make sure every requested field exists as a column, even if the server
    # did not return it for this page.
    for (field in kqo@fields) {
      if (!field %in% colnames(res$matches)) {
        res$matches[, field] <- NA
      }
    }
    currentMatches <-
      res$matches %>%
      dplyr::select(kqo@fields)
    if ("pubDate" %in% kqo@fields) {
      currentMatches$pubDate <- currentMatches$pubDate %>% as.Date(format = "%Y-%m-%d")
      factorCols <- currentMatches %>% select(-pubDate) %>% colnames()
    } else {
      factorCols <- colnames(currentMatches)
    }
    currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
    pages[[length(pages) + 1]] <- currentMatches
    if (verbose) {
      cat(paste0("Retrieved page ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
    }
    page <- page + 1
    results <- results + res$meta$itemsPerPage
    if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
      break
    }
  }

  collectedMatches <- do.call(rbind, pages)
  nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, res$meta$totalResults)
  KorAPQuery(nextStartIndex = nextStartIndex,
             korapConnection = kqo@korapConnection,
             fields = kqo@fields,
             requestUrl = kqo@requestUrl,
             request = kqo@request,
             totalResults = res$meta$totalResults,
             vc = kqo@vc,
             webUIRequestUrl = kqo@webUIRequestUrl,
             hasMoreMatches = (res$meta$totalResults > nextStartIndex),
             apiResponse = res,
             collectedMatches = collectedMatches)
})
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 290 | |
| 291 | #' Fetch all results of a KorAP query. |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 292 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 293 | #' \bold{\code{fetchAll}} fetches all results of a KorAP query. |
| 294 | #' |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 295 | #' @examples |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 296 | #' \donttest{ |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 297 | #' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll() |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 298 | #' q@collectedMatches |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 299 | #' } |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 300 | #' |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 301 | #' @aliases fetchAll |
| 302 | #' @rdname KorAPQuery-class |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 303 | #' @export |
setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
  # Fetching restarts at offset 0, so drop any matches collected by earlier
  # fetch calls; otherwise those rows would appear twice in the result.
  kqo@collectedMatches <- NULL
  fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose)
})
| 307 | |
| 308 | #' Fetches the remaining results of a KorAP query. |
| 309 | #' |
| 310 | #' @examples |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 311 | #' \donttest{ |
| 312 | #' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchRest() |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 313 | #' q@collectedMatches |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 314 | #' } |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 315 | #' |
| 316 | #' @aliases fetchRest |
| 317 | #' @rdname KorAPQuery-class |
| 318 | #' @export |
setMethod(
  "fetchRest", "KorAPQuery",
  function(kqo, verbose = kqo@korapConnection@verbose) {
    # Continue at the query's current start index, without a fetch limit.
    fetchNext(kqo, maxFetch = NA, verbose = verbose)
  }
)
| 322 | |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 323 | #' Query relative frequency of search term(s) |
| 324 | #' |
| 325 | #' \bold{\code{frequencyQuery}} combines \code{\link{corpusQuery}}, \code{\link{corpusStats}} and |
| 326 | #' \code{\link{ci}} to compute a table with the relative frequencies and |
| 327 | #' confidence intervals of one or multiple search terms across one or multiple |
| 328 | #' virtual corpora. |
| 329 | #' |
| 330 | #' @aliases frequencyQuery |
| 331 | #' @rdname KorAPQuery-class |
| 332 | #' @examples |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 333 | #' \donttest{ |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 334 | #' new("KorAPConnection", verbose = TRUE) %>% |
| 335 | #' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003)) |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 336 | #' } |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 337 | #' |
| 338 | #' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")} |
| 339 | #' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}. |
Marc Kupietz | 43a6ade | 2020-02-18 17:01:44 +0100 | [diff] [blame] | 340 | #' @param conf.level confidence level of the returned confidence interval (passed through \code{\link{ci}} to \code{\link{prop.test}}). |
Marc Kupietz | 71d6e05 | 2019-11-22 18:42:10 +0100 | [diff] [blame] | 341 | #' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If \code{as.alternatives} is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies. |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 342 | #' @export |
setMethod("frequencyQuery", "KorAPConnection",
  function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
    # Hit counts per (query, vc) combination
    hits <- corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...)

    withTotals <- if (as.alternatives) {
      # Alternatives: the total is the sum of all hits within the same vc
      hits %>%
        group_by(vc) %>%
        mutate(total = sum(totalResults))
    } else {
      # Otherwise the total is the token count of the respective vc
      # (vc refers to the column of hits here, not to the function argument)
      hits %>%
        mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
    }

    withTotals %>%
      ci(conf.level = conf.level)
  })
| 355 | |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 356 | #' format() |
| 357 | #' @rdname KorAPQuery-class |
| 358 | #' @param x KorAPQuery object |
| 359 | #' @param ... further arguments passed to or from other methods |
| 360 | #' @export |
format.KorAPQuery <- function(x, ...) {
  # Prints a human-readable overview of the query state to the console.
  rule <- "=============================================================================================================="

  cat("<KorAPQuery>\n")
  parsed <- parse_url(x@request)
  cat(" Query: ", parsed$query$q, "\n")

  # The virtual corpus line is only shown if one was specified
  virtualCorpus <- parsed$query$cq
  if (!is.null(virtualCorpus) && virtualCorpus != "") {
    cat(" Virtual corpus: ", virtualCorpus, "\n")
  }

  # Summarise the matches fetched so far, if any
  if (!is.null(x@collectedMatches)) {
    cat(rule, "\n")
    print(summary(x@collectedMatches))
    cat(rule, "\n")
  }

  cat(" Total results: ", x@totalResults, "\n")
  cat(" Fetched results: ", x@nextStartIndex, "\n")
}
| 377 | |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 378 | #' show() |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 379 | #' |
Marc Kupietz | e95108e | 2019-09-18 13:23:58 +0200 | [diff] [blame] | 380 | #' @rdname KorAPQuery-class |
| 381 | #' @param object KorAPQuery object |
Marc Kupietz | 62da2b5 | 2019-09-12 17:43:34 +0200 | [diff] [blame] | 382 | #' @export |
# Printing a KorAPQuery at the console delegates to its format() method.
setMethod(
  "show", "KorAPQuery",
  function(object) format(object)
)
Marc Kupietz | 006b47c | 2021-01-13 17:00:59 +0100 | [diff] [blame] | 386 | |
| 387 | |
Marc Kupietz | 006b47c | 2021-01-13 17:00:59 +0100 | [diff] [blame] | 388 | |
# Wraps each element of w as [tt/l=<w>], i.e. the lemmatized query form
# x -> [tt/l=x].
lemmatizeWordQuery <- function(w) {
  prefix <- "[tt/l="
  suffix <- "]"
  paste0(prefix, w, suffix)
}
| 392 | |
| 393 | #' Query frequencies of a node and a collocate and calculate collocation association scores |
| 394 | #' |
| 395 | #' \bold{\code{collocationScoreQuery}} computes various collocation association scores |
Marc Kupietz | e203832 | 2021-03-04 18:24:02 +0100 | [diff] [blame] | 396 | #' based on \code{\link{frequencyQuery}}s for a target word and a collocate. |
Marc Kupietz | 006b47c | 2021-01-13 17:00:59 +0100 | [diff] [blame] | 397 | #' |
| 398 | #' @aliases collocationScoreQuery |
| 399 | #' @rdname KorAPQuery-class |
| 400 | #' |
| 401 | #' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")} |
| 402 | #' @param node target word |
| 403 | #' @param collocate collocate of target word |
| 404 | #' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible. |
| 405 | #' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x] |
| 406 | #' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x] |
| 407 | #' @param leftContextSize size of the left context window |
| 408 | #' @param rightContextSize size of the right context window |
Marc Kupietz | e203832 | 2021-03-04 18:24:02 +0100 | [diff] [blame] | 409 | #' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi} |
| 410 | #' @param smoothingConstant smoothing constant will be added to all observed values |
| 411 | #' |
| 412 | #' @return tibble with query KorAP web request URL, all observed values and association scores |
Marc Kupietz | 006b47c | 2021-01-13 17:00:59 +0100 | [diff] [blame] | 413 | #' |
| 414 | #' @examples |
| 415 | #' \donttest{ |
| 416 | #' new("KorAPConnection", verbose = TRUE) %>% |
| 417 | #' collocationScoreQuery("Grund", "triftiger") |
| 418 | #' } |
| 419 | #' |
| 420 | #' \donttest{ |
Marc Kupietz | e203832 | 2021-03-04 18:24:02 +0100 | [diff] [blame] | 421 | #' new("KorAPConnection", verbose = TRUE) %>% |
| 422 | #' collocationScoreQuery("Grund", c("guter", "triftiger"), |
| 423 | #' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) ) |
| 424 | #' } |
| 425 | #' |
| 426 | #' \donttest{ |
Marc Kupietz | 006b47c | 2021-01-13 17:00:59 +0100 | [diff] [blame] | 427 | #' library(highcharter) |
Marc Kupietz | e203832 | 2021-03-04 18:24:02 +0100 | [diff] [blame] | 428 | #' library(tidyr) |
Marc Kupietz | 006b47c | 2021-01-13 17:00:59 +0100 | [diff] [blame] | 429 | #' new("KorAPConnection", verbose = TRUE) %>% |
| 430 | #' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)), |
| 431 | #' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>% |
Marc Kupietz | e203832 | 2021-03-04 18:24:02 +0100 | [diff] [blame] | 432 | #' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>% |
| 433 | #' hchart(type="spline", hcaes(label, score, group=measure)) %>% |
| 434 | #' hc_add_onclick_korap_search() |
Marc Kupietz | 006b47c | 2021-01-13 17:00:59 +0100 | [diff] [blame] | 435 | #' } |
| 436 | #' |
| 437 | #' @importFrom tidyr pivot_longer |
| 438 | #' @export |
setMethod("collocationScoreQuery", "KorAPConnection",
  function(kco,
           node,
           collocate,
           vc = "",
           lemmatizeNodeQuery = FALSE,
           lemmatizeCollocateQuery = FALSE,
           leftContextSize = 5,
           rightContextSize = 5,
           scoreFunctions = defaultAssociationScoreFunctions(),
           smoothingConstant = .5
  ) {
    # Dummy bindings for the column names that are referenced via
    # non-standard evaluation inside tibble() below; they only exist to
    # silence the "no visible binding for global variable" NOTE, see
    # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
    O1 <- O2 <- O <- N <- E <- w <- 0

    if (leftContextSize <= 0 && rightContextSize <= 0) {
      stop("At least one of leftContextSize and rightContextSize must be > 0",
           call. = FALSE)
    }

    if (lemmatizeNodeQuery) {
      node <- lemmatizeWordQuery(node)
    }

    if (lemmatizeCollocateQuery) {
      collocate <- lemmatizeWordQuery(collocate)
    }

    # Build the co-occurrence query: an alternative for the collocate
    # preceding the node (left window) and/or one for the collocate following
    # the node (right window), joined by " | " when both windows are used.
    # A window of size k allows up to k - 1 arbitrary tokens in between.
    query <- ""

    if (leftContextSize > 0) {
      query <-
        paste0(collocate,
               if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
               node,
               if (rightContextSize > 0) " | ")
    }

    if (rightContextSize > 0) {
      query <-
        paste0(query, node,
               if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
    }

    # Run each of the three distinct frequency queries exactly once and reuse
    # the results, instead of re-issuing identical queries per output column.
    cooccurrenceFrequencies <- frequencyQuery(kco, query, vc)
    nodeFrequencies <- frequencyQuery(kco, node, vc)
    collocateFrequencies <- frequencyQuery(kco, collocate, vc)

    tibble(
      node = node,
      collocate = collocate,
      label = queryStringToLabel(vc),
      vc = vc,
      webUIRequestUrl = cooccurrenceFrequencies$webUIRequestUrl,
      w = leftContextSize + rightContextSize,
      leftContextSize,
      rightContextSize,
      # smoothingConstant is added to every observed value (N, O, O1, O2)
      N = nodeFrequencies$total + smoothingConstant,
      O = as.double(cooccurrenceFrequencies$totalResults) + smoothingConstant,
      O1 = nodeFrequencies$totalResults + smoothingConstant,
      O2 = collocateFrequencies$totalResults + smoothingConstant,
      # expected co-occurrence frequency, computed from the columns above
      E = w * as.double(O1) * O2 / N
    ) %>%
      # Append one column per entry of scoreFunctions (named after the list
      # entry); each function is applied row-wise via mapply to
      # (O1, O2, O, N, E, window_size).
      mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))

  })