blob: 027d119b5299089e89c478a59b63043d3a744e55 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietz43a6ade2020-02-18 17:01:44 +01003#' \code{KorAPQuery} objects represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
5#'
6#' @include KorAPConnection.R
Marc Kupietze95108e2019-09-18 13:23:58 +02007#' @import httr
8#'
Marc Kupietz4de53ec2019-10-04 09:12:00 +02009#' @include RKorAPClient.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020010
Marc Kupietze95108e2019-09-18 13:23:58 +020011#' @export
12KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020013 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020014 "request",
15 "vc",
16 "totalResults",
17 "nextStartIndex",
18 "fields",
19 "requestUrl",
20 "webUIRequestUrl",
21 "apiResponse",
22 "collectedMatches",
23 "hasMoreMatches"
24))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020025
Marc Kupietze95108e2019-09-18 13:23:58 +020026#' Method initialize
27#'
28#' @rdname KorAPQuery-class
29#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
44setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020045 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietze95108e2019-09-18 13:23:58 +020046 "availability", "textClass", "snippet"),
47 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
48 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020049 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020050 .Object@request = request
51 .Object@vc = vc
52 .Object@totalResults = totalResults
53 .Object@nextStartIndex = nextStartIndex
54 .Object@fields = fields
55 .Object@requestUrl = requestUrl
56 .Object@webUIRequestUrl = webUIRequestUrl
57 .Object@apiResponse = apiResponse
58 .Object@hasMoreMatches = hasMoreMatches
59 .Object@collectedMatches = collectedMatches
60 .Object
61 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020062
Marc Kupietze95108e2019-09-18 13:23:58 +020063setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
64setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
65setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
66setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
Marc Kupietz3f575282019-10-04 14:46:04 +020067setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
Marc Kupietz006b47c2021-01-13 17:00:59 +010068setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
69
Marc Kupietze95108e2019-09-18 13:23:58 +020070
71maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020072
Marc Kupietz4de53ec2019-10-04 09:12:00 +020073## quiets concerns of R CMD check re: the .'s that appear in pipelines
74if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020075
Marc Kupietze95108e2019-09-18 13:23:58 +020076#' Method corpusQuery
77#'
78#' Perform a corpus query via a connection to a KorAP-API-server.
79#'
80#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
Akron5e135462019-09-27 16:31:38 +020081#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz632cbd42019-09-06 16:04:51 +020082#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
83#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietz7776dec2019-09-27 16:59:02 +020084#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020085#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020086#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +010087#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020088#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +020089#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz43a6ade2020-02-18 17:01:44 +010090#' @param expand logical that decides if \code{query} and \code{vc} parameters are expanded to all of their combinations
Marc Kupietz3f575282019-10-04 14:46:04 +020091#' @return Depending on the \code{as.df} parameter, a table or a \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
Marc Kupietze95108e2019-09-18 13:23:58 +020092#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
Marc Kupietz43a6ade2020-02-18 17:01:44 +010093#' Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020094#'
95#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +020096#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz657d8e72020-02-25 18:31:50 +010097#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +020098#' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +010099#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200100#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200101#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
102#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200103#'
104#' new("KorAPConnection", verbose = TRUE) %>%
105#' corpusQuery(KorAPUrl =
106#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +0200107#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200108#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz05b22772020-02-18 21:58:42 +0100109#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200110#' new("KorAPConnection", verbose=TRUE) %>%
111#' { . ->> kco } %>%
112#' corpusQuery("Ameisenplage") %>%
113#' fetchAll() %>%
114#' slot("collectedMatches") %>%
115#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200116#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200117#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200118#' summarise(Count = dplyr::n()) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200119#' mutate(Freq = mapply(function(f, y)
120#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200121#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200122#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
123#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100124#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200125#' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +0200126#'
127#' @references
128#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
129#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200130#' @aliases corpusQuery
Marc Kupietz632cbd42019-09-06 16:04:51 +0200131#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200132setMethod("corpusQuery", "KorAPConnection",
Marc Kupietza96537f2019-11-09 23:07:44 +0100133 function(kco,
134 query = if (missing(KorAPUrl))
135 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
136 else
137 httr::parse_url(KorAPUrl)$query$q,
138 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
139 KorAPUrl,
140 metadataOnly = TRUE,
141 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
142 fields = c(
143 "corpusSigle",
144 "textSigle",
145 "pubDate",
146 "pubPlace",
147 "availability",
148 "textClass",
149 "snippet"
150 ),
151 accessRewriteFatal = TRUE,
152 verbose = kco@verbose,
153 expand = length(vc) != length(query),
154 as.df = FALSE) {
155 if (length(query) > 1 || length(vc) > 1) {
156
157 grid <- {
158 if (expand)
159 expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc) }
Marc Kupietz3f575282019-10-04 14:46:04 +0200160 return(
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200161 do.call(rbind,
Marc Kupietz3f575282019-10-04 14:46:04 +0200162 Map(function(q, cq) corpusQuery(kco, query=q, vc=cq, ql=ql,
163 verbose=verbose, as.df = TRUE), grid$query, grid$vc)) %>%
164 remove_rownames()
Marc Kupietza96537f2019-11-09 23:07:44 +0100165 )
166 } else {
167 contentFields <- c("snippet")
168 if (metadataOnly) {
169 fields <- fields[!fields %in% contentFields]
170 }
171 request <-
172 paste0('?q=',
173 URLencode(query, reserved = TRUE),
174 if (vc != '') paste0('&cq=', URLencode(vc, reserved = TRUE)) else '', '&ql=', ql)
175 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
176 requestUrl <- paste0(
177 kco@apiUrl,
178 'search',
179 request,
180 '&fields=',
181 paste(fields, collapse = ","),
182 if (metadataOnly) '&access-rewrite-disabled=true' else ''
183 )
184 log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
185 "")
186 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietzf5769b62019-12-13 09:19:45 +0100187 if(!is.null(res$meta$cached))
188 log.info(verbose, " [cached]\n")
189 else
190 log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
Marc Kupietza96537f2019-11-09 23:07:44 +0100191 if (as.df)
192 data.frame(
193 query = query,
194 totalResults = res$meta$totalResults,
195 vc = vc,
196 webUIRequestUrl = webUIRequestUrl,
197 stringsAsFactors = FALSE
198 )
199 else
200 KorAPQuery(
201 korapConnection = kco,
202 nextStartIndex = 0,
203 fields = fields,
204 requestUrl = requestUrl,
205 request = request,
206 totalResults = res$meta$totalResults,
207 vc = vc,
208 apiResponse = res,
209 webUIRequestUrl = webUIRequestUrl,
210 hasMoreMatches = (res$meta$totalResults > 0),
211 )
212 }
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200213 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200214
Marc Kupietz62da2b52019-09-12 17:43:34 +0200215#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200216#'
Marc Kupietz3f575282019-10-04 14:46:04 +0200217#' \bold{\code{fetchNext}} fetches the next bunch of results of a KorAP query.
218#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200219#' @param kqo object obtained from \code{\link{corpusQuery}}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200220#' @param offset start offset for query results to fetch
221#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200222#' @param verbose print progress information if true
Marc Kupietze95108e2019-09-18 13:23:58 +0200223#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200224#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100225#' @examples
Marc Kupietz657d8e72020-02-25 18:31:50 +0100226#' \donttest{q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100227#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100228#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100229#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200230#' @references
231#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
232#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200233#' @aliases fetchNext
234#' @rdname KorAPQuery-class
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200235#' @importFrom dplyr rowwise bind_rows select summarise n
Marc Kupietz632cbd42019-09-06 16:04:51 +0200236#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200237setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200238 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
239 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200240 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200241
242 page <- 1
243 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200244 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200245 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200246
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200247 repeat {
Marc Kupietza96537f2019-11-09 23:07:44 +0100248 res <- apiCall(kqo@korapConnection, paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietze95108e2019-09-18 13:23:58 +0200249 if (res$meta$totalResults == 0) { return(kqo) }
250 for (field in kqo@fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200251 if (!field %in% colnames(res$matches)) {
252 res$matches[, field] <- NA
253 }
254 }
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200255 currentMatches <-
Marc Kupietzf75ab0b2020-06-02 12:31:18 +0200256 res$matches %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200257 dplyr::select(kqo@fields)
Marc Kupietz36d12d92019-09-27 18:13:27 +0200258 if ("pubDate" %in% kqo@fields) {
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200259 currentMatches$pubDate <- currentMatches$pubDate %>% as.Date(format = "%Y-%m-%d")
260 factorCols <- currentMatches %>% select(-pubDate) %>% colnames()
Marc Kupietz36d12d92019-09-27 18:13:27 +0200261 } else {
262 factorCols <- colnames(currentMatches)
263 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200264 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200265 if (!is.list(collectedMatches)) {
266 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200267 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200268 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200269 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200270 if (verbose) {
Marc Kupietzf6f71312019-09-23 18:35:27 +0200271 cat(paste0("Retrieved page ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200272 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200273 page <- page + 1
274 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200275 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200276 break
277 }
278 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200279 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, res$meta$totalResults)
280 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200281 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200282 fields = kqo@fields,
283 requestUrl = kqo@requestUrl,
284 request = kqo@request,
285 totalResults = res$meta$totalResults,
286 vc = kqo@vc,
287 webUIRequestUrl = kqo@webUIRequestUrl,
288 hasMoreMatches = (res$meta$totalResults > nextStartIndex),
289 apiResponse = res,
290 collectedMatches = collectedMatches)
291})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200292
293#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200294#'
295#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100296#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200297#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200298#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100299#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200300#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200301#' @aliases fetchAll
302#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200303#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200304setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200305 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
306})
307
308#' Fetches the remaining results of a KorAP query.
309#'
310#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100311#' \donttest{
312#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200313#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100314#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200315#'
316#' @aliases fetchRest
317#' @rdname KorAPQuery-class
318#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200319setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200320 return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
321})
322
Marc Kupietz3f575282019-10-04 14:46:04 +0200323#' Query relative frequency of search term(s)
324#'
325#' \bold{\code{frequencyQuery}} combines \code{\link{corpusQuery}}, \code{\link{corpusStats}} and
326#' \code{\link{ci}} to compute a table with the relative frequencies and
327#' confidence intervals of one ore multiple search terms across one or multiple
328#' virtual corpora.
329#'
330#' @aliases frequencyQuery
331#' @rdname KorAPQuery-class
332#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100333#' \donttest{
Marc Kupietz3f575282019-10-04 14:46:04 +0200334#' new("KorAPConnection", verbose = TRUE) %>%
335#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100336#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200337#'
338#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
339#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100340#' @param conf.level confidence level of the returned confidence interval (passed through \code{\link{ci}} to \code{\link{prop.test}}).
Marc Kupietz71d6e052019-11-22 18:42:10 +0100341#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If \code{as.alternatives} is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz3f575282019-10-04 14:46:04 +0200342#' @export
343setMethod("frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100344 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
345 (if (as.alternatives) {
346 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
347 group_by(vc) %>%
348 mutate(total = sum(totalResults))
349 } else {
350 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
351 mutate(total = corpusStats(kco, vc=vc, as.df=TRUE)$tokens)
352 } ) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200353 ci(conf.level = conf.level)
Marc Kupietz3f575282019-10-04 14:46:04 +0200354})
355
Marc Kupietze95108e2019-09-18 13:23:58 +0200356#´ format()
357#' @rdname KorAPQuery-class
358#' @param x KorAPQuery object
359#' @param ... further arguments passed to or from other methods
360#' @export
361format.KorAPQuery <- function(x, ...) {
362 cat("<KorAPQuery>\n")
363 q <- x
364 aurl = parse_url(q@request)
Marc Kupietz0d4c9092020-03-23 09:02:30 +0100365 cat(" Query: ", aurl$query$q, "\n")
366 if (!is.null(aurl$query$cq) && aurl$query$cq != "") {
367 cat(" Virtual corpus: ", aurl$query$cq, "\n")
Marc Kupietze95108e2019-09-18 13:23:58 +0200368 }
369 if (!is.null(q@collectedMatches)) {
370 cat("==============================================================================================================", "\n")
371 print(summary(q@collectedMatches))
372 cat("==============================================================================================================", "\n")
373 }
374 cat(" Total results: ", q@totalResults, "\n")
375 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200376}
377
Marc Kupietze95108e2019-09-18 13:23:58 +0200378#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200379#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200380#' @rdname KorAPQuery-class
381#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200382#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200383setMethod("show", "KorAPQuery", function(object) {
384 format(object)
385})
Marc Kupietz006b47c2021-01-13 17:00:59 +0100386
387
388#'
389#' @importFrom dplyr if_else
390#'
391ca_ll <- function(w1, w2, w12, n, true_window_size) {
392 r1 = as.double(w1) * true_window_size
393 r2 = as.double(n) - r1
394 c1 = w2
395 c2 = n - c1
396 o11 = w12
397 o12 = r1 - o11
398 o21 = c1 - w12
399 o22 = r2 - o21
400 e11 = r1 * c1 / n
401 e12 = r1 * c2 / n
402 e21 = r2 * c1 / n
403 e22 = r2 * c2 / n
404 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
405 + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
406 + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
407 + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
408}
409
410lemmatizeWordQuery <- function(w) {
411 paste0('[tt/l=', w, ']')
412}
413
414#' Query frequencies of a node and a collocate and calculate collocation association scores
415#'
416#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
417#' based on \code{\link{frequencyQuery}}s for a target worf and a collocate.
418#'
419#' @aliases collocationScoreQuery
420#' @rdname KorAPQuery-class
421#'
422#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
423#' @param node target word
424#' @param collocate collocate of target word
425#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
426#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
427#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
428#' @param leftContextSize size of the left context window
429#' @param rightContextSize size of the right context window
430#'
431#' @examples
432#' \donttest{
433#' new("KorAPConnection", verbose = TRUE) %>%
434#' collocationScoreQuery("Grund", "triftiger")
435#' }
436#'
437#' \donttest{
438#' library(highcharter)
439#' new("KorAPConnection", verbose = TRUE) %>%
440#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
441#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
442#' hchart(type="spline", hcaes(label, score, group=measure))
443#' }
444#'
445#' @importFrom tidyr pivot_longer
446#' @export
447setMethod("collocationScoreQuery", "KorAPConnection",
448 function(kco,
449 node,
450 collocate,
451 vc = "",
452 lemmatizeNodeQuery = FALSE,
453 lemmatizeCollocateQuery = FALSE,
454 leftContextSize = 5,
455 rightContextSize = 5) {
456 if (leftContextSize <= 0 && rightContextSize <= 0) {
457 stop("At least one of leftContextSize and rightContextSize must be > 0",
458 call. = FALSE)
459 }
460
461 if (lemmatizeNodeQuery) {
462 node <- lemmatizeWordQuery(node)
463 }
464
465 if (lemmatizeCollocateQuery) {
466 collocate <- lemmatizeWordQuery(collocate)
467 }
468
469 query <- ""
470
471 if (leftContextSize > 0) {
472 query <-
473 paste0(collocate, " []{0,", leftContextSize - 1, "} ", node,
474 if (rightContextSize > 0) " | " else "")
475 }
476
477 if (rightContextSize > 0) {
478 query <-
479 paste0(query, node, " []{0,", rightContextSize - 1, "} ", collocate)
480 }
481
482 w <- leftContextSize + rightContextSize
483
484 tibble(
485 node = node,
486 collocate = collocate,
487 label = queryStringToLabel(vc),
488 vc = vc,
489
490 O = as.double(frequencyQuery(kco, query, vc)$totalResults),
491 webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
492 fx = frequencyQuery(kco, node, vc)$totalResults,
493 fy = frequencyQuery(kco, collocate, vc)$totalResults,
494 N = frequencyQuery(kco, node, vc)$total,
495 E = w * as.double(fx) * fy / N,
496 MI = log2(O / E),
497 MI2 = log2(O ^ 2 / E),
498 MI3 = log2(O ^ 3 / E),
499 logDice = 14 + log2(2 * O / (w * fy + fx)),
500 llr = ca_ll(fx, fy, O, N, w)
501 ) %>%
502 tidyr::pivot_longer(c(MI, MI2, MI3, logDice, llr),
503 names_to = "measure",
504 values_to = "score")
505 })