blob: fa8483981f8c4d33b11ba58735a17ec9ab9c1ce9 [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietz43a6ade2020-02-18 17:01:44 +01003#' \code{KorAPQuery} objects represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02004#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
5#'
6#' @include KorAPConnection.R
Marc Kupietze95108e2019-09-18 13:23:58 +02007#' @import httr
8#'
Marc Kupietz4de53ec2019-10-04 09:12:00 +02009#' @include RKorAPClient.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020010
Marc Kupietze95108e2019-09-18 13:23:58 +020011#' @export
12KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020013 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020014 "request",
15 "vc",
16 "totalResults",
17 "nextStartIndex",
18 "fields",
19 "requestUrl",
20 "webUIRequestUrl",
21 "apiResponse",
22 "collectedMatches",
23 "hasMoreMatches"
24))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020025
Marc Kupietze95108e2019-09-18 13:23:58 +020026#' Method initialize
27#'
28#' @rdname KorAPQuery-class
29#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020030#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020031#' @param request query part of the request URL
32#' @param vc definition of a virtual corpus
33#' @param totalResults number of hits the query has yielded
34#' @param nextStartIndex at what index to start the next fetch of query results
35#' @param fields what data / metadata fields should be collected
36#' @param requestUrl complete URL of the API request
37#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
38#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020039#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020040#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020041#'
42#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020043#' @export
44setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020045 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietze95108e2019-09-18 13:23:58 +020046 "availability", "textClass", "snippet"),
47 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
48 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020049 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020050 .Object@request = request
51 .Object@vc = vc
52 .Object@totalResults = totalResults
53 .Object@nextStartIndex = nextStartIndex
54 .Object@fields = fields
55 .Object@requestUrl = requestUrl
56 .Object@webUIRequestUrl = webUIRequestUrl
57 .Object@apiResponse = apiResponse
58 .Object@hasMoreMatches = hasMoreMatches
59 .Object@collectedMatches = collectedMatches
60 .Object
61 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020062
Marc Kupietze95108e2019-09-18 13:23:58 +020063setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
64setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
65setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
66setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
Marc Kupietz3f575282019-10-04 14:46:04 +020067setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
Marc Kupietz006b47c2021-01-13 17:00:59 +010068setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
Marc Kupietze2038322021-03-04 18:24:02 +010069setGeneric("collocationScoreQueryNew", function(kco, ...) standardGeneric("collocationScoreQueryNew") )
Marc Kupietz006b47c2021-01-13 17:00:59 +010070
Marc Kupietze95108e2019-09-18 13:23:58 +020071
72maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020073
Marc Kupietz4de53ec2019-10-04 09:12:00 +020074## quiets concerns of R CMD check re: the .'s that appear in pipelines
75if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020076
Marc Kupietze95108e2019-09-18 13:23:58 +020077#' Method corpusQuery
78#'
79#' Perform a corpus query via a connection to a KorAP-API-server.
80#'
81#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
Akron5e135462019-09-27 16:31:38 +020082#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz632cbd42019-09-06 16:04:51 +020083#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
84#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietz7776dec2019-09-27 16:59:02 +020085#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020086#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020087#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +010088#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020089#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +020090#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz43a6ade2020-02-18 17:01:44 +010091#' @param expand logical that decides if \code{query} and \code{vc} parameters are expanded to all of their combinations
Marc Kupietz3f575282019-10-04 14:46:04 +020092#' @return Depending on the \code{as.df} parameter, a table or a \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
Marc Kupietze95108e2019-09-18 13:23:58 +020093#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
Marc Kupietz43a6ade2020-02-18 17:01:44 +010094#' Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020095#'
96#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +020097#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz657d8e72020-02-25 18:31:50 +010098#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +020099#' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +0100100#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +0200101#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200102#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
103#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200104#'
105#' new("KorAPConnection", verbose = TRUE) %>%
106#' corpusQuery(KorAPUrl =
107#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +0200108#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200109#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz05b22772020-02-18 21:58:42 +0100110#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200111#' new("KorAPConnection", verbose=TRUE) %>%
112#' { . ->> kco } %>%
113#' corpusQuery("Ameisenplage") %>%
114#' fetchAll() %>%
115#' slot("collectedMatches") %>%
116#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200117#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200118#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200119#' summarise(Count = dplyr::n()) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200120#' mutate(Freq = mapply(function(f, y)
121#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200122#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200123#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
124#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100125#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200126#' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +0200127#'
128#' @references
129#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
130#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200131#' @aliases corpusQuery
Marc Kupietz632cbd42019-09-06 16:04:51 +0200132#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200133setMethod("corpusQuery", "KorAPConnection",
Marc Kupietza96537f2019-11-09 23:07:44 +0100134 function(kco,
135 query = if (missing(KorAPUrl))
136 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
137 else
138 httr::parse_url(KorAPUrl)$query$q,
139 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
140 KorAPUrl,
141 metadataOnly = TRUE,
142 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
143 fields = c(
144 "corpusSigle",
145 "textSigle",
146 "pubDate",
147 "pubPlace",
148 "availability",
149 "textClass",
150 "snippet"
151 ),
152 accessRewriteFatal = TRUE,
153 verbose = kco@verbose,
154 expand = length(vc) != length(query),
155 as.df = FALSE) {
156 if (length(query) > 1 || length(vc) > 1) {
157
158 grid <- {
159 if (expand)
160 expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc) }
Marc Kupietz3f575282019-10-04 14:46:04 +0200161 return(
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200162 do.call(rbind,
Marc Kupietz3f575282019-10-04 14:46:04 +0200163 Map(function(q, cq) corpusQuery(kco, query=q, vc=cq, ql=ql,
164 verbose=verbose, as.df = TRUE), grid$query, grid$vc)) %>%
165 remove_rownames()
Marc Kupietza96537f2019-11-09 23:07:44 +0100166 )
167 } else {
168 contentFields <- c("snippet")
169 if (metadataOnly) {
170 fields <- fields[!fields %in% contentFields]
171 }
172 request <-
173 paste0('?q=',
174 URLencode(query, reserved = TRUE),
175 if (vc != '') paste0('&cq=', URLencode(vc, reserved = TRUE)) else '', '&ql=', ql)
176 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
177 requestUrl <- paste0(
178 kco@apiUrl,
179 'search',
180 request,
181 '&fields=',
182 paste(fields, collapse = ","),
183 if (metadataOnly) '&access-rewrite-disabled=true' else ''
184 )
185 log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
186 "")
187 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietzf5769b62019-12-13 09:19:45 +0100188 if(!is.null(res$meta$cached))
189 log.info(verbose, " [cached]\n")
190 else
191 log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
Marc Kupietza96537f2019-11-09 23:07:44 +0100192 if (as.df)
193 data.frame(
194 query = query,
195 totalResults = res$meta$totalResults,
196 vc = vc,
197 webUIRequestUrl = webUIRequestUrl,
198 stringsAsFactors = FALSE
199 )
200 else
201 KorAPQuery(
202 korapConnection = kco,
203 nextStartIndex = 0,
204 fields = fields,
205 requestUrl = requestUrl,
206 request = request,
207 totalResults = res$meta$totalResults,
208 vc = vc,
209 apiResponse = res,
210 webUIRequestUrl = webUIRequestUrl,
211 hasMoreMatches = (res$meta$totalResults > 0),
212 )
213 }
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200214 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200215
Marc Kupietz62da2b52019-09-12 17:43:34 +0200216#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200217#'
Marc Kupietz3f575282019-10-04 14:46:04 +0200218#' \bold{\code{fetchNext}} fetches the next bunch of results of a KorAP query.
219#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200220#' @param kqo object obtained from \code{\link{corpusQuery}}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200221#' @param offset start offset for query results to fetch
222#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200223#' @param verbose print progress information if true
Marc Kupietze95108e2019-09-18 13:23:58 +0200224#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200225#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100226#' @examples
Marc Kupietz657d8e72020-02-25 18:31:50 +0100227#' \donttest{q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100228#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100229#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100230#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200231#' @references
232#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
233#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200234#' @aliases fetchNext
235#' @rdname KorAPQuery-class
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200236#' @importFrom dplyr rowwise bind_rows select summarise n
Marc Kupietz632cbd42019-09-06 16:04:51 +0200237#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200238setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200239 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
240 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200241 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200242
243 page <- 1
244 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200245 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200246 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200247
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200248 repeat {
Marc Kupietza96537f2019-11-09 23:07:44 +0100249 res <- apiCall(kqo@korapConnection, paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', offset + results))
Marc Kupietze95108e2019-09-18 13:23:58 +0200250 if (res$meta$totalResults == 0) { return(kqo) }
251 for (field in kqo@fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200252 if (!field %in% colnames(res$matches)) {
253 res$matches[, field] <- NA
254 }
255 }
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200256 currentMatches <-
Marc Kupietzf75ab0b2020-06-02 12:31:18 +0200257 res$matches %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200258 dplyr::select(kqo@fields)
Marc Kupietz36d12d92019-09-27 18:13:27 +0200259 if ("pubDate" %in% kqo@fields) {
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200260 currentMatches$pubDate <- currentMatches$pubDate %>% as.Date(format = "%Y-%m-%d")
261 factorCols <- currentMatches %>% select(-pubDate) %>% colnames()
Marc Kupietz36d12d92019-09-27 18:13:27 +0200262 } else {
263 factorCols <- colnames(currentMatches)
264 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200265 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200266 if (!is.list(collectedMatches)) {
267 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200268 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200269 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200270 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200271 if (verbose) {
Marc Kupietzf6f71312019-09-23 18:35:27 +0200272 cat(paste0("Retrieved page ", page, "/", ceiling((res$meta$totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200273 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200274 page <- page + 1
275 results <- results + res$meta$itemsPerPage
Marc Kupietz62da2b52019-09-12 17:43:34 +0200276 if (offset + results >= res$meta$totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200277 break
278 }
279 }
Marc Kupietze95108e2019-09-18 13:23:58 +0200280 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, res$meta$totalResults)
281 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200282 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200283 fields = kqo@fields,
284 requestUrl = kqo@requestUrl,
285 request = kqo@request,
286 totalResults = res$meta$totalResults,
287 vc = kqo@vc,
288 webUIRequestUrl = kqo@webUIRequestUrl,
289 hasMoreMatches = (res$meta$totalResults > nextStartIndex),
290 apiResponse = res,
291 collectedMatches = collectedMatches)
292})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200293
294#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200295#'
296#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100297#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200298#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200299#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100300#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200301#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200302#' @aliases fetchAll
303#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200304#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200305setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200306 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
307})
308
309#' Fetches the remaining results of a KorAP query.
310#'
311#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100312#' \donttest{
313#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200314#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100315#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200316#'
317#' @aliases fetchRest
318#' @rdname KorAPQuery-class
319#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200320setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200321 return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
322})
323
Marc Kupietz3f575282019-10-04 14:46:04 +0200324#' Query relative frequency of search term(s)
325#'
326#' \bold{\code{frequencyQuery}} combines \code{\link{corpusQuery}}, \code{\link{corpusStats}} and
327#' \code{\link{ci}} to compute a table with the relative frequencies and
328#' confidence intervals of one ore multiple search terms across one or multiple
329#' virtual corpora.
330#'
331#' @aliases frequencyQuery
332#' @rdname KorAPQuery-class
333#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100334#' \donttest{
Marc Kupietz3f575282019-10-04 14:46:04 +0200335#' new("KorAPConnection", verbose = TRUE) %>%
336#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100337#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200338#'
339#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
340#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100341#' @param conf.level confidence level of the returned confidence interval (passed through \code{\link{ci}} to \code{\link{prop.test}}).
Marc Kupietz71d6e052019-11-22 18:42:10 +0100342#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If \code{as.alternatives} is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz3f575282019-10-04 14:46:04 +0200343#' @export
344setMethod("frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100345 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
346 (if (as.alternatives) {
347 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
348 group_by(vc) %>%
349 mutate(total = sum(totalResults))
350 } else {
351 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
352 mutate(total = corpusStats(kco, vc=vc, as.df=TRUE)$tokens)
353 } ) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200354 ci(conf.level = conf.level)
Marc Kupietz3f575282019-10-04 14:46:04 +0200355})
356
Marc Kupietze95108e2019-09-18 13:23:58 +0200357#´ format()
358#' @rdname KorAPQuery-class
359#' @param x KorAPQuery object
360#' @param ... further arguments passed to or from other methods
361#' @export
362format.KorAPQuery <- function(x, ...) {
363 cat("<KorAPQuery>\n")
364 q <- x
365 aurl = parse_url(q@request)
Marc Kupietz0d4c9092020-03-23 09:02:30 +0100366 cat(" Query: ", aurl$query$q, "\n")
367 if (!is.null(aurl$query$cq) && aurl$query$cq != "") {
368 cat(" Virtual corpus: ", aurl$query$cq, "\n")
Marc Kupietze95108e2019-09-18 13:23:58 +0200369 }
370 if (!is.null(q@collectedMatches)) {
371 cat("==============================================================================================================", "\n")
372 print(summary(q@collectedMatches))
373 cat("==============================================================================================================", "\n")
374 }
375 cat(" Total results: ", q@totalResults, "\n")
376 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200377}
378
Marc Kupietze95108e2019-09-18 13:23:58 +0200379#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200380#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200381#' @rdname KorAPQuery-class
382#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200383#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200384setMethod("show", "KorAPQuery", function(object) {
385 format(object)
386})
Marc Kupietz006b47c2021-01-13 17:00:59 +0100387
388
Marc Kupietz006b47c2021-01-13 17:00:59 +0100389
390lemmatizeWordQuery <- function(w) {
391 paste0('[tt/l=', w, ']')
392}
393
394#' Query frequencies of a node and a collocate and calculate collocation association scores
395#'
396#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
Marc Kupietze2038322021-03-04 18:24:02 +0100397#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
Marc Kupietz006b47c2021-01-13 17:00:59 +0100398#'
399#' @aliases collocationScoreQuery
400#' @rdname KorAPQuery-class
401#'
402#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
403#' @param node target word
404#' @param collocate collocate of target word
405#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
406#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
407#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
408#' @param leftContextSize size of the left context window
409#' @param rightContextSize size of the right context window
Marc Kupietze2038322021-03-04 18:24:02 +0100410#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
411#' @param smoothingConstant smoothing constant will be added to all observed values
412#'
413#' @return tibble with query KorAP web request URL, all observed values and association scores
Marc Kupietz006b47c2021-01-13 17:00:59 +0100414#'
415#' @examples
416#' \donttest{
417#' new("KorAPConnection", verbose = TRUE) %>%
418#' collocationScoreQuery("Grund", "triftiger")
419#' }
420#'
421#' \donttest{
Marc Kupietze2038322021-03-04 18:24:02 +0100422#' new("KorAPConnection", verbose = TRUE) %>%
423#' collocationScoreQuery("Grund", c("guter", "triftiger"),
424#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
425#' }
426#'
427#' \donttest{
Marc Kupietz006b47c2021-01-13 17:00:59 +0100428#' library(highcharter)
Marc Kupietze2038322021-03-04 18:24:02 +0100429#' library(tidyr)
Marc Kupietz006b47c2021-01-13 17:00:59 +0100430#' new("KorAPConnection", verbose = TRUE) %>%
431#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
432#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
Marc Kupietze2038322021-03-04 18:24:02 +0100433#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
434#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
435#' hc_add_onclick_korap_search()
Marc Kupietz006b47c2021-01-13 17:00:59 +0100436#' }
437#'
438#' @importFrom tidyr pivot_longer
439#' @export
440setMethod("collocationScoreQuery", "KorAPConnection",
441 function(kco,
442 node,
443 collocate,
444 vc = "",
445 lemmatizeNodeQuery = FALSE,
446 lemmatizeCollocateQuery = FALSE,
447 leftContextSize = 5,
Marc Kupietze2038322021-03-04 18:24:02 +0100448 rightContextSize = 5,
449 scoreFunctions = defaultAssociationScoreFunctions(),
450 smoothingConstant = .5
451 ) {
452 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
453 O1 <- O2 <- O <- N <- E <- w <- 0
454
Marc Kupietz006b47c2021-01-13 17:00:59 +0100455 if (leftContextSize <= 0 && rightContextSize <= 0) {
456 stop("At least one of leftContextSize and rightContextSize must be > 0",
457 call. = FALSE)
458 }
459
460 if (lemmatizeNodeQuery) {
461 node <- lemmatizeWordQuery(node)
462 }
463
464 if (lemmatizeCollocateQuery) {
465 collocate <- lemmatizeWordQuery(collocate)
466 }
467
468 query <- ""
469
470 if (leftContextSize > 0) {
471 query <-
Marc Kupietze2038322021-03-04 18:24:02 +0100472 paste0(collocate,
473 if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
474 node,
475 if (rightContextSize > 0) " | ")
Marc Kupietz006b47c2021-01-13 17:00:59 +0100476 }
477
478 if (rightContextSize > 0) {
479 query <-
Marc Kupietze2038322021-03-04 18:24:02 +0100480 paste0(query, node,
481 if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
Marc Kupietz006b47c2021-01-13 17:00:59 +0100482 }
483
Marc Kupietz006b47c2021-01-13 17:00:59 +0100484
485 tibble(
486 node = node,
487 collocate = collocate,
488 label = queryStringToLabel(vc),
489 vc = vc,
Marc Kupietz006b47c2021-01-13 17:00:59 +0100490 webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
Marc Kupietze2038322021-03-04 18:24:02 +0100491 w = leftContextSize + rightContextSize,
492 leftContextSize,
493 rightContextSize,
494 N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
495 O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
496 O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
497 O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
498 E = w * as.double(O1) * O2 / N
Marc Kupietz006b47c2021-01-13 17:00:59 +0100499 ) %>%
Marc Kupietze2038322021-03-04 18:24:02 +0100500 mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
501
Marc Kupietz006b47c2021-01-13 17:00:59 +0100502 })