blob: 15c742c92a8d4163965fe4e9e768c895e454107e [file] [log] [blame]
Marc Kupietze95108e2019-09-18 13:23:58 +02001#' Class KorAPQuery
2#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' This class provides methods to perform different kinds of queries on the KorAP API server.
4#' \code{KorAPQuery} objects, which are typically created by the \code{\link{corpusQuery}} method,
5#' represent the current state of a query to a KorAP server.
Marc Kupietze95108e2019-09-18 13:23:58 +02006#'
7#' @include KorAPConnection.R
Marc Kupietze95108e2019-09-18 13:23:58 +02008#' @import httr
9#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @include RKorAPClient-package.R
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020011
Marc Kupietze95108e2019-09-18 13:23:58 +020012#' @export
13KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietzb8972182019-09-20 21:33:46 +020014 "korapConnection",
Marc Kupietze95108e2019-09-18 13:23:58 +020015 "request",
16 "vc",
17 "totalResults",
18 "nextStartIndex",
19 "fields",
20 "requestUrl",
21 "webUIRequestUrl",
22 "apiResponse",
23 "collectedMatches",
24 "hasMoreMatches"
25))
Marc Kupietz5bbc9db2019-08-30 16:30:45 +020026
Marc Kupietze95108e2019-09-18 13:23:58 +020027#' Method initialize
28#'
29#' @rdname KorAPQuery-class
30#' @param .Object …
Marc Kupietzb8972182019-09-20 21:33:46 +020031#' @param korapConnection KorAPConnection object
Marc Kupietze95108e2019-09-18 13:23:58 +020032#' @param request query part of the request URL
33#' @param vc definition of a virtual corpus
34#' @param totalResults number of hits the query has yielded
35#' @param nextStartIndex at what index to start the next fetch of query results
36#' @param fields what data / metadata fields should be collected
37#' @param requestUrl complete URL of the API request
38#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
39#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz7776dec2019-09-27 16:59:02 +020040#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietze95108e2019-09-18 13:23:58 +020041#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz97a1bca2019-10-04 22:52:09 +020042#'
43#' @importFrom tibble tibble
Marc Kupietze95108e2019-09-18 13:23:58 +020044#' @export
45setMethod("initialize", "KorAPQuery",
Marc Kupietzb8972182019-09-20 21:33:46 +020046 function(.Object, korapConnection = NULL, request = NULL, vc="", totalResults=0, nextStartIndex=0, fields=c("corpusSigle", "textSigle", "pubDate", "pubPlace",
Marc Kupietze95108e2019-09-18 13:23:58 +020047 "availability", "textClass", "snippet"),
48 requestUrl="", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches= FALSE, collectedMatches = NULL) {
49 .Object <- callNextMethod()
Marc Kupietzb8972182019-09-20 21:33:46 +020050 .Object@korapConnection = korapConnection
Marc Kupietze95108e2019-09-18 13:23:58 +020051 .Object@request = request
52 .Object@vc = vc
53 .Object@totalResults = totalResults
54 .Object@nextStartIndex = nextStartIndex
55 .Object@fields = fields
56 .Object@requestUrl = requestUrl
57 .Object@webUIRequestUrl = webUIRequestUrl
58 .Object@apiResponse = apiResponse
59 .Object@hasMoreMatches = hasMoreMatches
60 .Object@collectedMatches = collectedMatches
61 .Object
62 })
Marc Kupietz632cbd42019-09-06 16:04:51 +020063
Marc Kupietze95108e2019-09-18 13:23:58 +020064setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery") )
65setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll") )
66setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
67setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
Marc Kupietz3f575282019-10-04 14:46:04 +020068setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
Marc Kupietz006b47c2021-01-13 17:00:59 +010069setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
Marc Kupietze95108e2019-09-18 13:23:58 +020070
71maxResultsPerPage <- 50
Marc Kupietz62da2b52019-09-12 17:43:34 +020072
Marc Kupietz4de53ec2019-10-04 09:12:00 +020073## quiets concerns of R CMD check re: the .'s that appear in pipelines
74if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
Marc Kupietz632cbd42019-09-06 16:04:51 +020075
Marc Kupietza6e4ee62021-03-05 09:00:15 +010076#' \bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server
Marc Kupietze95108e2019-09-18 13:23:58 +020077#'
78#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
Akron5e135462019-09-27 16:31:38 +020079#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz632cbd42019-09-06 16:04:51 +020080#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
81#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.
Marc Kupietz7776dec2019-09-27 16:59:02 +020082#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).
Marc Kupietz3c531f62019-09-13 12:17:24 +020083#' @param ql string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.
Akron5e135462019-09-27 16:31:38 +020084#' @param fields (meta)data fields that will be fetched for every match.
Marc Kupietz43a6ade2020-02-18 17:01:44 +010085#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz25aebc32019-09-16 18:40:50 +020086#' @param verbose print some info
Marc Kupietz4de53ec2019-10-04 09:12:00 +020087#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz43a6ade2020-02-18 17:01:44 +010088#' @param expand logical that decides if \code{query} and \code{vc} parameters are expanded to all of their combinations
Marc Kupietz3f575282019-10-04 14:46:04 +020089#' @return Depending on the \code{as.df} parameter, a table or a \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
Marc Kupietze95108e2019-09-18 13:23:58 +020090#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
Marc Kupietz43a6ade2020-02-18 17:01:44 +010091#' Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz632cbd42019-09-06 16:04:51 +020092#'
93#' @examples
Marc Kupietz603491f2019-09-18 14:01:02 +020094#' # Fetch metadata of every query hit for "Ameisenplage" and show a summary
Marc Kupietz657d8e72020-02-25 18:31:50 +010095#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +020096#' new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietz657d8e72020-02-25 18:31:50 +010097#' }
Marc Kupietz3c531f62019-09-13 12:17:24 +020098#'
Marc Kupietz603491f2019-09-18 14:01:02 +020099#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
100#' # and show the number of query hits (but don't fetch them).
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200101#'
102#' new("KorAPConnection", verbose = TRUE) %>%
103#' corpusQuery(KorAPUrl =
104#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
Marc Kupietz3c531f62019-09-13 12:17:24 +0200105#'
Marc Kupietz603491f2019-09-18 14:01:02 +0200106#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz05b22772020-02-18 21:58:42 +0100107#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200108#' new("KorAPConnection", verbose=TRUE) %>%
109#' { . ->> kco } %>%
110#' corpusQuery("Ameisenplage") %>%
111#' fetchAll() %>%
112#' slot("collectedMatches") %>%
113#' mutate(year = lubridate::year(pubDate)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200114#' dplyr::select(year) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200115#' group_by(year) %>%
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200116#' summarise(Count = dplyr::n()) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200117#' mutate(Freq = mapply(function(f, y)
118#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200119#' dplyr::select(-Count) %>%
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200120#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
121#' plot(type = "l")
Marc Kupietz05b22772020-02-18 21:58:42 +0100122#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200123#' @seealso \code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
Marc Kupietz632cbd42019-09-06 16:04:51 +0200124#'
125#' @references
126#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
127#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200128#' @aliases corpusQuery
Marc Kupietz632cbd42019-09-06 16:04:51 +0200129#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200130setMethod("corpusQuery", "KorAPConnection",
Marc Kupietza96537f2019-11-09 23:07:44 +0100131 function(kco,
132 query = if (missing(KorAPUrl))
133 stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
134 else
135 httr::parse_url(KorAPUrl)$query$q,
136 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
137 KorAPUrl,
138 metadataOnly = TRUE,
139 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
140 fields = c(
141 "corpusSigle",
142 "textSigle",
143 "pubDate",
144 "pubPlace",
145 "availability",
146 "textClass",
147 "snippet"
148 ),
149 accessRewriteFatal = TRUE,
150 verbose = kco@verbose,
151 expand = length(vc) != length(query),
152 as.df = FALSE) {
153 if (length(query) > 1 || length(vc) > 1) {
154
155 grid <- {
156 if (expand)
157 expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc) }
Marc Kupietz3f575282019-10-04 14:46:04 +0200158 return(
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200159 do.call(rbind,
Marc Kupietz3f575282019-10-04 14:46:04 +0200160 Map(function(q, cq) corpusQuery(kco, query=q, vc=cq, ql=ql,
161 verbose=verbose, as.df = TRUE), grid$query, grid$vc)) %>%
162 remove_rownames()
Marc Kupietza96537f2019-11-09 23:07:44 +0100163 )
164 } else {
165 contentFields <- c("snippet")
166 if (metadataOnly) {
167 fields <- fields[!fields %in% contentFields]
168 }
169 request <-
170 paste0('?q=',
171 URLencode(query, reserved = TRUE),
172 if (vc != '') paste0('&cq=', URLencode(vc, reserved = TRUE)) else '', '&ql=', ql)
173 webUIRequestUrl <- paste0(kco@KorAPUrl, request)
174 requestUrl <- paste0(
175 kco@apiUrl,
176 'search',
177 request,
178 '&fields=',
179 paste(fields, collapse = ","),
180 if (metadataOnly) '&access-rewrite-disabled=true' else ''
181 )
182 log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
183 "")
184 res = apiCall(kco, paste0(requestUrl, '&count=0'))
Marc Kupietzf5769b62019-12-13 09:19:45 +0100185 if(!is.null(res$meta$cached))
186 log.info(verbose, " [cached]\n")
187 else
188 log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
Marc Kupietza96537f2019-11-09 23:07:44 +0100189 if (as.df)
190 data.frame(
191 query = query,
192 totalResults = res$meta$totalResults,
193 vc = vc,
194 webUIRequestUrl = webUIRequestUrl,
195 stringsAsFactors = FALSE
196 )
197 else
198 KorAPQuery(
199 korapConnection = kco,
200 nextStartIndex = 0,
201 fields = fields,
202 requestUrl = requestUrl,
203 request = request,
204 totalResults = res$meta$totalResults,
205 vc = vc,
206 apiResponse = res,
207 webUIRequestUrl = webUIRequestUrl,
208 hasMoreMatches = (res$meta$totalResults > 0),
209 )
210 }
Marc Kupietz4de53ec2019-10-04 09:12:00 +0200211 })
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200212
Marc Kupietz62da2b52019-09-12 17:43:34 +0200213#' Fetch the next bunch of results of a KorAP query.
Marc Kupietze95108e2019-09-18 13:23:58 +0200214#'
Marc Kupietz3f575282019-10-04 14:46:04 +0200215#' \bold{\code{fetchNext}} fetches the next bunch of results of a KorAP query.
216#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200217#' @param kqo object obtained from \code{\link{corpusQuery}}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200218#' @param offset start offset for query results to fetch
219#' @param maxFetch maximum number of query results to fetch
Marc Kupietz25aebc32019-09-16 18:40:50 +0200220#' @param verbose print progress information if true
Marc Kupietze95108e2019-09-18 13:23:58 +0200221#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
Marc Kupietz62da2b52019-09-12 17:43:34 +0200222#'
Marc Kupietz05b22772020-02-18 21:58:42 +0100223#' @examples
Marc Kupietz657d8e72020-02-25 18:31:50 +0100224#' \donttest{q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchNext()
Marc Kupietz05b22772020-02-18 21:58:42 +0100225#' q@collectedMatches
Marc Kupietz657d8e72020-02-25 18:31:50 +0100226#' }
Marc Kupietz05b22772020-02-18 21:58:42 +0100227#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200228#' @references
229#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
230#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200231#' @aliases fetchNext
232#' @rdname KorAPQuery-class
Marc Kupietzcb3c59e2020-06-02 10:10:43 +0200233#' @importFrom dplyr rowwise bind_rows select summarise n
Marc Kupietz632cbd42019-09-06 16:04:51 +0200234#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200235setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200236 if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
237 return(kqo)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200238 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200239
240 page <- 1
241 results <- 0
Marc Kupietz25aebc32019-09-16 18:40:50 +0200242 pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze95108e2019-09-18 13:23:58 +0200243 collectedMatches <- kqo@collectedMatches
Marc Kupietz62da2b52019-09-12 17:43:34 +0200244
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200245 repeat {
Marc Kupietz68170952021-06-30 09:37:21 +0200246 query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', offset + results, '&cutoff=true')
247 res <- apiCall(kqo@korapConnection, query)
248 if (length(res$matches) == 0) {
249 break
250 }
251
Marc Kupietze95108e2019-09-18 13:23:58 +0200252 for (field in kqo@fields) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200253 if (!field %in% colnames(res$matches)) {
254 res$matches[, field] <- NA
255 }
256 }
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200257 currentMatches <-
Marc Kupietzf75ab0b2020-06-02 12:31:18 +0200258 res$matches %>%
Marc Kupietz19e2ebd2019-10-07 11:45:30 +0200259 dplyr::select(kqo@fields)
Marc Kupietz36d12d92019-09-27 18:13:27 +0200260 if ("pubDate" %in% kqo@fields) {
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200261 currentMatches$pubDate <- currentMatches$pubDate %>% as.Date(format = "%Y-%m-%d")
262 factorCols <- currentMatches %>% select(-pubDate) %>% colnames()
Marc Kupietz36d12d92019-09-27 18:13:27 +0200263 } else {
264 factorCols <- colnames(currentMatches)
265 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200266 currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
Marc Kupietz62da2b52019-09-12 17:43:34 +0200267 if (!is.list(collectedMatches)) {
268 collectedMatches <- currentMatches
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200269 } else {
Marc Kupietz62da2b52019-09-12 17:43:34 +0200270 collectedMatches <- rbind(collectedMatches, currentMatches)
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200271 }
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200272 if (verbose) {
Marc Kupietz68170952021-06-30 09:37:21 +0200273 cat(paste0("Retrieved page ", page, "/", ceiling((kqo@totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
Marc Kupietzc2c59bd2019-08-30 16:50:49 +0200274 }
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200275 page <- page + 1
276 results <- results + res$meta$itemsPerPage
Marc Kupietz68170952021-06-30 09:37:21 +0200277 if (offset + results >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz5bbc9db2019-08-30 16:30:45 +0200278 break
279 }
280 }
Marc Kupietz68170952021-06-30 09:37:21 +0200281 nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietze95108e2019-09-18 13:23:58 +0200282 KorAPQuery(nextStartIndex = nextStartIndex,
Marc Kupietzd0d3e9b2019-09-24 17:36:03 +0200283 korapConnection = kqo@korapConnection,
Marc Kupietze95108e2019-09-18 13:23:58 +0200284 fields = kqo@fields,
285 requestUrl = kqo@requestUrl,
286 request = kqo@request,
Marc Kupietz68170952021-06-30 09:37:21 +0200287 totalResults = kqo@totalResults,
Marc Kupietze95108e2019-09-18 13:23:58 +0200288 vc = kqo@vc,
289 webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz68170952021-06-30 09:37:21 +0200290 hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietze95108e2019-09-18 13:23:58 +0200291 apiResponse = res,
292 collectedMatches = collectedMatches)
293})
Marc Kupietz62da2b52019-09-12 17:43:34 +0200294
295#' Fetch all results of a KorAP query.
Marc Kupietz62da2b52019-09-12 17:43:34 +0200296#'
Marc Kupietz9f3356d2021-06-30 09:29:26 +0200297#' \bold{\code{fetchAll}} fetches all results of a KorAP query.
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100298#'
Marc Kupietz62da2b52019-09-12 17:43:34 +0200299#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100300#' \donttest{
Marc Kupietz69cc54a2019-09-30 12:06:54 +0200301#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchAll()
Marc Kupietze95108e2019-09-18 13:23:58 +0200302#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100303#' }
Marc Kupietz62da2b52019-09-12 17:43:34 +0200304#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200305#' @aliases fetchAll
306#' @rdname KorAPQuery-class
Marc Kupietz62da2b52019-09-12 17:43:34 +0200307#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200308setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200309 return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
310})
311
312#' Fetches the remaining results of a KorAP query.
313#'
314#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100315#' \donttest{
316#' q <- new("KorAPConnection") %>% corpusQuery("Ameisenplage") %>% fetchRest()
Marc Kupietze95108e2019-09-18 13:23:58 +0200317#' q@collectedMatches
Marc Kupietz05b22772020-02-18 21:58:42 +0100318#' }
Marc Kupietze95108e2019-09-18 13:23:58 +0200319#'
320#' @aliases fetchRest
321#' @rdname KorAPQuery-class
322#' @export
Marc Kupietzf6f71312019-09-23 18:35:27 +0200323setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
Marc Kupietze95108e2019-09-18 13:23:58 +0200324 return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
325})
326
Marc Kupietz3f575282019-10-04 14:46:04 +0200327#' Query relative frequency of search term(s)
328#'
329#' \bold{\code{frequencyQuery}} combines \code{\link{corpusQuery}}, \code{\link{corpusStats}} and
330#' \code{\link{ci}} to compute a table with the relative frequencies and
331#' confidence intervals of one ore multiple search terms across one or multiple
332#' virtual corpora.
333#'
334#' @aliases frequencyQuery
335#' @rdname KorAPQuery-class
336#' @examples
Marc Kupietz05b22772020-02-18 21:58:42 +0100337#' \donttest{
Marc Kupietz3f575282019-10-04 14:46:04 +0200338#' new("KorAPConnection", verbose = TRUE) %>%
339#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz05b22772020-02-18 21:58:42 +0100340#' }
Marc Kupietz3f575282019-10-04 14:46:04 +0200341#'
342#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
343#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
Marc Kupietz43a6ade2020-02-18 17:01:44 +0100344#' @param conf.level confidence level of the returned confidence interval (passed through \code{\link{ci}} to \code{\link{prop.test}}).
Marc Kupietz71d6e052019-11-22 18:42:10 +0100345#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If \code{as.alternatives} is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz3f575282019-10-04 14:46:04 +0200346#' @export
347setMethod("frequencyQuery", "KorAPConnection",
Marc Kupietz71d6e052019-11-22 18:42:10 +0100348 function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
349 (if (as.alternatives) {
350 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
351 group_by(vc) %>%
352 mutate(total = sum(totalResults))
353 } else {
354 corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) %>%
355 mutate(total = corpusStats(kco, vc=vc, as.df=TRUE)$tokens)
356 } ) %>%
Marc Kupietz0c29cea2019-10-09 08:44:36 +0200357 ci(conf.level = conf.level)
Marc Kupietz3f575282019-10-04 14:46:04 +0200358})
359
Marc Kupietze95108e2019-09-18 13:23:58 +0200360#´ format()
361#' @rdname KorAPQuery-class
362#' @param x KorAPQuery object
363#' @param ... further arguments passed to or from other methods
364#' @export
365format.KorAPQuery <- function(x, ...) {
366 cat("<KorAPQuery>\n")
367 q <- x
368 aurl = parse_url(q@request)
Marc Kupietz0d4c9092020-03-23 09:02:30 +0100369 cat(" Query: ", aurl$query$q, "\n")
370 if (!is.null(aurl$query$cq) && aurl$query$cq != "") {
371 cat(" Virtual corpus: ", aurl$query$cq, "\n")
Marc Kupietze95108e2019-09-18 13:23:58 +0200372 }
373 if (!is.null(q@collectedMatches)) {
374 cat("==============================================================================================================", "\n")
375 print(summary(q@collectedMatches))
376 cat("==============================================================================================================", "\n")
377 }
378 cat(" Total results: ", q@totalResults, "\n")
379 cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz62da2b52019-09-12 17:43:34 +0200380}
381
Marc Kupietze95108e2019-09-18 13:23:58 +0200382#' show()
Marc Kupietz62da2b52019-09-12 17:43:34 +0200383#'
Marc Kupietze95108e2019-09-18 13:23:58 +0200384#' @rdname KorAPQuery-class
385#' @param object KorAPQuery object
Marc Kupietz62da2b52019-09-12 17:43:34 +0200386#' @export
Marc Kupietze95108e2019-09-18 13:23:58 +0200387setMethod("show", "KorAPQuery", function(object) {
388 format(object)
389})
Marc Kupietz006b47c2021-01-13 17:00:59 +0100390
391
Marc Kupietz006b47c2021-01-13 17:00:59 +0100392
393lemmatizeWordQuery <- function(w) {
394 paste0('[tt/l=', w, ']')
395}
396
397#' Query frequencies of a node and a collocate and calculate collocation association scores
398#'
399#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
Marc Kupietze2038322021-03-04 18:24:02 +0100400#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
Marc Kupietz006b47c2021-01-13 17:00:59 +0100401#'
402#' @aliases collocationScoreQuery
403#' @rdname KorAPQuery-class
404#'
405#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
406#' @param node target word
407#' @param collocate collocate of target word
408#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
409#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
410#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
411#' @param leftContextSize size of the left context window
412#' @param rightContextSize size of the right context window
Marc Kupietze2038322021-03-04 18:24:02 +0100413#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
414#' @param smoothingConstant smoothing constant will be added to all observed values
415#'
416#' @return tibble with query KorAP web request URL, all observed values and association scores
Marc Kupietz006b47c2021-01-13 17:00:59 +0100417#'
418#' @examples
419#' \donttest{
420#' new("KorAPConnection", verbose = TRUE) %>%
421#' collocationScoreQuery("Grund", "triftiger")
422#' }
423#'
424#' \donttest{
Marc Kupietze2038322021-03-04 18:24:02 +0100425#' new("KorAPConnection", verbose = TRUE) %>%
426#' collocationScoreQuery("Grund", c("guter", "triftiger"),
427#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
428#' }
429#'
430#' \donttest{
Marc Kupietz006b47c2021-01-13 17:00:59 +0100431#' library(highcharter)
Marc Kupietze2038322021-03-04 18:24:02 +0100432#' library(tidyr)
Marc Kupietz006b47c2021-01-13 17:00:59 +0100433#' new("KorAPConnection", verbose = TRUE) %>%
434#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
435#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
Marc Kupietze2038322021-03-04 18:24:02 +0100436#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
437#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
438#' hc_add_onclick_korap_search()
Marc Kupietz006b47c2021-01-13 17:00:59 +0100439#' }
440#'
441#' @importFrom tidyr pivot_longer
442#' @export
443setMethod("collocationScoreQuery", "KorAPConnection",
444 function(kco,
445 node,
446 collocate,
447 vc = "",
448 lemmatizeNodeQuery = FALSE,
449 lemmatizeCollocateQuery = FALSE,
450 leftContextSize = 5,
Marc Kupietze2038322021-03-04 18:24:02 +0100451 rightContextSize = 5,
452 scoreFunctions = defaultAssociationScoreFunctions(),
453 smoothingConstant = .5
454 ) {
455 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
456 O1 <- O2 <- O <- N <- E <- w <- 0
457
Marc Kupietz006b47c2021-01-13 17:00:59 +0100458 if (leftContextSize <= 0 && rightContextSize <= 0) {
459 stop("At least one of leftContextSize and rightContextSize must be > 0",
460 call. = FALSE)
461 }
462
463 if (lemmatizeNodeQuery) {
464 node <- lemmatizeWordQuery(node)
465 }
466
467 if (lemmatizeCollocateQuery) {
468 collocate <- lemmatizeWordQuery(collocate)
469 }
470
471 query <- ""
472
473 if (leftContextSize > 0) {
474 query <-
Marc Kupietze2038322021-03-04 18:24:02 +0100475 paste0(collocate,
476 if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
477 node,
478 if (rightContextSize > 0) " | ")
Marc Kupietz006b47c2021-01-13 17:00:59 +0100479 }
480
481 if (rightContextSize > 0) {
482 query <-
Marc Kupietze2038322021-03-04 18:24:02 +0100483 paste0(query, node,
484 if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
Marc Kupietz006b47c2021-01-13 17:00:59 +0100485 }
486
Marc Kupietz006b47c2021-01-13 17:00:59 +0100487
488 tibble(
489 node = node,
490 collocate = collocate,
491 label = queryStringToLabel(vc),
492 vc = vc,
Marc Kupietz006b47c2021-01-13 17:00:59 +0100493 webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
Marc Kupietze2038322021-03-04 18:24:02 +0100494 w = leftContextSize + rightContextSize,
495 leftContextSize,
496 rightContextSize,
497 N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
498 O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
499 O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
500 O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
501 E = w * as.double(O1) * O2 / N
Marc Kupietz006b47c2021-01-13 17:00:59 +0100502 ) %>%
Marc Kupietze2038322021-03-04 18:24:02 +0100503 mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
504
Marc Kupietz006b47c2021-01-13 17:00:59 +0100505 })