Add method collocationScoreQuery
Change-Id: Ibe7937951ad067dd463e45dfd67df01247dd99b7
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 3ce4773..027d119 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -65,6 +65,8 @@
setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
+setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") ) # S4 generic; a method for "KorAPConnection" is registered with setMethod()
+
maxResultsPerPage <- 50
@@ -381,3 +383,123 @@
setMethod("show", "KorAPQuery", function(object) {
format(object)
})
+
+
+#'
+#' @importFrom dplyr if_else
+#'
+ca_ll <- function(w1, w2, w12, n, true_window_size) {
+  # Log-likelihood ratio over the 2x2 table of observed vs. expected counts;
+  # the node frequency w1 is scaled by the window size to give the row total.
+  # A cell whose observed count is not positive contributes 0.
+  ll_term <- function(obs, expd) dplyr::if_else(obs > 0, obs * log(obs / expd), 0)
+  row_in <- as.double(w1) * true_window_size
+  row_out <- as.double(n) - row_in
+  col_in <- w2
+  col_out <- n - col_in
+  obs_in_in <- w12
+  obs_in_out <- row_in - obs_in_in
+  obs_out_in <- col_in - w12
+  obs_out_out <- row_out - obs_out_in
+  2 * (ll_term(obs_in_in, row_in * col_in / n) +
+    ll_term(obs_in_out, row_in * col_out / n) +
+    ll_term(obs_out_in, row_out * col_in / n) +
+    ll_term(obs_out_out, row_out * col_out / n))
+}
+
+lemmatizeWordQuery <- function(w) {
+  paste0("[tt/l=", w, "]") # lemma query for w, e.g. "Haus" -> "[tt/l=Haus]"
+}
+
+#' Query frequencies of a node and a collocate and calculate collocation association scores
+#'
+#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
+#'
+#' @aliases collocationScoreQuery
+#' @rdname KorAPQuery-class
+#'
+#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")})
+#' @param node target word
+#' @param collocate collocate of target word
+#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
+#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param leftContextSize size of the left context window; a value <= 0 leaves the left context out of the query
+#' @param rightContextSize size of the right context window; a value <= 0 leaves the right context out of the query
+#'
+#' @return tibble in long format with the columns \code{node}, \code{collocate}, \code{label}, \code{vc}, \code{O}, \code{webUIRequestUrl}, \code{fx}, \code{fy}, \code{N}, \code{E}, \code{measure} (one of MI, MI2, MI3, logDice, llr) and \code{score}
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Grund", "triftiger")
+#' }
+#'
+#' \donttest{
+#' library(highcharter)
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#'     lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#'   hchart(type="spline", hcaes(label, score, group=measure))
+#' }
+#'
+#' @importFrom tidyr pivot_longer
+#' @export
+setMethod("collocationScoreQuery", "KorAPConnection",
+  function(kco, node, collocate, vc = "",
+           lemmatizeNodeQuery = FALSE, lemmatizeCollocateQuery = FALSE,
+           leftContextSize = 5, rightContextSize = 5) {
+    if (leftContextSize <= 0 && rightContextSize <= 0) {
+      stop("At least one of leftContextSize and rightContextSize must be > 0",
+           call. = FALSE)
+    }
+
+    if (lemmatizeNodeQuery) {
+      node <- lemmatizeWordQuery(node)
+    }
+
+    if (lemmatizeCollocateQuery) {
+      collocate <- lemmatizeWordQuery(collocate)
+    }
+
+    query <- ""
+
+    if (leftContextSize > 0) {
+      query <- paste0(collocate, " []{0,", leftContextSize - 1, "} ", node,
+                      if (rightContextSize > 0) " | " else "")
+    }
+
+    if (rightContextSize > 0) {
+      query <- paste0(query, node, " []{0,", rightContextSize - 1, "} ", collocate)
+    }
+
+    # A non-positive size only switches its side off (no query part is built
+    # for it), so it must not be subtracted from the window of the other side.
+    w <- max(leftContextSize, 0) + max(rightContextSize, 0)
+
+    # Issue each distinct frequency query once and reuse its result below.
+    cooccurrence <- frequencyQuery(kco, query, vc)
+    nodeFrequency <- frequencyQuery(kco, node, vc)
+    collocateFrequency <- frequencyQuery(kco, collocate, vc)
+
+    tibble(
+      node = node,
+      collocate = collocate,
+      label = queryStringToLabel(vc),
+      vc = vc,
+      O = as.double(cooccurrence$totalResults),
+      webUIRequestUrl = cooccurrence$webUIRequestUrl,
+      fx = nodeFrequency$totalResults,
+      fy = collocateFrequency$totalResults,
+      N = nodeFrequency$total,
+      E = w * as.double(fx) * fy / N,
+      MI = log2(O / E),
+      MI2 = log2(O ^ 2 / E),
+      MI3 = log2(O ^ 3 / E),
+      logDice = 14 + log2(2 * O / (w * fy + fx)),
+      llr = ca_ll(fx, fy, O, N, w)
+    ) %>%
+      tidyr::pivot_longer(c(MI, MI2, MI3, logDice, llr),
+                          names_to = "measure", values_to = "score")
+  })