Add method collocationScoreQuery

Change-Id: Ibe7937951ad067dd463e45dfd67df01247dd99b7
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 3ce4773..027d119 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -65,6 +65,8 @@
 setGeneric("fetchNext", function(kqo, ...)  standardGeneric("fetchNext") )
 setGeneric("fetchRest", function(kqo, ...)  standardGeneric("fetchRest") )
 setGeneric("frequencyQuery", function(kco, ...)  standardGeneric("frequencyQuery") )
+setGeneric("collocationScoreQuery", function(kco, ...)  standardGeneric("collocationScoreQuery") )
+
 
 maxResultsPerPage <- 50
 
@@ -381,3 +383,123 @@
 setMethod("show", "KorAPQuery", function(object) {
   format(object)
 })
+
+
+#'
+#' @importFrom dplyr if_else
+#'
+ca_ll <- function(w1, w2, w12, n, true_window_size) {
+  r1 = as.double(w1) * true_window_size
+  r2 = as.double(n) - r1
+  c1 = w2
+  c2 = n - c1
+  o11 = w12
+  o12 = r1 - o11
+  o21 = c1 - w12
+  o22 = r2 - o21
+  e11 = r1 * c1 / n
+  e12 = r1 * c2 / n
+  e21 = r2 * c1 / n
+  e22 = r2 * c2 / n
+  2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
+        + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
+        + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
+        + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
+}
+
+lemmatizeWordQuery <- function(w) {
+  paste0('[tt/l=', w, ']')
+}
+
+#' Query frequencies of a node and a collocate and calculate collocation association scores
+#'
+#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
+#' based on \code{\link{frequencyQuery}}s for a target worf and a collocate.
+#'
+#' @aliases collocationScoreQuery
+#' @rdname KorAPQuery-class
+#'
+#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
+#' @param node               target word
+#' @param collocate          collocate of target word
+#' @param vc                 string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
+#' @param lemmatizeNodeQuery      logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param leftContextSize    size of the left context window
+#' @param rightContextSize   size of the right context window
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Grund", "triftiger")
+#' }
+#'
+#' \donttest{
+#' library(highcharter)
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#'                         lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#'   hchart(type="spline", hcaes(label, score, group=measure))
+#' }
+#'
+#' @importFrom tidyr pivot_longer
+#' @export
+setMethod("collocationScoreQuery", "KorAPConnection",
+          function(kco,
+                   node,
+                   collocate,
+                   vc = "",
+                   lemmatizeNodeQuery = FALSE,
+                   lemmatizeCollocateQuery = FALSE,
+                   leftContextSize = 5,
+                   rightContextSize = 5) {
+            if (leftContextSize <= 0 && rightContextSize <= 0) {
+              stop("At least one of leftContextSize and rightContextSize must be > 0",
+                   call. = FALSE)
+            }
+
+            if (lemmatizeNodeQuery) {
+              node <- lemmatizeWordQuery(node)
+            }
+
+            if (lemmatizeCollocateQuery) {
+              collocate <- lemmatizeWordQuery(collocate)
+            }
+
+            query <- ""
+
+            if (leftContextSize > 0) {
+              query <-
+                paste0(collocate, " []{0,", leftContextSize - 1, "} ", node,
+                        if (rightContextSize > 0)  " | " else "")
+            }
+
+            if (rightContextSize > 0) {
+              query <-
+                paste0(query, node, " []{0,", rightContextSize - 1, "} ", collocate)
+            }
+
+            w <- leftContextSize + rightContextSize
+
+            tibble(
+              node = node,
+              collocate = collocate,
+              label = queryStringToLabel(vc),
+              vc = vc,
+
+              O = as.double(frequencyQuery(kco, query, vc)$totalResults),
+              webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
+              fx = frequencyQuery(kco, node, vc)$totalResults,
+              fy = frequencyQuery(kco, collocate, vc)$totalResults,
+              N  = frequencyQuery(kco, node, vc)$total,
+              E = w * as.double(fx) * fy / N,
+              MI      = log2(O   / E),
+              MI2     = log2(O ^ 2 / E),
+              MI3     = log2(O ^ 3 / E),
+              logDice = 14 + log2(2 * O / (w * fy + fx)),
+              llr     = ca_ll(fx, fy, O, N, w)
+            ) %>%
+              tidyr::pivot_longer(c(MI, MI2, MI3, logDice, llr),
+                           names_to = "measure",
+                           values_to = "score")
+          })