Association score funtions as parameters to collocationScoreQuery
also factor out hc_add_onclick_korap_search
Change-Id: I48f93761b9bda4e21669a99517c17c55cf3436ee
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 027d119..fa84839 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -66,6 +66,7 @@
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
+setGeneric("collocationScoreQueryNew", function(kco, ...) standardGeneric("collocationScoreQueryNew") )
maxResultsPerPage <- 50
@@ -385,27 +386,6 @@
})
-#'
-#' @importFrom dplyr if_else
-#'
-ca_ll <- function(w1, w2, w12, n, true_window_size) {
- r1 = as.double(w1) * true_window_size
- r2 = as.double(n) - r1
- c1 = w2
- c2 = n - c1
- o11 = w12
- o12 = r1 - o11
- o21 = c1 - w12
- o22 = r2 - o21
- e11 = r1 * c1 / n
- e12 = r1 * c2 / n
- e21 = r2 * c1 / n
- e22 = r2 * c2 / n
- 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
- + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
- + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
- + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
-}
lemmatizeWordQuery <- function(w) {
paste0('[tt/l=', w, ']')
@@ -414,7 +394,7 @@
#' Query frequencies of a node and a collocate and calculate collocation association scores
#'
#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target worf and a collocate.
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
#'
#' @aliases collocationScoreQuery
#' @rdname KorAPQuery-class
@@ -427,6 +407,10 @@
#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
#' @param leftContextSize size of the left context window
#' @param rightContextSize size of the right context window
+#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
+#' @param smoothingConstant smoothing constant will be added to all observed values
+#'
+#' @return tibble with query KorAP web request URL, all observed values and association scores
#'
#' @examples
#' \donttest{
@@ -435,11 +419,20 @@
#' }
#'
#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", c("guter", "triftiger"),
+#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+#' }
+#'
+#' \donttest{
#' library(highcharter)
+#' library(tidyr)
#' new("KorAPConnection", verbose = TRUE) %>%
#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#' hchart(type="spline", hcaes(label, score, group=measure))
+#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
+#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
+#' hc_add_onclick_korap_search()
#' }
#'
#' @importFrom tidyr pivot_longer
@@ -452,7 +445,13 @@
lemmatizeNodeQuery = FALSE,
lemmatizeCollocateQuery = FALSE,
leftContextSize = 5,
- rightContextSize = 5) {
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = .5
+ ) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ O1 <- O2 <- O <- N <- E <- w <- 0
+
if (leftContextSize <= 0 && rightContextSize <= 0) {
stop("At least one of leftContextSize and rightContextSize must be > 0",
call. = FALSE)
@@ -470,36 +469,34 @@
if (leftContextSize > 0) {
query <-
- paste0(collocate, " []{0,", leftContextSize - 1, "} ", node,
- if (rightContextSize > 0) " | " else "")
+ paste0(collocate,
+ if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
+ node,
+ if (rightContextSize > 0) " | ")
}
if (rightContextSize > 0) {
query <-
- paste0(query, node, " []{0,", rightContextSize - 1, "} ", collocate)
+ paste0(query, node,
+ if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
}
- w <- leftContextSize + rightContextSize
tibble(
node = node,
collocate = collocate,
label = queryStringToLabel(vc),
vc = vc,
-
- O = as.double(frequencyQuery(kco, query, vc)$totalResults),
webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
- fx = frequencyQuery(kco, node, vc)$totalResults,
- fy = frequencyQuery(kco, collocate, vc)$totalResults,
- N = frequencyQuery(kco, node, vc)$total,
- E = w * as.double(fx) * fy / N,
- MI = log2(O / E),
- MI2 = log2(O ^ 2 / E),
- MI3 = log2(O ^ 3 / E),
- logDice = 14 + log2(2 * O / (w * fy + fx)),
- llr = ca_ll(fx, fy, O, N, w)
+ w = leftContextSize + rightContextSize,
+ leftContextSize,
+ rightContextSize,
+ N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+ O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
+ O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
+ O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+ E = w * as.double(O1) * O2 / N
) %>%
- tidyr::pivot_longer(c(MI, MI2, MI3, logDice, llr),
- names_to = "measure",
- values_to = "score")
+ mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
+
})