Association score funtions as parameters to collocationScoreQuery
also factor out hc_add_onclick_korap_search
Change-Id: I48f93761b9bda4e21669a99517c17c55cf3436ee
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 027d119..fa84839 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -66,6 +66,7 @@
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
+setGeneric("collocationScoreQueryNew", function(kco, ...) standardGeneric("collocationScoreQueryNew") )
maxResultsPerPage <- 50
@@ -385,27 +386,6 @@
})
-#'
-#' @importFrom dplyr if_else
-#'
-ca_ll <- function(w1, w2, w12, n, true_window_size) {
- r1 = as.double(w1) * true_window_size
- r2 = as.double(n) - r1
- c1 = w2
- c2 = n - c1
- o11 = w12
- o12 = r1 - o11
- o21 = c1 - w12
- o22 = r2 - o21
- e11 = r1 * c1 / n
- e12 = r1 * c2 / n
- e21 = r2 * c1 / n
- e22 = r2 * c2 / n
- 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
- + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
- + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
- + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
-}
lemmatizeWordQuery <- function(w) {
paste0('[tt/l=', w, ']')
@@ -414,7 +394,7 @@
#' Query frequencies of a node and a collocate and calculate collocation association scores
#'
#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target worf and a collocate.
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
#'
#' @aliases collocationScoreQuery
#' @rdname KorAPQuery-class
@@ -427,6 +407,10 @@
#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
#' @param leftContextSize size of the left context window
#' @param rightContextSize size of the right context window
+#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
+#' @param smoothingConstant smoothing constant will be added to all observed values
+#'
+#' @return tibble with query KorAP web request URL, all observed values and association scores
#'
#' @examples
#' \donttest{
@@ -435,11 +419,20 @@
#' }
#'
#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", c("guter", "triftiger"),
+#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+#' }
+#'
+#' \donttest{
#' library(highcharter)
+#' library(tidyr)
#' new("KorAPConnection", verbose = TRUE) %>%
#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#' hchart(type="spline", hcaes(label, score, group=measure))
+#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
+#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
+#' hc_add_onclick_korap_search()
#' }
#'
#' @importFrom tidyr pivot_longer
@@ -452,7 +445,13 @@
lemmatizeNodeQuery = FALSE,
lemmatizeCollocateQuery = FALSE,
leftContextSize = 5,
- rightContextSize = 5) {
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = .5
+ ) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ O1 <- O2 <- O <- N <- E <- w <- 0
+
if (leftContextSize <= 0 && rightContextSize <= 0) {
stop("At least one of leftContextSize and rightContextSize must be > 0",
call. = FALSE)
@@ -470,36 +469,34 @@
if (leftContextSize > 0) {
query <-
- paste0(collocate, " []{0,", leftContextSize - 1, "} ", node,
- if (rightContextSize > 0) " | " else "")
+ paste0(collocate,
+ if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
+ node,
+ if (rightContextSize > 0) " | ")
}
if (rightContextSize > 0) {
query <-
- paste0(query, node, " []{0,", rightContextSize - 1, "} ", collocate)
+ paste0(query, node,
+ if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
}
- w <- leftContextSize + rightContextSize
tibble(
node = node,
collocate = collocate,
label = queryStringToLabel(vc),
vc = vc,
-
- O = as.double(frequencyQuery(kco, query, vc)$totalResults),
webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
- fx = frequencyQuery(kco, node, vc)$totalResults,
- fy = frequencyQuery(kco, collocate, vc)$totalResults,
- N = frequencyQuery(kco, node, vc)$total,
- E = w * as.double(fx) * fy / N,
- MI = log2(O / E),
- MI2 = log2(O ^ 2 / E),
- MI3 = log2(O ^ 3 / E),
- logDice = 14 + log2(2 * O / (w * fy + fx)),
- llr = ca_ll(fx, fy, O, N, w)
+ w = leftContextSize + rightContextSize,
+ leftContextSize,
+ rightContextSize,
+ N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+ O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
+ O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
+ O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+ E = w * as.double(O1) * O2 / N
) %>%
- tidyr::pivot_longer(c(MI, MI2, MI3, logDice, llr),
- names_to = "measure",
- values_to = "score")
+ mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
+
})
diff --git a/R/association-scores.R b/R/association-scores.R
new file mode 100644
index 0000000..494477e
--- /dev/null
+++ b/R/association-scores.R
@@ -0,0 +1,124 @@
+#' Default association score functions
+#'
+#' @family association-score-functions
+#'
+#' @return list of default association score functions
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Perlen", c("verziertes", "Säue"),
+#' scoreFunctions = append(defaultAssociationScoreFunctions(),
+#' list(localMI = function(O1, O2, O, N, E, window_size) {
+#' O * log2(O/E)
+#' })))
+#' }
+#'
+defaultAssociationScoreFunctions <- function() {
+ list(pmi=pmi, mi2=mi2, mi3=mi3, logDice=logDice, ll=ll)
+}
+
+#' Pointwise mutual information
+#'
+#' @family association-score-functions
+#'
+#' @param O1 observed absolute frequency of node
+#' @param O2 observed absolute frequency of collocate
+#' @param O observed absolute frequency of collocation
+#' @param N corpus size
+#' @param E expected absolute frequency of collocation (already adjusted to window size)
+#' @param window_size total window size around node (left neighbour count + right neighbour count)
+#'
+#' @return association score
+#' @export
+#'
+
+pmi <- function(O1, O2, O, N, E, window_size) {
+ log2(O / E)
+}
+
+#' Pointwise mutual information squared
+#'
+#' @family association-score-functions
+#'
+#' @details
+#' Also referenced to as mutual dependency (MD)
+#'
+#' @inheritParams pmi
+#' @export
+#'
+mi2 <- function(O1, O2, O, N, E, window_size) {
+ log2(O ^ 2 / E)
+}
+
+#' Pointwise mutual information cubed
+#'
+#' @family association-score-functions
+#'
+#' @details
+#' Also referenced to as log-frequency biased mutual dependency (LFMD)
+#'
+#' @inheritParams pmi
+#' @export
+#'
+#' @references
+#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+#'
+#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+#'
+mi3 <- function(O1, O2, O, N, E, window_size) {
+ log2(O ^ 3 / E)
+}
+
+#' log-Dice coefficient
+#'
+#' @family association-score-functions
+#' @inheritParams pmi
+#' @export
+#'
+#' @examples
+#'
+#' @references
+#' Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
+#'
+
+logDice <- function(O1, O2, O, N, E, window_size) {
+ 14 + log2(2 * O / (window_size * O2 + O1))
+}
+
+
+#' Log likelihood
+#'
+#' @family association-score-functions
+#'
+#' @export
+#'
+#' @importFrom dplyr if_else
+#'
+#' @inheritParams pmi
+#'
+#' @references
+#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+#'
+#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+#' Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
+#'
+ll <- function(O1, O2, O, N, E, window_size) {
+ r1 = as.double(O1) * window_size
+ r2 = as.double(N) - r1
+ c1 = O2
+ c2 = N - c1
+ o11 = O
+ o12 = r1 - o11
+ o21 = c1 - O
+ o22 = r2 - o21
+ e11 = r1 * c1 / N
+ e12 = r1 * c2 / N
+ e21 = r2 * c1 / N
+ e22 = r2 * c2 / N
+ 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
+ + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
+ + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
+ + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
+}
diff --git a/R/highcharter-helper.R b/R/highcharter-helper.R
index 2d4d7c1..2b9a557 100644
--- a/R/highcharter-helper.R
+++ b/R/highcharter-helper.R
@@ -75,14 +75,7 @@
) %>%
hc_xAxis(allowDecimals=FALSE) %>%
hc_add_theme(hc_theme_google(colors=palette)) %>%
- hc_plotOptions(
- series = list(enabled = TRUE),
- spline = list(cursor = 'pointer', point = list(events = list(
- click = JS("function() { window.open(this.click, 'korap'); }")
- ))),
- line = list(cursor = 'pointer', point = list(events = list(
- click = JS("function() { window.open(this.click, 'korap'); }")
- )))) %>%
+ hc_add_onclick_korap_search() %>%
hc_credits(enabled = TRUE,
text = "KorAP R Client Package",
href = "https://github.com/KorAP/RKorAPClient/") %>%
@@ -125,7 +118,7 @@
year = dat$year,
value = if (as.alternatives) dat$f else dat$ipm,
count = dat$totalResults,
- click = dat$webUIRequestUrl
+ webUIRequestUrl = dat$webUIRequestUrl
),
hcaes(year, value),
type = type,
@@ -151,6 +144,43 @@
hc
}
+#' Add KorAP search click events to highchart
+#'
+#' @description
+#' Adds on-click events to data points of highcarts that were constructed with
+#' \ref{frequencyQuery} or ref \ref{collocationScoreQuery}. Clicks on data points
+#' then launch KorAP web UI queries for the given query term and virtual corpus in
+#' a separate frame.
+#'
+#' @param hc highchart
+#'
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' library(highcharter)
+#' library(tidyr)
+#'
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#' pivot_longer(c("O", "E")) %>%
+#' hchart(type="spline", hcaes(label, value, group=name)) %>%
+#' hc_add_onclick_korap_search()
+#' }
+#'
+hc_add_onclick_korap_search <- function(hc) {
+ hc_plotOptions(
+ hc,
+ series = list(enabled = TRUE),
+ spline = list(cursor = 'pointer', point = list(events = list(
+ click = JS("function() { window.open(this.webUIRequestUrl, 'korap'); }")
+ ))),
+ line = list(cursor = 'pointer', point = list(events = list(
+ click = JS("function() { window.open(this.webUIRequestUrl, 'korap'); }")
+ ))))
+}
+
.onAttach <- function(libname = find.package("RKorAPClient"),
pkgname = "RKorAPClient") {
packageStartupMessage(