Association score functions as parameters to collocationScoreQuery also factor out hc_add_onclick_korap_search Change-Id: I48f93761b9bda4e21669a99517c17c55cf3436ee

commit: e20383224d88cb02a65cb800e9cf196588d1d3cc [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Mar 04 18:24:02 2021 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Fri Mar 05 00:42:29 2021 +0100
tree: 63c5ebc566db11d91779b18447969cead761359b
parent: e02fef51e42085feedb8d3f5254c861b9c4721b8 [diff]
diff --git a/DESCRIPTION b/DESCRIPTION
index c6d28c4..776dc04 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION

@@ -26,7 +26,7 @@
 License: BSD_2_clause + file LICENSE
 Encoding: UTF-8
 LazyData: false
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 Imports:
     R.cache,
     broom,
@@ -52,6 +52,7 @@
     'KorAPCorpusStats.R'
     'RKorAPClient.R'
     'KorAPQuery.R'
+    'association-scores.R'
     'ci.R'
     'highcharter-helper.R'
     'misc.R'

diff --git a/NAMESPACE b/NAMESPACE
index 5f1754d..a3c590f 100644
--- a/NAMESPACE
+++ b/NAMESPACE

@@ -6,15 +6,22 @@
 export(bind_cols)
 export(ci)
 export(complete)
+export(defaultAssociationScoreFunctions)
 export(expand_grid)
 export(geom_freq_by_year_ci)
 export(ggplotly)
 export(group_by)
+export(hc_add_onclick_korap_search)
 export(hc_freq_by_year_ci)
 export(ipm)
+export(ll)
+export(logDice)
+export(mi2)
+export(mi3)
 export(mutate)
 export(n)
 export(percent)
+export(pmi)
 export(queryStringToLabel)
 export(select)
 export(summarise)

diff --git a/NEWS.md b/NEWS.md
index c97e269..4393d3a 100644
--- a/NEWS.md
+++ b/NEWS.md

@@ -1,4 +1,7 @@
 # RKorAPClient 0.5.10
+## Changes
+- collocationScoreQuery method added
+- hc_add_onclick_korap_search function added
 
 # RKorAPClient 0.5.9
 

diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 027d119..fa84839 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R

@@ -66,6 +66,7 @@
 setGeneric("fetchRest", function(kqo, ...)  standardGeneric("fetchRest") )
 setGeneric("frequencyQuery", function(kco, ...)  standardGeneric("frequencyQuery") )
 setGeneric("collocationScoreQuery", function(kco, ...)  standardGeneric("collocationScoreQuery") )
+setGeneric("collocationScoreQueryNew", function(kco, ...)  standardGeneric("collocationScoreQueryNew") )
 
 
 maxResultsPerPage <- 50
@@ -385,27 +386,6 @@
 })
 
 
-#'
-#' @importFrom dplyr if_else
-#'
-ca_ll <- function(w1, w2, w12, n, true_window_size) {
-  r1 = as.double(w1) * true_window_size
-  r2 = as.double(n) - r1
-  c1 = w2
-  c2 = n - c1
-  o11 = w12
-  o12 = r1 - o11
-  o21 = c1 - w12
-  o22 = r2 - o21
-  e11 = r1 * c1 / n
-  e12 = r1 * c2 / n
-  e21 = r2 * c1 / n
-  e22 = r2 * c2 / n
-  2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
-        + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
-        + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
-        + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
-}
 
 lemmatizeWordQuery <- function(w) {
   paste0('[tt/l=', w, ']')
@@ -414,7 +394,7 @@
 #' Query frequencies of a node and a collocate and calculate collocation association scores
 #'
 #' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target worf and a collocate.
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
 #'
 #' @aliases collocationScoreQuery
 #' @rdname KorAPQuery-class
@@ -427,6 +407,10 @@
 #' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
 #' @param leftContextSize    size of the left context window
 #' @param rightContextSize   size of the right context window
+#' @param scoreFunctions     named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
+#' @param smoothingConstant  smoothing constant will be added to all observed values
+#'
+#' @return tibble with query KorAP web request URL, all observed values and association scores
 #'
 #' @examples
 #' \donttest{
@@ -435,11 +419,20 @@
 #' }
 #'
 #' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", c("guter", "triftiger"),
+#'    scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+#' }
+#'
+#' \donttest{
 #' library(highcharter)
+#' library(tidyr)
 #' new("KorAPConnection", verbose = TRUE) %>%
 #'   collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
 #'                         lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#'   hchart(type="spline", hcaes(label, score, group=measure))
+#'                          pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
+#'   hchart(type="spline", hcaes(label, score, group=measure)) %>%
+#'   hc_add_onclick_korap_search()
 #' }
 #'
 #' @importFrom tidyr pivot_longer
@@ -452,7 +445,13 @@
                    lemmatizeNodeQuery = FALSE,
                    lemmatizeCollocateQuery = FALSE,
                    leftContextSize = 5,
-                   rightContextSize = 5) {
+                   rightContextSize = 5,
+                   scoreFunctions = defaultAssociationScoreFunctions(),
+                   smoothingConstant = .5
+                   ) {
+            # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+            O1 <- O2 <- O <- N <- E <- w <- 0
+
             if (leftContextSize <= 0 && rightContextSize <= 0) {
               stop("At least one of leftContextSize and rightContextSize must be > 0",
                    call. = FALSE)
@@ -470,36 +469,34 @@
 
             if (leftContextSize > 0) {
               query <-
-                paste0(collocate, " []{0,", leftContextSize - 1, "} ", node,
-                        if (rightContextSize > 0)  " | " else "")
+                paste0(collocate,
+                       if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
+                       node,
+                       if (rightContextSize > 0)  " | ")
             }
 
             if (rightContextSize > 0) {
               query <-
-                paste0(query, node, " []{0,", rightContextSize - 1, "} ", collocate)
+                paste0(query, node,
+                       if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
             }
 
-            w <- leftContextSize + rightContextSize
 
             tibble(
               node = node,
               collocate = collocate,
               label = queryStringToLabel(vc),
               vc = vc,
-
-              O = as.double(frequencyQuery(kco, query, vc)$totalResults),
               webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
-              fx = frequencyQuery(kco, node, vc)$totalResults,
-              fy = frequencyQuery(kco, collocate, vc)$totalResults,
-              N  = frequencyQuery(kco, node, vc)$total,
-              E = w * as.double(fx) * fy / N,
-              MI      = log2(O   / E),
-              MI2     = log2(O ^ 2 / E),
-              MI3     = log2(O ^ 3 / E),
-              logDice = 14 + log2(2 * O / (w * fy + fx)),
-              llr     = ca_ll(fx, fy, O, N, w)
+              w = leftContextSize + rightContextSize,
+              leftContextSize,
+              rightContextSize,
+              N  = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+              O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
+              O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
+              O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+              E = w * as.double(O1) * O2 / N
             ) %>%
-              tidyr::pivot_longer(c(MI, MI2, MI3, logDice, llr),
-                           names_to = "measure",
-                           values_to = "score")
+              mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
+
           })

diff --git a/R/association-scores.R b/R/association-scores.R
new file mode 100644
index 0000000..494477e
--- /dev/null
+++ b/R/association-scores.R

@@ -0,0 +1,124 @@
+#' Default association score functions
+#'
+#' @family association-score-functions
+#'
+#' @return list of default association score functions
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Perlen", c("verziertes", "Säue"),
+#'   scoreFunctions = append(defaultAssociationScoreFunctions(),
+#'      list(localMI = function(O1, O2, O, N, E, window_size) {
+#'                        O * log2(O/E)
+#'                     })))
+#' }
+#'
+defaultAssociationScoreFunctions <- function() {
+  list(pmi=pmi, mi2=mi2, mi3=mi3, logDice=logDice, ll=ll)
+}
+
+#' Pointwise mutual information
+#'
+#' @family association-score-functions
+#'
+#' @param O1            observed absolute frequency of node
+#' @param O2            observed absolute frequency of collocate
+#' @param O             observed absolute frequency of collocation
+#' @param N             corpus size
+#' @param E             expected absolute frequency of collocation (already adjusted to window size)
+#' @param window_size   total window size around node (left neighbour count + right neighbour count)
+#'
+#' @return              association score
+#' @export
+#'
+
+pmi <- function(O1, O2, O, N, E, window_size) {
+  log2(O / E)
+}
+
+#' Pointwise mutual information squared
+#'
+#' @family association-score-functions
+#'
+#' @details
+#' Also referred to as mutual dependency (MD)
+#'
+#' @inheritParams pmi
+#' @export
+#'
+mi2 <- function(O1, O2, O, N, E, window_size) {
+  log2(O ^ 2 / E)
+}
+
+#' Pointwise mutual information cubed
+#'
+#' @family association-score-functions
+#'
+#' @details
+#' Also referred to as log-frequency biased mutual dependency (LFMD)
+#'
+#' @inheritParams pmi
+#' @export
+#'
+#' @references
+#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+#'
+#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+#'
+mi3 <- function(O1, O2, O, N, E, window_size) {
+  log2(O ^ 3 / E)
+}
+
+#' log-Dice coefficient
+#'
+#' @family association-score-functions
+#' @inheritParams pmi
+#' @export
+#'
+#' @examples
+#'
+#' @references
+#' Rychlý, Pavel (2008): \href{http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf}{A lexicographer-friendly association score.} In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
+#'
+
+logDice <-  function(O1, O2, O, N, E, window_size) {
+  14 + log2(2 * O / (window_size * O2 + O1))
+}
+
+
+#' Log likelihood
+#'
+#' @family association-score-functions
+#'
+#' @export
+#'
+#' @importFrom dplyr if_else
+#'
+#' @inheritParams pmi
+#'
+#' @references
+#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+#'
+#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+#' Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
+#'
+ll <- function(O1, O2, O, N, E, window_size) {
+  r1 = as.double(O1) * window_size
+  r2 = as.double(N) - r1
+  c1 = O2
+  c2 = N - c1
+  o11 = O
+  o12 = r1 - o11
+  o21 = c1 - O
+  o22 = r2 - o21
+  e11 = r1 * c1 / N
+  e12 = r1 * c2 / N
+  e21 = r2 * c1 / N
+  e22 = r2 * c2 / N
+  2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
+        + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
+        + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
+        + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
+}

diff --git a/R/highcharter-helper.R b/R/highcharter-helper.R
index 2d4d7c1..2b9a557 100644
--- a/R/highcharter-helper.R
+++ b/R/highcharter-helper.R

@@ -75,14 +75,7 @@
     ) %>%
     hc_xAxis(allowDecimals=FALSE) %>%
     hc_add_theme(hc_theme_google(colors=palette)) %>%
-    hc_plotOptions(
-      series = list(enabled = TRUE),
-      spline = list(cursor = 'pointer', point = list(events = list(
-        click = JS("function() { window.open(this.click, 'korap'); }")
-      ))),
-      line = list(cursor = 'pointer', point = list(events = list(
-        click = JS("function() { window.open(this.click, 'korap'); }")
-      )))) %>%
+    hc_add_onclick_korap_search() %>%
     hc_credits(enabled = TRUE,
                text = "KorAP R Client Package",
                href = "https://github.com/KorAP/RKorAPClient/") %>%
@@ -125,7 +118,7 @@
         year = dat$year,
         value = if (as.alternatives) dat$f else dat$ipm,
         count = dat$totalResults,
-        click = dat$webUIRequestUrl
+        webUIRequestUrl = dat$webUIRequestUrl
       ),
       hcaes(year, value),
       type = type,
@@ -151,6 +144,43 @@
   hc
 }
 
+#' Add KorAP search click events to highchart
+#'
+#' @description
+#' Adds on-click events to data points of highcharts that were constructed with
+#' \code{\link{frequencyQuery}} or \code{\link{collocationScoreQuery}}. Clicks on data points
+#' then launch KorAP web UI queries for the given query term and virtual corpus in
+#' a separate frame.
+#'
+#' @param hc  highchart
+#'
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' library(highcharter)
+#' library(tidyr)
+#'
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#'                         lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#'                          pivot_longer(c("O", "E")) %>%
+#'   hchart(type="spline", hcaes(label, value, group=name)) %>%
+#'   hc_add_onclick_korap_search()
+#' }
+#'
+hc_add_onclick_korap_search <- function(hc) {
+  hc_plotOptions(
+    hc,
+    series = list(enabled = TRUE),
+    spline = list(cursor = 'pointer', point = list(events = list(
+      click = JS("function() { window.open(this.webUIRequestUrl, 'korap'); }")
+    ))),
+    line = list(cursor = 'pointer', point = list(events = list(
+      click = JS("function() { window.open(this.webUIRequestUrl, 'korap'); }")
+    ))))
+}
+
 .onAttach <- function(libname = find.package("RKorAPClient"),
                       pkgname = "RKorAPClient") {
   packageStartupMessage(

diff --git a/man/KorAPQuery-class.Rd b/man/KorAPQuery-class.Rd
index f9acc9f..46eb322 100644
--- a/man/KorAPQuery-class.Rd
+++ b/man/KorAPQuery-class.Rd

@@ -15,6 +15,8 @@
 \alias{frequencyQuery}
 \alias{format.KorAPQuery}
 \alias{show,KorAPQuery-method}
+\alias{collocationScoreQuery,KorAPConnection-method}
+\alias{collocationScoreQuery}
 \title{Class KorAPQuery}
 \usage{
 \S4method{initialize}{KorAPQuery}(
@@ -56,6 +58,19 @@
 \method{format}{KorAPQuery}(x, ...)
 
 \S4method{show}{KorAPQuery}(object)
+
+\S4method{collocationScoreQuery}{KorAPConnection}(
+  kco,
+  node,
+  collocate,
+  vc = "",
+  lemmatizeNodeQuery = FALSE,
+  lemmatizeCollocateQuery = FALSE,
+  leftContextSize = 5,
+  rightContextSize = 5,
+  scoreFunctions = defaultAssociationScoreFunctions(),
+  smoothingConstant = 0.5
+)
 }
 \arguments{
 \item{.Object}{…}
@@ -64,7 +79,7 @@
 
 \item{request}{query part of the request URL}
 
-\item{vc}{definition of a virtual corpus}
+\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
 
 \item{totalResults}{number of hits the query has yielded}
 
@@ -103,9 +118,27 @@
 \item{x}{KorAPQuery object}
 
 \item{object}{KorAPQuery object}
+
+\item{node}{target word}
+
+\item{collocate}{collocate of target word}
+
+\item{lemmatizeNodeQuery}{logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{lemmatizeCollocateQuery}{logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{leftContextSize}{size of the left context window}
+
+\item{rightContextSize}{size of the right context window}
+
+\item{scoreFunctions}{named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}}
+
+\item{smoothingConstant}{smoothing constant will be added to all observed values}
 }
 \value{
 The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
+
+tibble with query KorAP web request URL, all observed values and association scores
 }
 \description{
 \code{KorAPQuery} objects represent the current state of a query to a KorAP server.
@@ -117,6 +150,9 @@
 \code{\link{ci}} to compute a table with the relative frequencies and
 confidence intervals of one ore multiple search terms across one or multiple
 virtual corpora.
+
+\bold{\code{collocationScoreQuery}} computes various collocation association scores
+based on \code{\link{frequencyQuery}}s for a target word and a collocate.
 }
 \examples{
 \donttest{q <- new("KorAPConnection") \%>\% corpusQuery("Ameisenplage") \%>\% fetchNext()
@@ -138,6 +174,28 @@
   frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
 }
 
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+  collocationScoreQuery("Grund", "triftiger")
+}
+
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+collocationScoreQuery("Grund", c("guter", "triftiger"),
+   scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+}
+
+\donttest{
+library(highcharter)
+library(tidyr)
+new("KorAPConnection", verbose = TRUE) \%>\%
+  collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+                        lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
+                         pivot_longer(14:last_col(), names_to = "measure", values_to = "score") \%>\%
+  hchart(type="spline", hcaes(label, score, group=measure)) \%>\%
+  hc_add_onclick_korap_search()
+}
+
 }
 \references{
 \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}

diff --git a/man/defaultAssociationScoreFunctions.Rd b/man/defaultAssociationScoreFunctions.Rd
new file mode 100644
index 0000000..09a87d8
--- /dev/null
+++ b/man/defaultAssociationScoreFunctions.Rd

@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{defaultAssociationScoreFunctions}
+\alias{defaultAssociationScoreFunctions}
+\title{Default association score functions}
+\usage{
+defaultAssociationScoreFunctions()
+}
+\value{
+list of default association score functions
+}
+\description{
+Default association score functions
+}
+\examples{
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+collocationScoreQuery("Perlen", c("verziertes", "Säue"),
+  scoreFunctions = append(defaultAssociationScoreFunctions(),
+     list(localMI = function(O1, O2, O, N, E, window_size) {
+                       O * log2(O/E)
+                    })))
+}
+
+}
+\seealso{
+Other association-score-functions: 
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}

diff --git a/man/hc_add_onclick_korap_search.Rd b/man/hc_add_onclick_korap_search.Rd
new file mode 100644
index 0000000..4e3ac06
--- /dev/null
+++ b/man/hc_add_onclick_korap_search.Rd

@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/highcharter-helper.R
+\name{hc_add_onclick_korap_search}
+\alias{hc_add_onclick_korap_search}
+\title{Add KorAP search click events to highchart}
+\usage{
+hc_add_onclick_korap_search(hc)
+}
+\arguments{
+\item{hc}{highchart}
+}
+\description{
+Adds on-click events to data points of highcharts that were constructed with
+\code{\link{frequencyQuery}} or \code{\link{collocationScoreQuery}}. Clicks on data points
+then launch KorAP web UI queries for the given query term and virtual corpus in
+a separate frame.
+}
+\examples{
+\donttest{
+library(highcharter)
+new("KorAPConnection", verbose = TRUE) \%>\%
+  collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+                        lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
+                         pivot_longer(c("O", "E")) \%>\%
+  hchart(type="spline", hcaes(label, value, group=name)) \%>\%
+  hc_add_onclick_korap_search()
+}
+
+}

diff --git a/man/ll.Rd b/man/ll.Rd
new file mode 100644
index 0000000..95f2107
--- /dev/null
+++ b/man/ll.Rd

@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{ll}
+\alias{ll}
+\title{Log likelihood}
+\usage{
+ll(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Log likelihood
+}
+\references{
+Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+
+Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
+}
+\seealso{
+Other association-score-functions: 
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}

diff --git a/man/logDice.Rd b/man/logDice.Rd
new file mode 100644
index 0000000..9ecdc24
--- /dev/null
+++ b/man/logDice.Rd

@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{logDice}
+\alias{logDice}
+\title{log-Dice coefficient}
+\usage{
+logDice(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+log-Dice coefficient
+}
+\examples{
+
+}
+\references{
+Rychlý, Pavel (2008): \href{http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf}{A lexicographer-friendly association score.} In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
+}
+\seealso{
+Other association-score-functions: 
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}

diff --git a/man/mi2.Rd b/man/mi2.Rd
new file mode 100644
index 0000000..4cbbab4
--- /dev/null
+++ b/man/mi2.Rd

@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{mi2}
+\alias{mi2}
+\title{Pointwise mutual information squared}
+\usage{
+mi2(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Pointwise mutual information squared
+}
+\details{
+Also referred to as mutual dependency (MD)
+}
+\seealso{
+Other association-score-functions: 
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}

diff --git a/man/mi3.Rd b/man/mi3.Rd
new file mode 100644
index 0000000..7c8815e
--- /dev/null
+++ b/man/mi3.Rd

@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{mi3}
+\alias{mi3}
+\title{Pointwise mutual information cubed}
+\usage{
+mi3(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Pointwise mutual information cubed
+}
+\details{
+Also referred to as log-frequency biased mutual dependency (LFMD)
+}
+\references{
+Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+
+Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+}
+\seealso{
+Other association-score-functions: 
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}

diff --git a/man/pmi.Rd b/man/pmi.Rd
new file mode 100644
index 0000000..015e00d
--- /dev/null
+++ b/man/pmi.Rd

@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{pmi}
+\alias{pmi}
+\title{Pointwise mutual information}
+\usage{
+pmi(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\value{
+association score
+}
+\description{
+Pointwise mutual information
+}
+\seealso{
+Other association-score-functions: 
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()}
+}
+\concept{association-score-functions}

diff --git a/tests/testthat/test-association-score-functions.R b/tests/testthat/test-association-score-functions.R
new file mode 100644
index 0000000..404478b
--- /dev/null
+++ b/tests/testthat/test-association-score-functions.R

@@ -0,0 +1,16 @@
+test_that("association scores are calculated correctly", {
+  x <- sapply(defaultAssociationScoreFunctions(), mapply, 4258869, 2165, 32, 21304641202, 4.327907, 10)
+  expect_that(x[["ll"]], equals(73.05347, tolerance=0.01))
+  expect_that(x[["pmi"]], equals(2.886331, tolerance=0.01))
+  expect_that(x[["mi2"]], equals(7.886331, tolerance=0.01))
+  expect_that(x[["mi3"]], equals(12.886331, tolerance=0.01))
+  expect_that(x[["logDice"]], equals(-2.029354, tolerance=0.01))
+
+  x <- sapply(defaultAssociationScoreFunctions(), mapply, 4258869, 2165, 0, 21304641202, 4.327907, 10)
+  expect_that(x[["ll"]], equals(8.664477, tolerance=0.01))
+  expect_equal(x[["pmi"]], -Inf)
+  expect_equal(x[["mi2"]], -Inf)
+  expect_equal(x[["mi3"]], -Inf)
+  expect_equal(x[["logDice"]], -Inf)
+
+})

diff --git a/tests/testthat/test-corpusQuery.R b/tests/testthat/test-corpusQuery.R
index 5f84fab..72b9f36 100644
--- a/tests/testthat/test-corpusQuery.R
+++ b/tests/testthat/test-corpusQuery.R

@@ -4,6 +4,17 @@
   expect_output(frequencyQuery(kco, "Ameisenplage", "pubDate since 2014"), "cached")
 })
 
+test_that("collocationScoreQuery works", {
+  kco <- new("KorAPConnection", cache = TRUE, verbose = TRUE)
+  df <- collocationScoreQuery(kco,"Ameisenplage", "heimgesucht", leftContextSize=0, rightContextSize=1)
+  expect_gt(df$logDice, 1)
+  expect_equal(df$ll, ll(df$O1, df$O2, df$O, df$N, df$E, df$w))
+  expect_equal(df$pmi, pmi(df$O1, df$O2, df$O, df$N, df$E, df$w))
+  expect_equal(df$mi2, mi2(df$O1, df$O2, df$O, df$N, df$E, df$w))
+  expect_equal(df$mi3, mi3(df$O1, df$O2, df$O, df$N, df$E, df$w))
+  expect_equal(df$logDice, logDice(df$O1, df$O2, df$O, df$N, df$E, df$w))
+})
+
 test_that("Cache depends on indexRevision (handling also NULL values)", {
   kco <- new("KorAPConnection", cache = TRUE, verbose = TRUE)
   kco@indexRevision <- NULL
commit	e20383224d88cb02a65cb800e9cf196588d1d3cc	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Mar 04 18:24:02 2021 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Mar 05 00:42:29 2021 +0100
tree	63c5ebc566db11d91779b18447969cead761359b
parent	e02fef51e42085feedb8d3f5254c861b9c4721b8 [diff]