Association score funtions as parameters to collocationScoreQuery
also factor out hc_add_onclick_korap_search
Change-Id: I48f93761b9bda4e21669a99517c17c55cf3436ee
diff --git a/DESCRIPTION b/DESCRIPTION
index c6d28c4..776dc04 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -26,7 +26,7 @@
License: BSD_2_clause + file LICENSE
Encoding: UTF-8
LazyData: false
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
Imports:
R.cache,
broom,
@@ -52,6 +52,7 @@
'KorAPCorpusStats.R'
'RKorAPClient.R'
'KorAPQuery.R'
+ 'association-scores.R'
'ci.R'
'highcharter-helper.R'
'misc.R'
diff --git a/NAMESPACE b/NAMESPACE
index 5f1754d..a3c590f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,15 +6,22 @@
export(bind_cols)
export(ci)
export(complete)
+export(defaultAssociationScoreFunctions)
export(expand_grid)
export(geom_freq_by_year_ci)
export(ggplotly)
export(group_by)
+export(hc_add_onclick_korap_search)
export(hc_freq_by_year_ci)
export(ipm)
+export(ll)
+export(logDice)
+export(mi2)
+export(mi3)
export(mutate)
export(n)
export(percent)
+export(pmi)
export(queryStringToLabel)
export(select)
export(summarise)
diff --git a/NEWS.md b/NEWS.md
index c97e269..4393d3a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,7 @@
# RKorAPClient 0.5.10
+## Changes
+- collocationScoreQuery method added
+- hc_add_onclick_korap_search function added
# RKorAPClient 0.5.9
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 027d119..fa84839 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -66,6 +66,7 @@
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
+setGeneric("collocationScoreQueryNew", function(kco, ...) standardGeneric("collocationScoreQueryNew") )
maxResultsPerPage <- 50
@@ -385,27 +386,6 @@
})
-#'
-#' @importFrom dplyr if_else
-#'
-ca_ll <- function(w1, w2, w12, n, true_window_size) {
- r1 = as.double(w1) * true_window_size
- r2 = as.double(n) - r1
- c1 = w2
- c2 = n - c1
- o11 = w12
- o12 = r1 - o11
- o21 = c1 - w12
- o22 = r2 - o21
- e11 = r1 * c1 / n
- e12 = r1 * c2 / n
- e21 = r2 * c1 / n
- e22 = r2 * c2 / n
- 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
- + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
- + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
- + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
-}
lemmatizeWordQuery <- function(w) {
paste0('[tt/l=', w, ']')
@@ -414,7 +394,7 @@
#' Query frequencies of a node and a collocate and calculate collocation association scores
#'
#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target worf and a collocate.
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
#'
#' @aliases collocationScoreQuery
#' @rdname KorAPQuery-class
@@ -427,6 +407,10 @@
#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
#' @param leftContextSize size of the left context window
#' @param rightContextSize size of the right context window
+#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
+#' @param smoothingConstant smoothing constant will be added to all observed values
+#'
+#' @return tibble with query KorAP web request URL, all observed values and association scores
#'
#' @examples
#' \donttest{
@@ -435,11 +419,20 @@
#' }
#'
#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", c("guter", "triftiger"),
+#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+#' }
+#'
+#' \donttest{
#' library(highcharter)
+#' library(tidyr)
#' new("KorAPConnection", verbose = TRUE) %>%
#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#' hchart(type="spline", hcaes(label, score, group=measure))
+#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
+#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
+#' hc_add_onclick_korap_search()
#' }
#'
#' @importFrom tidyr pivot_longer
@@ -452,7 +445,13 @@
lemmatizeNodeQuery = FALSE,
lemmatizeCollocateQuery = FALSE,
leftContextSize = 5,
- rightContextSize = 5) {
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = .5
+ ) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ O1 <- O2 <- O <- N <- E <- w <- 0
+
if (leftContextSize <= 0 && rightContextSize <= 0) {
stop("At least one of leftContextSize and rightContextSize must be > 0",
call. = FALSE)
@@ -470,36 +469,34 @@
if (leftContextSize > 0) {
query <-
- paste0(collocate, " []{0,", leftContextSize - 1, "} ", node,
- if (rightContextSize > 0) " | " else "")
+ paste0(collocate,
+ if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
+ node,
+ if (rightContextSize > 0) " | ")
}
if (rightContextSize > 0) {
query <-
- paste0(query, node, " []{0,", rightContextSize - 1, "} ", collocate)
+ paste0(query, node,
+ if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
}
- w <- leftContextSize + rightContextSize
tibble(
node = node,
collocate = collocate,
label = queryStringToLabel(vc),
vc = vc,
-
- O = as.double(frequencyQuery(kco, query, vc)$totalResults),
webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
- fx = frequencyQuery(kco, node, vc)$totalResults,
- fy = frequencyQuery(kco, collocate, vc)$totalResults,
- N = frequencyQuery(kco, node, vc)$total,
- E = w * as.double(fx) * fy / N,
- MI = log2(O / E),
- MI2 = log2(O ^ 2 / E),
- MI3 = log2(O ^ 3 / E),
- logDice = 14 + log2(2 * O / (w * fy + fx)),
- llr = ca_ll(fx, fy, O, N, w)
+ w = leftContextSize + rightContextSize,
+ leftContextSize,
+ rightContextSize,
+ N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+ O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
+ O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
+ O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+ E = w * as.double(O1) * O2 / N
) %>%
- tidyr::pivot_longer(c(MI, MI2, MI3, logDice, llr),
- names_to = "measure",
- values_to = "score")
+ mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
+
})
diff --git a/R/association-scores.R b/R/association-scores.R
new file mode 100644
index 0000000..494477e
--- /dev/null
+++ b/R/association-scores.R
@@ -0,0 +1,124 @@
+#' Default association score functions
+#'
+#' @family association-score-functions
+#'
+#' @return list of default association score functions
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Perlen", c("verziertes", "Säue"),
+#' scoreFunctions = append(defaultAssociationScoreFunctions(),
+#' list(localMI = function(O1, O2, O, N, E, window_size) {
+#' O * log2(O/E)
+#' })))
+#' }
+#'
+defaultAssociationScoreFunctions <- function() {
+ list(pmi=pmi, mi2=mi2, mi3=mi3, logDice=logDice, ll=ll)
+}
+
+#' Pointwise mutual information
+#'
+#' @family association-score-functions
+#'
+#' @param O1 observed absolute frequency of node
+#' @param O2 observed absolute frequency of collocate
+#' @param O observed absolute frequency of collocation
+#' @param N corpus size
+#' @param E expected absolute frequency of collocation (already adjusted to window size)
+#' @param window_size total window size around node (left neighbour count + right neighbour count)
+#'
+#' @return association score
+#' @export
+#'
+
+pmi <- function(O1, O2, O, N, E, window_size) {
+ log2(O / E)
+}
+
+#' Pointwise mutual information squared
+#'
+#' @family association-score-functions
+#'
+#' @details
+#' Also referenced to as mutual dependency (MD)
+#'
+#' @inheritParams pmi
+#' @export
+#'
+mi2 <- function(O1, O2, O, N, E, window_size) {
+ log2(O ^ 2 / E)
+}
+
+#' Pointwise mutual information cubed
+#'
+#' @family association-score-functions
+#'
+#' @details
+#' Also referenced to as log-frequency biased mutual dependency (LFMD)
+#'
+#' @inheritParams pmi
+#' @export
+#'
+#' @references
+#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+#'
+#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+#'
+mi3 <- function(O1, O2, O, N, E, window_size) {
+ log2(O ^ 3 / E)
+}
+
+#' log-Dice coefficient
+#'
+#' @family association-score-functions
+#' @inheritParams pmi
+#' @export
+#'
+#' @examples
+#'
+#' @references
+#' Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
+#'
+
+logDice <- function(O1, O2, O, N, E, window_size) {
+ 14 + log2(2 * O / (window_size * O2 + O1))
+}
+
+
+#' Log likelihood
+#'
+#' @family association-score-functions
+#'
+#' @export
+#'
+#' @importFrom dplyr if_else
+#'
+#' @inheritParams pmi
+#'
+#' @references
+#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+#'
+#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+#' Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
+#'
+ll <- function(O1, O2, O, N, E, window_size) {
+ r1 = as.double(O1) * window_size
+ r2 = as.double(N) - r1
+ c1 = O2
+ c2 = N - c1
+ o11 = O
+ o12 = r1 - o11
+ o21 = c1 - O
+ o22 = r2 - o21
+ e11 = r1 * c1 / N
+ e12 = r1 * c2 / N
+ e21 = r2 * c1 / N
+ e22 = r2 * c2 / N
+ 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
+ + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
+ + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
+ + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
+}
diff --git a/R/highcharter-helper.R b/R/highcharter-helper.R
index 2d4d7c1..2b9a557 100644
--- a/R/highcharter-helper.R
+++ b/R/highcharter-helper.R
@@ -75,14 +75,7 @@
) %>%
hc_xAxis(allowDecimals=FALSE) %>%
hc_add_theme(hc_theme_google(colors=palette)) %>%
- hc_plotOptions(
- series = list(enabled = TRUE),
- spline = list(cursor = 'pointer', point = list(events = list(
- click = JS("function() { window.open(this.click, 'korap'); }")
- ))),
- line = list(cursor = 'pointer', point = list(events = list(
- click = JS("function() { window.open(this.click, 'korap'); }")
- )))) %>%
+ hc_add_onclick_korap_search() %>%
hc_credits(enabled = TRUE,
text = "KorAP R Client Package",
href = "https://github.com/KorAP/RKorAPClient/") %>%
@@ -125,7 +118,7 @@
year = dat$year,
value = if (as.alternatives) dat$f else dat$ipm,
count = dat$totalResults,
- click = dat$webUIRequestUrl
+ webUIRequestUrl = dat$webUIRequestUrl
),
hcaes(year, value),
type = type,
@@ -151,6 +144,43 @@
hc
}
+#' Add KorAP search click events to highchart
+#'
+#' @description
+#' Adds on-click events to data points of highcarts that were constructed with
+#' \ref{frequencyQuery} or ref \ref{collocationScoreQuery}. Clicks on data points
+#' then launch KorAP web UI queries for the given query term and virtual corpus in
+#' a separate frame.
+#'
+#' @param hc highchart
+#'
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' library(highcharter)
+#' library(tidyr)
+#'
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#' pivot_longer(c("O", "E")) %>%
+#' hchart(type="spline", hcaes(label, value, group=name)) %>%
+#' hc_add_onclick_korap_search()
+#' }
+#'
+hc_add_onclick_korap_search <- function(hc) {
+ hc_plotOptions(
+ hc,
+ series = list(enabled = TRUE),
+ spline = list(cursor = 'pointer', point = list(events = list(
+ click = JS("function() { window.open(this.webUIRequestUrl, 'korap'); }")
+ ))),
+ line = list(cursor = 'pointer', point = list(events = list(
+ click = JS("function() { window.open(this.webUIRequestUrl, 'korap'); }")
+ ))))
+}
+
.onAttach <- function(libname = find.package("RKorAPClient"),
pkgname = "RKorAPClient") {
packageStartupMessage(
diff --git a/man/KorAPQuery-class.Rd b/man/KorAPQuery-class.Rd
index f9acc9f..46eb322 100644
--- a/man/KorAPQuery-class.Rd
+++ b/man/KorAPQuery-class.Rd
@@ -15,6 +15,8 @@
\alias{frequencyQuery}
\alias{format.KorAPQuery}
\alias{show,KorAPQuery-method}
+\alias{collocationScoreQuery,KorAPConnection-method}
+\alias{collocationScoreQuery}
\title{Class KorAPQuery}
\usage{
\S4method{initialize}{KorAPQuery}(
@@ -56,6 +58,19 @@
\method{format}{KorAPQuery}(x, ...)
\S4method{show}{KorAPQuery}(object)
+
+\S4method{collocationScoreQuery}{KorAPConnection}(
+ kco,
+ node,
+ collocate,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ lemmatizeCollocateQuery = FALSE,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = 0.5
+)
}
\arguments{
\item{.Object}{…}
@@ -64,7 +79,7 @@
\item{request}{query part of the request URL}
-\item{vc}{definition of a virtual corpus}
+\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
\item{totalResults}{number of hits the query has yielded}
@@ -103,9 +118,27 @@
\item{x}{KorAPQuery object}
\item{object}{KorAPQuery object}
+
+\item{node}{target word}
+
+\item{collocate}{collocate of target word}
+
+\item{lemmatizeNodeQuery}{logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{lemmatizeCollocateQuery}{logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{leftContextSize}{size of the left context window}
+
+\item{rightContextSize}{size of the right context window}
+
+\item{scoreFunctions}{named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}}
+
+\item{smoothingConstant}{smoothing constant will be added to all observed values}
}
\value{
The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
+
+tibble with query KorAP web request URL, all observed values and association scores
}
\description{
\code{KorAPQuery} objects represent the current state of a query to a KorAP server.
@@ -117,6 +150,9 @@
\code{\link{ci}} to compute a table with the relative frequencies and
confidence intervals of one ore multiple search terms across one or multiple
virtual corpora.
+
+\bold{\code{collocationScoreQuery}} computes various collocation association scores
+based on \code{\link{frequencyQuery}}s for a target word and a collocate.
}
\examples{
\donttest{q <- new("KorAPConnection") \%>\% corpusQuery("Ameisenplage") \%>\% fetchNext()
@@ -138,6 +174,28 @@
frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
}
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Grund", "triftiger")
+}
+
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+collocationScoreQuery("Grund", c("guter", "triftiger"),
+ scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+}
+
+\donttest{
+library(highcharter)
+library(tidyr)
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+ lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
+ pivot_longer(14:last_col(), names_to = "measure", values_to = "score") \%>\%
+ hchart(type="spline", hcaes(label, score, group=measure)) \%>\%
+ hc_add_onclick_korap_search()
+}
+
}
\references{
\url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
diff --git a/man/defaultAssociationScoreFunctions.Rd b/man/defaultAssociationScoreFunctions.Rd
new file mode 100644
index 0000000..09a87d8
--- /dev/null
+++ b/man/defaultAssociationScoreFunctions.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{defaultAssociationScoreFunctions}
+\alias{defaultAssociationScoreFunctions}
+\title{Default association score functions}
+\usage{
+defaultAssociationScoreFunctions()
+}
+\value{
+list of default association score functions
+}
+\description{
+Default association score functions
+}
+\examples{
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+collocationScoreQuery("Perlen", c("verziertes", "Säue"),
+ scoreFunctions = append(associationScoreFunctions(),
+ list(localMI = function(O1, O2, O, N, E, window_size) {
+ O * log2(O/E)
+ })))
+}
+
+}
+\seealso{
+Other association-score-functions:
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/hc_add_onclick_korap_search.Rd b/man/hc_add_onclick_korap_search.Rd
new file mode 100644
index 0000000..4e3ac06
--- /dev/null
+++ b/man/hc_add_onclick_korap_search.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/highcharter-helper.R
+\name{hc_add_onclick_korap_search}
+\alias{hc_add_onclick_korap_search}
+\title{Add KorAP search click events to highchart}
+\usage{
+hc_add_onclick_korap_search(hc)
+}
+\arguments{
+\item{hc}{highchart}
+}
+\description{
+Adds on-click events to data points of highcarts that were constructed with
+\ref{frequencyQuery} or ref \ref{collocationScoreQuery}. Clicks on data points
+then launch KorAP web UI queries for the given query term and virtual corpus in
+a separate frame.
+}
+\examples{
+\donttest{
+library(highcharter)
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+ lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
+ pivot_longer(c("O", "E")) \%>\%
+ hchart(type="spline", hcaes(label, score, group=name)) \%>\%
+ hc_add_onclick_korap_search()
+}
+
+}
diff --git a/man/ll.Rd b/man/ll.Rd
new file mode 100644
index 0000000..95f2107
--- /dev/null
+++ b/man/ll.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{ll}
+\alias{ll}
+\title{Log likelihood}
+\usage{
+ll(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Log likelihood
+}
+\references{
+Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+
+Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/logDice.Rd b/man/logDice.Rd
new file mode 100644
index 0000000..9ecdc24
--- /dev/null
+++ b/man/logDice.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{logDice}
+\alias{logDice}
+\title{log-Dice coefficient}
+\usage{
+logDice(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+log-Dice coefficient
+}
+\examples{
+
+}
+\references{
+Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/mi2.Rd b/man/mi2.Rd
new file mode 100644
index 0000000..4cbbab4
--- /dev/null
+++ b/man/mi2.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{mi2}
+\alias{mi2}
+\title{Pointwise mutual information squared}
+\usage{
+mi2(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Pointwise mutual information squared
+}
+\details{
+Also referenced to as mutual dependency (MD)
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/mi3.Rd b/man/mi3.Rd
new file mode 100644
index 0000000..7c8815e
--- /dev/null
+++ b/man/mi3.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{mi3}
+\alias{mi3}
+\title{Pointwise mutual information cubed}
+\usage{
+mi3(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Pointwise mutual information cubed
+}
+\details{
+Also referenced to as log-frequency biased mutual dependency (LFMD)
+}
+\references{
+Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+
+Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/pmi.Rd b/man/pmi.Rd
new file mode 100644
index 0000000..015e00d
--- /dev/null
+++ b/man/pmi.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{pmi}
+\alias{pmi}
+\title{Pointwise mutual information}
+\usage{
+pmi(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\value{
+association score
+}
+\description{
+Pointwise mutual information
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()}
+}
+\concept{association-score-functions}
diff --git a/tests/testthat/test-association-score-functions.R b/tests/testthat/test-association-score-functions.R
new file mode 100644
index 0000000..404478b
--- /dev/null
+++ b/tests/testthat/test-association-score-functions.R
@@ -0,0 +1,16 @@
+test_that("association scores are calculated correctly", {
+ x <- sapply(defaultAssociationScoreFunctions(), mapply, 4258869, 2165, 32, 21304641202, 4.327907, 10)
+ expect_that(x[["ll"]], equals(73.05347, tolerance=0.01))
+ expect_that(x[["pmi"]], equals(2.886331, tolerance=0.01))
+ expect_that(x[["mi2"]], equals(7.886331, tolerance=0.01))
+ expect_that(x[["mi3"]], equals(12.886331, tolerance=0.01))
+ expect_that(x[["logDice"]], equals(-2.029354, tolerance=0.01))
+
+ x <- sapply(defaultAssociationScoreFunctions(), mapply, 4258869, 2165, 0, 21304641202, 4.327907, 10)
+ expect_that(x[["ll"]], equals(8.664477, tolerance=0.01))
+ expect_equal(x[["pmi"]], -Inf)
+ expect_equal(x[["mi2"]], -Inf)
+ expect_equal(x[["mi3"]], -Inf)
+ expect_equal(x[["logDice"]], -Inf)
+
+})
diff --git a/tests/testthat/test-corpusQuery.R b/tests/testthat/test-corpusQuery.R
index 5f84fab..72b9f36 100644
--- a/tests/testthat/test-corpusQuery.R
+++ b/tests/testthat/test-corpusQuery.R
@@ -4,6 +4,17 @@
expect_output(frequencyQuery(kco, "Ameisenplage", "pubDate since 2014"), "cached")
})
+test_that("collocationScoreQuery works", {
+ kco <- new("KorAPConnection", cache = TRUE, verbose = TRUE)
+ df <- collocationScoreQuery(kco,"Ameisenplage", "heimgesucht", leftContextSize=0, rightContextSize=1)
+ expect_gt(df$logDice, 1)
+ expect_equal(df$ll, ll(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$pmi, pmi(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$mi2, mi2(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$mi3, mi3(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$logDice, logDice(df$O1, df$O2, df$O, df$N, df$E, df$w))
+})
+
test_that("Cache depends on indexRevision (handling also NULL values)", {
kco <- new("KorAPConnection", cache = TRUE, verbose = TRUE)
kco@indexRevision <- NULL