blob: 2ab60212106ac8517b361a6ae4ea5502fca9a6dc [file] [log] [blame]
Marc Kupietza6e4ee62021-03-05 09:00:15 +01001#' Association score functions
Marc Kupietze2038322021-03-04 18:24:02 +01002#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' @param O1 observed absolute frequency of node
4#' @param O2 observed absolute frequency of collocate
5#' @param O observed absolute frequency of collocation
6#' @param N corpus size
7#' @param E expected absolute frequency of collocation (already adjusted to window size)
8#' @param window_size total window size around node (left neighbour count + right neighbour count)
Marc Kupietze2038322021-03-04 18:24:02 +01009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @return association score
11#' @name association-score-functions
12#' @description
13#' Functions to calculate different collocation association scores between
14#' a node (target word) and words in a window around the it.
15#' The functions are primarily used by \code{\link{collocationScoreQuery}}.
16NULL
17#' NULL
18
19#' @rdname association-score-functions
20#'
Marc Kupietze2038322021-03-04 18:24:02 +010021#' @export
22#'
23#' @examples
24#' \donttest{
25#' new("KorAPConnection", verbose = TRUE) %>%
26#' collocationScoreQuery("Perlen", c("verziertes", "Säue"),
27#' scoreFunctions = append(defaultAssociationScoreFunctions(),
28#' list(localMI = function(O1, O2, O, N, E, window_size) {
29#' O * log2(O/E)
30#' })))
31#' }
32#'
33defaultAssociationScoreFunctions <- function() {
34 list(pmi=pmi, mi2=mi2, mi3=mi3, logDice=logDice, ll=ll)
35}
36
Marc Kupietza6e4ee62021-03-05 09:00:15 +010037#' @rdname association-score-functions
Marc Kupietze2038322021-03-04 18:24:02 +010038#'
Marc Kupietz92a28482021-03-05 10:50:32 +010039#' @description
40#' \bold{pmi}: pointwise mutual information
41#'
Marc Kupietze2038322021-03-04 18:24:02 +010042#' @export
43#'
Marc Kupietze2038322021-03-04 18:24:02 +010044pmi <- function(O1, O2, O, N, E, window_size) {
45 log2(O / E)
46}
47
Marc Kupietza6e4ee62021-03-05 09:00:15 +010048#' @rdname association-score-functions
Marc Kupietze2038322021-03-04 18:24:02 +010049#'
Marc Kupietz92a28482021-03-05 10:50:32 +010050#' @description
51#' \bold{mi2}: pointwise mutual information squared (Daille 1994), also referred to as mutual dependency
52#' (Thanopoulos et al. 2002)
Marc Kupietze2038322021-03-04 18:24:02 +010053#' @export
54#'
55mi2 <- function(O1, O2, O, N, E, window_size) {
56 log2(O ^ 2 / E)
57}
58
Marc Kupietza6e4ee62021-03-05 09:00:15 +010059#' @rdname association-score-functions
Marc Kupietze2038322021-03-04 18:24:02 +010060#' @family association-score-functions
61#'
Marc Kupietz92a28482021-03-05 10:50:32 +010062#' @description
63#' \bold{mi3}: pointwise mutual information cubed (Daille 1994), also referred to as log-frequency biased mutual dependency)
64#' (Thanopoulos et al. 2002)
Marc Kupietze2038322021-03-04 18:24:02 +010065#'
Marc Kupietze2038322021-03-04 18:24:02 +010066#' @export
67#'
68#' @references
69#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
70#'
71#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
72#'
73mi3 <- function(O1, O2, O, N, E, window_size) {
74 log2(O ^ 3 / E)
75}
76
Marc Kupietza6e4ee62021-03-05 09:00:15 +010077#' @rdname association-score-functions
Marc Kupietz92a28482021-03-05 10:50:32 +010078#'
79#' @description
80#' \bold{logDice}: log-Dice coefficient, a heuristic measure that is popular in lexicography (Rychlý 2008)
Marc Kupietze2038322021-03-04 18:24:02 +010081#' @export
82#'
Marc Kupietze2038322021-03-04 18:24:02 +010083#' @references
Marc Kupietza6e4ee62021-03-05 09:00:15 +010084#' Rychlý, Pavel (2008): A lexicographer-friendly association score. In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9. <http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf>.
Marc Kupietze2038322021-03-04 18:24:02 +010085#'
86
87logDice <- function(O1, O2, O, N, E, window_size) {
88 14 + log2(2 * O / (window_size * O2 + O1))
89}
90
91
92#' Log likelihood
93#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010094#' @rdname association-score-functions
Marc Kupietz92a28482021-03-05 10:50:32 +010095#' @description
96#' \bold{ll}: log-likelihood (Dunning 1993) using Stefan Evert's (2004) simplified implementation
Marc Kupietze2038322021-03-04 18:24:02 +010097#'
98#' @export
99#'
100#' @importFrom dplyr if_else
101#'
Marc Kupietze2038322021-03-04 18:24:02 +0100102#' @references
103#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
104#'
105#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
Marc Kupietz92a28482021-03-05 10:50:32 +0100106#' Free PDF available from \url{http://purl.org/stefan.evert/PUB/Evert2004phd.pdf}
Marc Kupietze2038322021-03-04 18:24:02 +0100107#'
108ll <- function(O1, O2, O, N, E, window_size) {
109 r1 = as.double(O1) * window_size
110 r2 = as.double(N) - r1
111 c1 = O2
112 c2 = N - c1
113 o11 = O
114 o12 = r1 - o11
115 o21 = c1 - O
116 o22 = r2 - o21
117 e11 = r1 * c1 / N
118 e12 = r1 * c2 / N
119 e21 = r2 * c1 / N
120 e22 = r2 * c2 / N
121 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
122 + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
123 + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
124 + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
125}