blob: ddf3f5bb71d4083b5af88e79bc8d6caea6ee5237 [file] [log] [blame]
Marc Kupietza6e4ee62021-03-05 09:00:15 +01001#' Association score functions
Marc Kupietze2038322021-03-04 18:24:02 +01002#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +01003#' @param O1 observed absolute frequency of node
4#' @param O2 observed absolute frequency of collocate
5#' @param O observed absolute frequency of collocation
6#' @param N corpus size
7#' @param E expected absolute frequency of collocation (already adjusted to window size)
8#' @param window_size total window size around node (left neighbour count + right neighbour count)
Marc Kupietze2038322021-03-04 18:24:02 +01009#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010010#' @return association score
11#' @name association-score-functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +020012#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010013#' @description
14#' Functions to calculate different collocation association scores between
15#' a node (target word) and words in a window around the it.
Marc Kupietz67edcb52021-09-20 21:54:24 +020016#' The functions are primarily used by [collocationScoreQuery()].
Marc Kupietza6e4ee62021-03-05 09:00:15 +010017NULL
18#' NULL
19
20#' @rdname association-score-functions
21#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020022#' @family collocation analysis functions
23#'
Marc Kupietze2038322021-03-04 18:24:02 +010024#' @export
25#'
26#' @examples
27#' \donttest{
28#' new("KorAPConnection", verbose = TRUE) %>%
29#' collocationScoreQuery("Perlen", c("verziertes", "Säue"),
30#' scoreFunctions = append(defaultAssociationScoreFunctions(),
31#' list(localMI = function(O1, O2, O, N, E, window_size) {
32#' O * log2(O/E)
33#' })))
34#' }
35#'
36defaultAssociationScoreFunctions <- function() {
37 list(pmi=pmi, mi2=mi2, mi3=mi3, logDice=logDice, ll=ll)
38}
39
Marc Kupietza6e4ee62021-03-05 09:00:15 +010040#' @rdname association-score-functions
Marc Kupietze2038322021-03-04 18:24:02 +010041#'
Marc Kupietz92a28482021-03-05 10:50:32 +010042#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +020043#' **pmi**: pointwise mutual information
Marc Kupietz92a28482021-03-05 10:50:32 +010044#'
Marc Kupietze2038322021-03-04 18:24:02 +010045#' @export
46#'
Marc Kupietze2038322021-03-04 18:24:02 +010047pmi <- function(O1, O2, O, N, E, window_size) {
48 log2(O / E)
49}
50
Marc Kupietza6e4ee62021-03-05 09:00:15 +010051#' @rdname association-score-functions
Marc Kupietze2038322021-03-04 18:24:02 +010052#'
Marc Kupietz92a28482021-03-05 10:50:32 +010053#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +020054#' **mi2**: pointwise mutual information squared (Daille 1994), also referred to as mutual dependency
Marc Kupietz92a28482021-03-05 10:50:32 +010055#' (Thanopoulos et al. 2002)
Marc Kupietze2038322021-03-04 18:24:02 +010056#' @export
57#'
58mi2 <- function(O1, O2, O, N, E, window_size) {
59 log2(O ^ 2 / E)
60}
61
Marc Kupietza6e4ee62021-03-05 09:00:15 +010062#' @rdname association-score-functions
Marc Kupietze2038322021-03-04 18:24:02 +010063#' @family association-score-functions
64#'
Marc Kupietz92a28482021-03-05 10:50:32 +010065#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +020066#' **mi3**: pointwise mutual information cubed (Daille 1994), also referred to as log-frequency biased mutual dependency)
Marc Kupietz92a28482021-03-05 10:50:32 +010067#' (Thanopoulos et al. 2002)
Marc Kupietze2038322021-03-04 18:24:02 +010068#'
Marc Kupietze2038322021-03-04 18:24:02 +010069#' @export
70#'
71#' @references
72#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
73#'
74#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
75#'
76mi3 <- function(O1, O2, O, N, E, window_size) {
77 log2(O ^ 3 / E)
78}
79
Marc Kupietza6e4ee62021-03-05 09:00:15 +010080#' @rdname association-score-functions
Marc Kupietz92a28482021-03-05 10:50:32 +010081#'
82#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +020083#' **logDice**: log-Dice coefficient, a heuristic measure that is popular in lexicography (Rychlý 2008)
Marc Kupietze2038322021-03-04 18:24:02 +010084#' @export
85#'
Marc Kupietze2038322021-03-04 18:24:02 +010086#' @references
Marc Kupietzbf004932021-09-21 06:57:20 +020087#' Rychlý, Pavel (2008): A lexicographer-friendly association score. In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9. <https://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf>.
Marc Kupietze2038322021-03-04 18:24:02 +010088#'
89
90logDice <- function(O1, O2, O, N, E, window_size) {
Marc Kupietz00858082021-03-12 09:27:35 +010091 14 + log2(2 * O / (window_size * O1 + O2))
Marc Kupietze2038322021-03-04 18:24:02 +010092}
93
94
95#' Log likelihood
96#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010097#' @rdname association-score-functions
Marc Kupietz92a28482021-03-05 10:50:32 +010098#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +020099#' **ll**: log-likelihood (Dunning 1993) using Stefan Evert's (2004) simplified implementation
Marc Kupietze2038322021-03-04 18:24:02 +0100100#'
101#' @export
102#'
103#' @importFrom dplyr if_else
104#'
Marc Kupietze2038322021-03-04 18:24:02 +0100105#' @references
106#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
107#'
108#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200109#' Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
Marc Kupietze2038322021-03-04 18:24:02 +0100110#'
111ll <- function(O1, O2, O, N, E, window_size) {
112 r1 = as.double(O1) * window_size
113 r2 = as.double(N) - r1
114 c1 = O2
115 c2 = N - c1
116 o11 = O
117 o12 = r1 - o11
118 o21 = c1 - O
119 o22 = r2 - o21
120 e11 = r1 * c1 / N
121 e12 = r1 * c2 / N
122 e21 = r2 * c1 / N
123 e22 = r2 * c2 / N
124 2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
125 + dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
126 + dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
127 + dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
128}