Blame - R/association-scores.R - KorAP/RKorAPClient

blob: 2ab60212106ac8517b361a6ae4ea5502fca9a6dc [file] [log] [blame]

Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	1	#' Association score functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	2	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	3	#' @param O1 observed absolute frequency of node
				4	#' @param O2 observed absolute frequency of collocate
				5	#' @param O observed absolute frequency of collocation
				6	#' @param N corpus size
				7	#' @param E expected absolute frequency of collocation (already adjusted to window size)
				8	#' @param window_size total window size around node (left neighbour count + right neighbour count)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	9	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	10	#' @return association score
				11	#' @name association-score-functions
				12	#' @description
				13	#' Functions to calculate different collocation association scores between
				14	#' a node (target word) and words in a window around the it.
				15	#' The functions are primarily used by \code{\link{collocationScoreQuery}}.
				16	NULL
				17	#' NULL
				18
				19	#' @rdname association-score-functions
				20	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	21	#' @export
				22	#'
				23	#' @examples
				24	#' \donttest{
				25	#' new("KorAPConnection", verbose = TRUE) %>%
				26	#' collocationScoreQuery("Perlen", c("verziertes", "Säue"),
				27	#' scoreFunctions = append(defaultAssociationScoreFunctions(),
				28	#' list(localMI = function(O1, O2, O, N, E, window_size) {
				29	#' O * log2(O/E)
				30	#' })))
				31	#' }
				32	#'
				33	defaultAssociationScoreFunctions <- function() {
				34	list(pmi=pmi, mi2=mi2, mi3=mi3, logDice=logDice, ll=ll)
				35	}
				36
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	37	#' @rdname association-score-functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	38	#'
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame^]	39	#' @description
				40	#' \bold{pmi}: pointwise mutual information
				41	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	42	#' @export
				43	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	44	pmi <- function(O1, O2, O, N, E, window_size) {
				45	log2(O / E)
				46	}
				47
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	48	#' @rdname association-score-functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	49	#'
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame^]	50	#' @description
				51	#' \bold{mi2}: pointwise mutual information squared (Daille 1994), also referred to as mutual dependency
				52	#' (Thanopoulos et al. 2002)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	53	#' @export
				54	#'
				55	mi2 <- function(O1, O2, O, N, E, window_size) {
				56	log2(O ^ 2 / E)
				57	}
				58
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	59	#' @rdname association-score-functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	60	#' @family association-score-functions
				61	#'
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame^]	62	#' @description
				63	#' \bold{mi3}: pointwise mutual information cubed (Daille 1994), also referred to as log-frequency biased mutual dependency)
				64	#' (Thanopoulos et al. 2002)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	65	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	66	#' @export
				67	#'
				68	#' @references
				69	#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
				70	#'
				71	#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
				72	#'
				73	mi3 <- function(O1, O2, O, N, E, window_size) {
				74	log2(O ^ 3 / E)
				75	}
				76
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	77	#' @rdname association-score-functions
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame^]	78	#'
				79	#' @description
				80	#' \bold{logDice}: log-Dice coefficient, a heuristic measure that is popular in lexicography (Rychlý 2008)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	81	#' @export
				82	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	83	#' @references
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	84	#' Rychlý, Pavel (2008): A lexicographer-friendly association score. In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9. <http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf>.
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	85	#'
				86
				87	logDice <- function(O1, O2, O, N, E, window_size) {
				88	14 + log2(2 * O / (window_size * O2 + O1))
				89	}
				90
				91
				92	#' Log likelihood
				93	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	94	#' @rdname association-score-functions
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame^]	95	#' @description
				96	#' \bold{ll}: log-likelihood (Dunning 1993) using Stefan Evert's (2004) simplified implementation
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	97	#'
				98	#' @export
				99	#'
				100	#' @importFrom dplyr if_else
				101	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	102	#' @references
				103	#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
				104	#'
				105	#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame^]	106	#' Free PDF available from \url{http://purl.org/stefan.evert/PUB/Evert2004phd.pdf}
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	107	#'
				108	ll <- function(O1, O2, O, N, E, window_size) {
				109	r1 = as.double(O1) * window_size
				110	r2 = as.double(N) - r1
				111	c1 = O2
				112	c2 = N - c1
				113	o11 = O
				114	o12 = r1 - o11
				115	o21 = c1 - O
				116	o22 = r2 - o21
				117	e11 = r1 * c1 / N
				118	e12 = r1 * c2 / N
				119	e21 = r2 * c1 / N
				120	e22 = r2 * c2 / N
				121	2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
				122	+ dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
				123	+ dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
				124	+ dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
				125	}