Blame - R/association-scores.R - KorAP/RKorAPClient

blob: ddf3f5bb71d4083b5af88e79bc8d6caea6ee5237 [file] [log] [blame]

Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	1	#' Association score functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	2	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	3	#' @param O1 observed absolute frequency of node
				4	#' @param O2 observed absolute frequency of collocate
				5	#' @param O observed absolute frequency of collocation
				6	#' @param N corpus size
				7	#' @param E expected absolute frequency of collocation (already adjusted to window size)
				8	#' @param window_size total window size around node (left neighbour count + right neighbour count)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	9	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	10	#' @return association score
				11	#' @name association-score-functions
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	12	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	13	#' @description
				14	#' Functions to calculate different collocation association scores between
				15	#' a node (target word) and words in a window around it.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	16	#' The functions are primarily used by [collocationScoreQuery()].
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	17	NULL
				18	#' NULL
				19
				20	#' @rdname association-score-functions
				21	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	22	#' @family collocation analysis functions
				23	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	24	#' @export
				25	#'
				26	#' @examples
				27	#' \donttest{
				28	#' new("KorAPConnection", verbose = TRUE) %>%
				29	#' collocationScoreQuery("Perlen", c("verziertes", "Säue"),
				30	#' scoreFunctions = append(defaultAssociationScoreFunctions(),
				31	#' list(localMI = function(O1, O2, O, N, E, window_size) {
				32	#' O * log2(O/E)
				33	#' })))
				34	#' }
				35	#'
				36	defaultAssociationScoreFunctions <- function() {
				37	list(pmi=pmi, mi2=mi2, mi3=mi3, logDice=logDice, ll=ll)
				38	}
				39
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	40	#' @rdname association-score-functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	41	#'
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	42	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	43	#' pmi: pointwise mutual information
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	44	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	45	#' @export
				46	#'
pmi <- function(O1, O2, O, N, E, window_size) {
  # Only O and E enter the score; the remaining arguments exist so that all
  # score functions share one signature.
  observed_to_expected <- O / E
  log2(observed_to_expected)
}
				50
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	51	#' @rdname association-score-functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	52	#'
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	53	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	54	#' mi2: pointwise mutual information squared (Daille 1994), also referred to as mutual dependency
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	55	#' (Thanopoulos et al. 2002)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	56	#' @export
				57	#'
				58	mi2 <- function(O1, O2, O, N, E, window_size) {
				59	log2(O ^ 2 / E)
				60	}
				61
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	62	#' @rdname association-score-functions
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	63	#' @family association-score-functions
				64	#'
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	65	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	66	#' mi3: pointwise mutual information cubed (Daille 1994), also referred to as log-frequency biased mutual dependency
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	67	#' (Thanopoulos et al. 2002)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	68	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	69	#' @export
				70	#'
				71	#' @references
				72	#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
				73	#'
				74	#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
				75	#'
				76	mi3 <- function(O1, O2, O, N, E, window_size) {
				77	log2(O ^ 3 / E)
				78	}
				79
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	80	#' @rdname association-score-functions
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	81	#'
				82	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	83	#' logDice: log-Dice coefficient, a heuristic measure that is popular in lexicography (Rychlý 2008)
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	84	#' @export
				85	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	86	#' @references
Marc Kupietz	bf00493	2021-09-21 06:57:20 +0200	[diff] [blame^]	87	#' Rychlý, Pavel (2008): A lexicographer-friendly association score. In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9. <https://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf>.
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	88	#'
				89
				90	logDice <- function(O1, O2, O, N, E, window_size) {
Marc Kupietz	0085808	2021-03-12 09:27:35 +0100	[diff] [blame]	91	14 + log2(2 * O / (window_size * O1 + O2))
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	92	}
				93
				94
				95	#' Log likelihood
				96	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	97	#' @rdname association-score-functions
Marc Kupietz	92a2848	2021-03-05 10:50:32 +0100	[diff] [blame]	98	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	99	#' ll: log-likelihood (Dunning 1993) using Stefan Evert's (2004) simplified implementation
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	100	#'
				101	#' @export
				102	#'
				103	#' @importFrom dplyr if_else
				104	#'
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	105	#' @references
				106	#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
				107	#'
				108	#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	109	#' Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
Marc Kupietz	e203832	2021-03-04 18:24:02 +0100	[diff] [blame]	110	#'
				111	ll <- function(O1, O2, O, N, E, window_size) {
				112	r1 = as.double(O1) * window_size
				113	r2 = as.double(N) - r1
				114	c1 = O2
				115	c2 = N - c1
				116	o11 = O
				117	o12 = r1 - o11
				118	o21 = c1 - O
				119	o22 = r2 - o21
				120	e11 = r1 * c1 / N
				121	e12 = r1 * c2 / N
				122	e21 = r2 * c1 / N
				123	e22 = r2 * c2 / N
				124	2 * ( dplyr::if_else(o11>0, o11 * log(o11/e11), 0)
				125	+ dplyr::if_else(o12>0, o12 * log(o12/e12), 0)
				126	+ dplyr::if_else(o21>0, o21 * log(o21/e21), 0)
				127	+ dplyr::if_else(o22>0, o22 * log(o22/e22), 0))
				128	}