Blame - R/collocationAnalysis.R - KorAP/RKorAPClient

blob: 52a0e33206b573feb03c9694dc3f524e9de1ab6a [file] [log] [blame]

Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1	setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
				2
				3	#' Collocation analysis
				4	#'
				5	#' @aliases collocationAnalysis
				6	#'
				7	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	8	#' `r lifecycle::badge("experimental")`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	9	#'
				10	#' Performs a collocation analysis for the given node (or query)
				11	#' in the given virtual corpus.
				12	#'
				13	#' @details
				14	#' The collocation analysis is currently implemented on the client side, as some of the
				15	#' functionality is not yet provided by the KorAP backend. Mainly for this reason
				16	#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
				17	#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
				18	#' and look for expression-internal collocates using the focus function (see examples and demo).
				19	#'
				20	#' To increase speed at the cost of accuracy and possible false negatives,
				21	#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
				22	#'
				23	#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
				24	#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
				25	#' user interface.
				26	#'
				27	#' @family collocation analysis functions
				28	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	29	#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	30	#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
				31	#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
				32	#' @param searchHitsSampleLimit limit the size of the search hits sample
				33	#' @param stopwords vector of stopwords not to be considered as collocates
				34	#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
				35	#' @param seed seed for random page collecting order
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	36	#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
				37	#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	38	#' @inheritParams collocationScoreQuery,KorAPConnection-method
				39	#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
				40	#'
				41	#' @importFrom stringr str_match str_split str_detect
				42	#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
				43	#' @importFrom purrr pmap
				44	#' @importFrom tidyr expand_grid
				45	#'
				46	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	47	#' \dontrun{
				48	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	49	#' # Find top collocates of "Packung" inside and outside the sports domain.
				50	#' new("KorAPConnection", verbose = TRUE) %>%
				51	#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
				52	#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
				53	#' dplyr::filter(logDice >= 5)
				54	#' }
				55	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	56	#' \dontrun{
				57	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	58	#' # Identify the most prominent light verb construction with "in ... setzen".
				59	#' # Note that, currently, the use of focus function disallows exactFrequencies.
				60	#' new("KorAPConnection", verbose = TRUE) %>%
				61	#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
				62	#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
				63	#' }
				64	#'
				65	#' @export
				66	setMethod("collocationAnalysis", "KorAPConnection",
				67	function(kco,
				68	node,
				69	vc = "",
				70	lemmatizeNodeQuery = FALSE,
				71	minOccur = 5,
				72	leftContextSize = 5,
				73	rightContextSize = 5,
				74	topCollocatesLimit = 200,
				75	searchHitsSampleLimit = 20000,
				76	ignoreCollocateCase = FALSE,
				77	withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
				78	exactFrequencies = TRUE,
				79	stopwords = RKorAPClient::synsemanticStopwords(),
				80	seed = 7,
				81	expand = length(vc) != length(node),
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	82	maxRecurse = 0,
				83	addExamples = TRUE,
Marc Kupietz	419f21f	2021-12-07 10:27:30 +0100	[diff] [blame]	84	thresholdScore = "logDice",
				85	threshold = 2.0,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	86	localStopwords = c(),
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	87	...) {
				88	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				89	word <- frequency <- NULL
				90
				91	if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
				92	stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
				93	}
				94
Marc Kupietz	581a29b	2021-09-04 20:51:04 +0200	[diff] [blame]	95	warnIfNoAccessToken(kco)
				96
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	97	if (lemmatizeNodeQuery) {
				98	node <- lemmatizeWordQuery(node)
				99	}
				100
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	101	result <- if (length(node) > 1 \|\| length(vc) > 1) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	102	grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
				103	purrr::pmap(grid, function(node, vc, ...)
				104	collocationAnalysis(kco,
				105	node =node,
				106	vc = vc,
				107	minOccur = minOccur,
				108	leftContextSize = leftContextSize,
				109	rightContextSize = rightContextSize,
				110	topCollocatesLimit = topCollocatesLimit,
				111	searchHitsSampleLimit = searchHitsSampleLimit,
				112	ignoreCollocateCase = ignoreCollocateCase,
				113	withinSpan = withinSpan,
				114	exactFrequencies = exactFrequencies,
				115	stopwords = stopwords,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	116	addExamples = TRUE,
				117	localStopwords = localStopwords,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	118	seed = seed,
				119	expand = expand,
				120	...) ) %>%
				121	bind_rows()
				122	} else {
				123	set.seed(seed)
				124	candidates <- collocatesQuery(
				125	kco,
				126	node,
				127	vc = vc,
				128	minOccur = minOccur,
				129	leftContextSize = leftContextSize,
				130	rightContextSize = rightContextSize,
				131	searchHitsSampleLimit = searchHitsSampleLimit,
				132	ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	133	stopwords = append(stopwords, localStopwords),
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	134	...
				135	)
				136
				137	if (nrow(candidates) > 0) {
				138	candidates <- candidates %>%
				139	filter(frequency >= minOccur) %>%
				140	head(topCollocatesLimit)
				141	collocationScoreQuery(
				142	kco,
				143	node = node,
				144	collocate = candidates$word,
				145	vc = vc,
				146	leftContextSize = leftContextSize,
				147	rightContextSize = rightContextSize,
				148	observed = if (exactFrequencies) NA else candidates$frequency,
				149	ignoreCollocateCase = ignoreCollocateCase,
				150	withinSpan = withinSpan,
				151	...
				152	) %>%
				153	filter(.$O >= minOccur) %>%
				154	dplyr::arrange(dplyr::desc(logDice))
				155	} else {
				156	tibble()
				157	}
				158	}
Marc Kupietz	419f21f	2021-12-07 10:27:30 +0100	[diff] [blame]	159	if (maxRecurse > 0 & any(!!as.name(thresholdScore) >= threshold)) {
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	160	recurseWith <- result %>%
Marc Kupietz	419f21f	2021-12-07 10:27:30 +0100	[diff] [blame]	161	filter(!!as.name(thresholdScore) >= threshold)
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	162	result <- collocationAnalysis(
				163	kco,
				164	node = paste0("(", buildCollocationQuery(
				165	removeWithinSpan(recurseWith$node, withinSpan),
				166	recurseWith$collocate,
				167	leftContextSize = leftContextSize,
				168	rightContextSize = rightContextSize,
				169	withinSpan = ""
				170	), ")"),
				171	vc = vc,
				172	minOccur = minOccur,
				173	leftContextSize = leftContextSize,
				174	rightContextSize = rightContextSize,
				175	withinSpan = withinSpan,
				176	maxRecurse = maxRecurse - 1,
				177	stopwords = stopwords,
				178	localStopwords = recurseWith$collocate,
				179	exactFrequencies = exactFrequencies,
				180	searchHitsSampleLimit = searchHitsSampleLimit,
				181	topCollocatesLimit = topCollocatesLimit,
				182	addExamples = FALSE
				183	) %>%
				184	bind_rows(result) %>%
				185	filter(logDice >= 2) %>%
				186	filter(.$O >= minOccur) %>%
				187	dplyr::arrange(dplyr::desc(logDice))
				188	}
				189	if (addExamples && length(result) > 0) {
Marc Kupietz	1678c3a	2021-12-07 10:24:49 +0100	[diff] [blame]	190	result$query <-buildCollocationQuery(
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	191	result$node,
				192	result$collocate,
				193	leftContextSize = leftContextSize,
				194	rightContextSize = rightContextSize,
				195	withinSpan = ""
				196	)
				197	result$example <- findExample(
				198	kco,
Marc Kupietz	1678c3a	2021-12-07 10:24:49 +0100	[diff] [blame]	199	query = result$query,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	200	vc = result$vc
				201	)
				202	}
				203	result
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	204	}
				205	)
				206
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	207	#' @export
				208	removeWithinSpan <- function(query, withinSpan) {
				209	if (withinSpan == "") {
				210	return(query)
				211	}
				212	needle <- sprintf("^\$contains\\(<%s>, ?(.*)\${2}$", withinSpan)
				213	res <- gsub(needle, '\\1', query)
				214	needle <- sprintf("^contains\$<%s>, ?(.*)\$$", withinSpan)
				215	res <- gsub(needle, '\\1', res)
				216	return(res)
				217	}
				218
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	219	#' @importFrom magrittr debug_pipe
				220	#' @importFrom stringr str_match str_split str_detect
				221	#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
				222	#'
				223	snippet2FreqTable <- function(snippet,
				224	minOccur = 5,
				225	leftContextSize = 5,
				226	rightContextSize = 5,
				227	ignoreCollocateCase = FALSE,
				228	stopwords = c(),
				229	tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+\|")",
				230	oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
				231	verbose = TRUE) {
				232	word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				233	frequency <- NULL
				234
				235	if (length(snippet) < 1) {
				236	dplyr::tibble(word=c(), frequency=c())
				237	} else if (length(snippet) > 1) {
Marc Kupietz	8fe0bee	2021-12-07 10:25:45 +0100	[diff] [blame]	238	log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	239	for (s in snippet) {
				240	oldTable <- snippet2FreqTable(
				241	s,
				242	leftContextSize = leftContextSize,
				243	rightContextSize = rightContextSize,
				244	oldTable = oldTable,
				245	stopwords = stopwords
				246	)
				247	}
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame]	248	log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	249	oldTable %>%
				250	group_by(word) %>%
				251	mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
				252	summarise(frequency=sum(frequency), .groups = "drop") %>%
				253	arrange(desc(frequency))
				254	} else {
				255	stopwordsTable <- dplyr::tibble(word=stopwords)
				256	match <-
				257	str_match(
				258	snippet,
				259	'<span class="context-left">(<span class="more"></span>)?(.[^ ]) </span><span class="match"><mark>.</mark></span><span class="context-right"> ([^<]*)'
				260	)
				261
				262	left <- if(leftContextSize > 0)
				263	tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
				264	else
				265	""
				266	# cat(paste("left:", left, "\n", collapse=" "))
				267
				268	right <- if(rightContextSize > 0)
				269	head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
				270	else
				271	""
				272	# cat(paste("right:", right, "\n", collapse=" "))
				273
				274	if(is.na(left) \|\| is.na(right) \|\| length(left) + length(right) == 0) {
				275	oldTable
				276	} else {
				277	table(c(left, right)) %>%
				278	dplyr::as_tibble(.name_repair = "minimal") %>%
				279	dplyr::rename(word = 1, frequency = 2) %>%
				280	dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
				281	dplyr::anti_join(stopwordsTable, by="word") %>%
				282	dplyr::bind_rows(oldTable)
				283	}
				284	}
				285	}
				286
				287	#' Preliminary synsemantic stopwords function
				288	#'
				289	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	290	#' `r lifecycle::badge("experimental")`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	291	#'
				292	#' Preliminary synsemantic stopwords function to be used in collocation analysis.
				293	#'
				294	#' @details
				295	#' Currently only suitable for German. See stopwords package for other languages.
				296	#'
				297	#' @param ... future arguments for language detection
				298	#'
				299	#' @family collocation analysis functions
				300	#' @return Vector of synsemantic stopwords.
				301	#' @export
				302	synsemanticStopwords <- function(...) {
				303	res <- c(
				304	"der",
				305	"die",
				306	"und",
				307	"in",
				308	"den",
				309	"von",
				310	"mit",
				311	"das",
				312	"zu",
				313	"im",
				314	"ist",
				315	"auf",
				316	"sich",
				317	"Die",
				318	"des",
				319	"dem",
				320	"nicht",
				321	"ein",
Marc Kupietz	d2c08cb	2021-12-07 10:28:21 +0100	[diff] [blame^]	322	"Ein",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	323	"eine",
Marc Kupietz	d2c08cb	2021-12-07 10:28:21 +0100	[diff] [blame^]	324	"Eine",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	325	"es",
				326	"auch",
				327	"an",
				328	"als",
				329	"am",
				330	"aus",
				331	"Der",
				332	"bei",
				333	"er",
				334	"dass",
				335	"sie",
				336	"nach",
				337	"um",
				338	"Das",
				339	"zum",
				340	"noch",
				341	"war",
				342	"einen",
				343	"einer",
				344	"wie",
				345	"einem",
				346	"vor",
				347	"bis",
				348	"\u00fcber",
				349	"so",
				350	"aber",
				351	"Eine",
				352	"diese",
				353	"Diese",
				354	"oder"
				355	)
				356	return(res)
				357	}
				358
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	359
				360	#' @export
				361	findExample <-
				362	function(kco,
				363	query,
				364	vc = "",
				365	matchOnly = TRUE) {
				366	out <- character(length = length(query))
				367
				368	if (length(vc) < length(query))
				369	vc <- rep(vc, length(query))
				370
				371	for (i in seq_along(query)) {
				372	q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
				373	q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
				374	example <- as.character((q@collectedMatches)$snippet[1])
				375	out[i] <- if(matchOnly) {
				376	gsub('.<mark>(.+)</mark>.', '\\1', example)
				377	} else {
				378	stringr::str_replace(example, '<[^>]*>', '')
				379	}
				380	}
				381	out
				382	}
				383
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	384	collocatesQuery <-
				385	function(kco,
				386	query,
				387	vc = "",
				388	minOccur = 5,
				389	leftContextSize = 5,
				390	rightContextSize = 5,
				391	searchHitsSampleLimit = 20000,
				392	ignoreCollocateCase = FALSE,
				393	stopwords = c(),
				394	...) {
				395	frequency <- NULL
				396	q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
				397	if(q@totalResults == 0) {
				398	tibble(word=c(), frequency=c())
				399	} else {
				400	q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
				401	snippet2FreqTable((q@collectedMatches)$snippet,
				402	minOccur = minOccur,
				403	leftContextSize = leftContextSize,
				404	rightContextSize = rightContextSize,
				405	ignoreCollocateCase = ignoreCollocateCase,
				406	stopwords = stopwords,
				407	verbose = kco@verbose) %>%
				408	mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
				409	filter(frequency >= minOccur)
				410	}
				411	}