Blame - R/collocationAnalysis.R - KorAP/RKorAPClient

blob: 5e3f7ddcb60c26b38d1ccf8d9f8f165be8ec6499 [file] [log] [blame]

Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1	setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
				2
				3	#' Collocation analysis
				4	#'
				5	#' @aliases collocationAnalysis
				6	#'
				7	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	8	#' `r lifecycle::badge("experimental")`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	9	#'
				10	#' Performs a collocation analysis for the given node (or query)
				11	#' in the given virtual corpus.
				12	#'
				13	#' @details
				14	#' The collocation analysis is currently implemented on the client side, as some of the
				15	#' functionality is not yet provided by the KorAP backend. Mainly for this reason
				16	#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
				17	#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
				18	#' and look for expression-internal collocates using the focus function (see examples and demo).
				19	#'
				20	#' To increase speed at the cost of accuracy and possible false negatives,
				21	#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
				22	#'
				23	#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
				24	#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
				25	#' user interface.
				26	#'
				27	#' @family collocation analysis functions
				28	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	29	#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	30	#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
				31	#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
				32	#' @param searchHitsSampleLimit limit the size of the search hits sample
				33	#' @param stopwords vector of stopwords not to be considered as collocates
				34	#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
				35	#' @param seed seed for random page collecting order
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	36	#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame^]	37	#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	38	#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	39	#' @inheritParams collocationScoreQuery,KorAPConnection-method
				40	#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
				41	#'
				42	#' @importFrom stringr str_match str_split str_detect
				43	#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
				44	#' @importFrom purrr pmap
				45	#' @importFrom tidyr expand_grid
				46	#'
				47	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	48	#' \dontrun{
				49	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	50	#' # Find top collocates of "Packung" inside and outside the sports domain.
				51	#' new("KorAPConnection", verbose = TRUE) %>%
				52	#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
				53	#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
				54	#' dplyr::filter(logDice >= 5)
				55	#' }
				56	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	57	#' \dontrun{
				58	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	59	#' # Identify the most prominent light verb construction with "in ... setzen".
				60	#' # Note that, currently, the use of focus function disallows exactFrequencies.
				61	#' new("KorAPConnection", verbose = TRUE) %>%
				62	#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
				63	#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
				64	#' }
				65	#'
				66	#' @export
				67	setMethod("collocationAnalysis", "KorAPConnection",
				68	function(kco,
				69	node,
				70	vc = "",
				71	lemmatizeNodeQuery = FALSE,
				72	minOccur = 5,
				73	leftContextSize = 5,
				74	rightContextSize = 5,
				75	topCollocatesLimit = 200,
				76	searchHitsSampleLimit = 20000,
				77	ignoreCollocateCase = FALSE,
				78	withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
				79	exactFrequencies = TRUE,
				80	stopwords = RKorAPClient::synsemanticStopwords(),
				81	seed = 7,
				82	expand = length(vc) != length(node),
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	83	maxRecurse = 0,
				84	addExamples = TRUE,
Marc Kupietz	419f21f	2021-12-07 10:27:30 +0100	[diff] [blame]	85	thresholdScore = "logDice",
				86	threshold = 2.0,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	87	localStopwords = c(),
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame^]	88	collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	89	...) {
				90	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				91	word <- frequency <- NULL
				92
				93	if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
				94	stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
				95	}
				96
Marc Kupietz	581a29b	2021-09-04 20:51:04 +0200	[diff] [blame]	97	warnIfNoAccessToken(kco)
				98
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	99	if (lemmatizeNodeQuery) {
				100	node <- lemmatizeWordQuery(node)
				101	}
				102
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	103	result <- if (length(node) > 1 \|\| length(vc) > 1) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	104	grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
				105	purrr::pmap(grid, function(node, vc, ...)
				106	collocationAnalysis(kco,
				107	node =node,
				108	vc = vc,
				109	minOccur = minOccur,
				110	leftContextSize = leftContextSize,
				111	rightContextSize = rightContextSize,
				112	topCollocatesLimit = topCollocatesLimit,
				113	searchHitsSampleLimit = searchHitsSampleLimit,
				114	ignoreCollocateCase = ignoreCollocateCase,
				115	withinSpan = withinSpan,
				116	exactFrequencies = exactFrequencies,
				117	stopwords = stopwords,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	118	addExamples = TRUE,
				119	localStopwords = localStopwords,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	120	seed = seed,
				121	expand = expand,
				122	...) ) %>%
				123	bind_rows()
				124	} else {
				125	set.seed(seed)
				126	candidates <- collocatesQuery(
				127	kco,
				128	node,
				129	vc = vc,
				130	minOccur = minOccur,
				131	leftContextSize = leftContextSize,
				132	rightContextSize = rightContextSize,
				133	searchHitsSampleLimit = searchHitsSampleLimit,
				134	ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	135	stopwords = append(stopwords, localStopwords),
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	136	...
				137	)
				138
				139	if (nrow(candidates) > 0) {
				140	candidates <- candidates %>%
				141	filter(frequency >= minOccur) %>%
				142	head(topCollocatesLimit)
				143	collocationScoreQuery(
				144	kco,
				145	node = node,
				146	collocate = candidates$word,
				147	vc = vc,
				148	leftContextSize = leftContextSize,
				149	rightContextSize = rightContextSize,
				150	observed = if (exactFrequencies) NA else candidates$frequency,
				151	ignoreCollocateCase = ignoreCollocateCase,
				152	withinSpan = withinSpan,
				153	...
				154	) %>%
				155	filter(.$O >= minOccur) %>%
				156	dplyr::arrange(dplyr::desc(logDice))
				157	} else {
				158	tibble()
				159	}
				160	}
Marc Kupietz	419f21f	2021-12-07 10:27:30 +0100	[diff] [blame]	161	if (maxRecurse > 0 & any(!!as.name(thresholdScore) >= threshold)) {
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	162	recurseWith <- result %>%
Marc Kupietz	419f21f	2021-12-07 10:27:30 +0100	[diff] [blame]	163	filter(!!as.name(thresholdScore) >= threshold)
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	164	result <- collocationAnalysis(
				165	kco,
				166	node = paste0("(", buildCollocationQuery(
				167	removeWithinSpan(recurseWith$node, withinSpan),
				168	recurseWith$collocate,
				169	leftContextSize = leftContextSize,
				170	rightContextSize = rightContextSize,
				171	withinSpan = ""
				172	), ")"),
				173	vc = vc,
				174	minOccur = minOccur,
				175	leftContextSize = leftContextSize,
				176	rightContextSize = rightContextSize,
				177	withinSpan = withinSpan,
				178	maxRecurse = maxRecurse - 1,
				179	stopwords = stopwords,
				180	localStopwords = recurseWith$collocate,
				181	exactFrequencies = exactFrequencies,
				182	searchHitsSampleLimit = searchHitsSampleLimit,
				183	topCollocatesLimit = topCollocatesLimit,
				184	addExamples = FALSE
				185	) %>%
				186	bind_rows(result) %>%
				187	filter(logDice >= 2) %>%
				188	filter(.$O >= minOccur) %>%
				189	dplyr::arrange(dplyr::desc(logDice))
				190	}
				191	if (addExamples && length(result) > 0) {
Marc Kupietz	1678c3a	2021-12-07 10:24:49 +0100	[diff] [blame]	192	result$query <-buildCollocationQuery(
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	193	result$node,
				194	result$collocate,
				195	leftContextSize = leftContextSize,
				196	rightContextSize = rightContextSize,
				197	withinSpan = ""
				198	)
				199	result$example <- findExample(
				200	kco,
Marc Kupietz	1678c3a	2021-12-07 10:24:49 +0100	[diff] [blame]	201	query = result$query,
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	202	vc = result$vc
				203	)
				204	}
				205	result
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	206	}
				207	)
				208
Marc Kupietz	76b0559	2021-12-19 16:26:15 +0100	[diff] [blame]	209	# #' @export
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	210	removeWithinSpan <- function(query, withinSpan) {
				211	if (withinSpan == "") {
				212	return(query)
				213	}
				214	needle <- sprintf("^\$contains\\(<%s>, ?(.*)\${2}$", withinSpan)
				215	res <- gsub(needle, '\\1', query)
				216	needle <- sprintf("^contains\$<%s>, ?(.*)\$$", withinSpan)
				217	res <- gsub(needle, '\\1', res)
				218	return(res)
				219	}
				220
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	221	#' @importFrom magrittr debug_pipe
				222	#' @importFrom stringr str_match str_split str_detect
				223	#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
				224	#'
				225	snippet2FreqTable <- function(snippet,
				226	minOccur = 5,
				227	leftContextSize = 5,
				228	rightContextSize = 5,
				229	ignoreCollocateCase = FALSE,
				230	stopwords = c(),
				231	tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+\|")",
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame^]	232	collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	233	oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
				234	verbose = TRUE) {
				235	word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				236	frequency <- NULL
				237
				238	if (length(snippet) < 1) {
				239	dplyr::tibble(word=c(), frequency=c())
				240	} else if (length(snippet) > 1) {
Marc Kupietz	8fe0bee	2021-12-07 10:25:45 +0100	[diff] [blame]	241	log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	242	for (s in snippet) {
				243	oldTable <- snippet2FreqTable(
				244	s,
				245	leftContextSize = leftContextSize,
				246	rightContextSize = rightContextSize,
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame^]	247	collocateFilterRegex = collocateFilterRegex,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	248	oldTable = oldTable,
				249	stopwords = stopwords
				250	)
				251	}
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame]	252	log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	253	oldTable %>%
				254	group_by(word) %>%
				255	mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
				256	summarise(frequency=sum(frequency), .groups = "drop") %>%
				257	arrange(desc(frequency))
				258	} else {
				259	stopwordsTable <- dplyr::tibble(word=stopwords)
				260	match <-
				261	str_match(
				262	snippet,
				263	'<span class="context-left">(<span class="more"></span>)?(.[^ ]) </span><span class="match"><mark>.</mark></span><span class="context-right"> ([^<]*)'
				264	)
				265
				266	left <- if(leftContextSize > 0)
				267	tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
				268	else
				269	""
				270	# cat(paste("left:", left, "\n", collapse=" "))
				271
				272	right <- if(rightContextSize > 0)
				273	head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
				274	else
				275	""
				276	# cat(paste("right:", right, "\n", collapse=" "))
				277
				278	if(is.na(left) \|\| is.na(right) \|\| length(left) + length(right) == 0) {
				279	oldTable
				280	} else {
				281	table(c(left, right)) %>%
				282	dplyr::as_tibble(.name_repair = "minimal") %>%
				283	dplyr::rename(word = 1, frequency = 2) %>%
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame^]	284	dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	285	dplyr::anti_join(stopwordsTable, by="word") %>%
				286	dplyr::bind_rows(oldTable)
				287	}
				288	}
				289	}
				290
				291	#' Preliminary synsemantic stopwords function
				292	#'
				293	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	294	#' `r lifecycle::badge("experimental")`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	295	#'
				296	#' Preliminary synsemantic stopwords function to be used in collocation analysis.
				297	#'
				298	#' @details
				299	#' Currently only suitable for German. See stopwords package for other languages.
				300	#'
				301	#' @param ... future arguments for language detection
				302	#'
				303	#' @family collocation analysis functions
				304	#' @return Vector of synsemantic stopwords.
				305	#' @export
				306	synsemanticStopwords <- function(...) {
				307	res <- c(
				308	"der",
				309	"die",
				310	"und",
				311	"in",
				312	"den",
				313	"von",
				314	"mit",
				315	"das",
				316	"zu",
				317	"im",
				318	"ist",
				319	"auf",
				320	"sich",
				321	"Die",
				322	"des",
				323	"dem",
				324	"nicht",
				325	"ein",
Marc Kupietz	d2c08cb	2021-12-07 10:28:21 +0100	[diff] [blame]	326	"Ein",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	327	"eine",
Marc Kupietz	d2c08cb	2021-12-07 10:28:21 +0100	[diff] [blame]	328	"Eine",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	329	"es",
				330	"auch",
				331	"an",
				332	"als",
				333	"am",
				334	"aus",
				335	"Der",
				336	"bei",
				337	"er",
				338	"dass",
				339	"sie",
				340	"nach",
				341	"um",
				342	"Das",
				343	"zum",
				344	"noch",
				345	"war",
				346	"einen",
				347	"einer",
				348	"wie",
				349	"einem",
				350	"vor",
				351	"bis",
				352	"\u00fcber",
				353	"so",
				354	"aber",
				355	"Eine",
				356	"diese",
				357	"Diese",
				358	"oder"
				359	)
				360	return(res)
				361	}
				362
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	363
Marc Kupietz	76b0559	2021-12-19 16:26:15 +0100	[diff] [blame]	364	# #' @export
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	365	findExample <-
				366	function(kco,
				367	query,
				368	vc = "",
				369	matchOnly = TRUE) {
				370	out <- character(length = length(query))
				371
				372	if (length(vc) < length(query))
				373	vc <- rep(vc, length(query))
				374
				375	for (i in seq_along(query)) {
				376	q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietz	b811ffb	2021-12-07 10:34:10 +0100	[diff] [blame]	377	if (q@totalResults > 0) {
				378	q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
				379	example <- as.character((q@collectedMatches)$snippet[1])
				380	out[i] <- if(matchOnly) {
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	381	gsub('.<mark>(.+)</mark>.', '\\1', example)
				382	} else {
				383	stringr::str_replace(example, '<[^>]*>', '')
				384	}
Marc Kupietz	b811ffb	2021-12-07 10:34:10 +0100	[diff] [blame]	385	} else {
				386	out[i] = ""
				387	}
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	388	}
				389	out
				390	}
				391
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	392	collocatesQuery <-
				393	function(kco,
				394	query,
				395	vc = "",
				396	minOccur = 5,
				397	leftContextSize = 5,
				398	rightContextSize = 5,
				399	searchHitsSampleLimit = 20000,
				400	ignoreCollocateCase = FALSE,
				401	stopwords = c(),
				402	...) {
				403	frequency <- NULL
				404	q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
				405	if(q@totalResults == 0) {
				406	tibble(word=c(), frequency=c())
				407	} else {
				408	q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
				409	snippet2FreqTable((q@collectedMatches)$snippet,
				410	minOccur = minOccur,
				411	leftContextSize = leftContextSize,
				412	rightContextSize = rightContextSize,
				413	ignoreCollocateCase = ignoreCollocateCase,
				414	stopwords = stopwords,
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame^]	415	...,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	416	verbose = kco@verbose) %>%
				417	mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
				418	filter(frequency >= minOccur)
				419	}
				420	}