Blame - R/collocationAnalysis.R - KorAP/RKorAPClient

blob: f01c7033c206e6b53b2989715366fc85dad48759 [file] [log] [blame]

Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1	setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
				2
				3	#' Collocation analysis
				4	#'
				5	#' @aliases collocationAnalysis
				6	#'
				7	#' @description
				8	#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
				9	#'
				10	#' Performs a collocation analysis for the given node (or query)
				11	#' in the given virtual corpus.
				12	#'
				13	#' @details
				14	#' The collocation analysis is currently implemented on the client side, as some of the
				15	#' functionality is not yet provided by the KorAP backend. Mainly for this reason
				16	#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
				17	#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
				18	#' and look for expression-internal collocates using the focus function (see examples and demo).
				19	#'
				20	#' To increase speed at the cost of accuracy and possible false negatives,
				21	#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
				22	#'
				23	#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
				24	#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
				25	#' user interface.
				26	#'
				27	#' @family collocation analysis functions
				28	#'
				29	#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]
				30	#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
				31	#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
				32	#' @param searchHitsSampleLimit limit the size of the search hits sample
				33	#' @param stopwords vector of stopwords not to be considered as collocates
				34	#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
				35	#' @param seed seed for random page collecting order
				36	#' @param expand if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
				37	#' @param ... more arguments will be passed to \code{\link{collocationScoreQuery}}
				38	#' @inheritParams collocationScoreQuery,KorAPConnection-method
				39	#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
				40	#'
				41	#' @importFrom stringr str_match str_split str_detect
				42	#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
				43	#' @importFrom purrr pmap
				44	#' @importFrom tidyr expand_grid
Marc Kupietz	427d9ed	2021-09-04 21:17:53 +0200	[diff] [blame^]	45	#' @importFrom lifecycle badge
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	46	#'
				47	#' @examples
				48	#' \donttest{
				49	#' # Find top collocates of "Packung" inside and outside the sports domain.
				50	#' new("KorAPConnection", verbose = TRUE) %>%
				51	#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
				52	#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
				53	#' dplyr::filter(logDice >= 5)
				54	#' }
				55	#'
				56	#' \donttest{
				57	#' # Identify the most prominent light verb construction with "in ... setzen".
				58	#' # Note that, currently, the use of focus function disallows exactFrequencies.
				59	#' new("KorAPConnection", verbose = TRUE) %>%
				60	#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
				61	#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
				62	#' }
				63	#'
				64	#' @export
				65	setMethod("collocationAnalysis", "KorAPConnection",
				66	function(kco,
				67	node,
				68	vc = "",
				69	lemmatizeNodeQuery = FALSE,
				70	minOccur = 5,
				71	leftContextSize = 5,
				72	rightContextSize = 5,
				73	topCollocatesLimit = 200,
				74	searchHitsSampleLimit = 20000,
				75	ignoreCollocateCase = FALSE,
				76	withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
				77	exactFrequencies = TRUE,
				78	stopwords = RKorAPClient::synsemanticStopwords(),
				79	seed = 7,
				80	expand = length(vc) != length(node),
				81	...) {
				82	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				83	word <- frequency <- NULL
				84
				85	if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
				86	stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
				87	}
				88
Marc Kupietz	581a29b	2021-09-04 20:51:04 +0200	[diff] [blame]	89	warnIfNoAccessToken(kco)
				90
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	91	if (lemmatizeNodeQuery) {
				92	node <- lemmatizeWordQuery(node)
				93	}
				94
				95	if (length(node) > 1 \|\| length(vc) > 1) {
				96	grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
				97	purrr::pmap(grid, function(node, vc, ...)
				98	collocationAnalysis(kco,
				99	node =node,
				100	vc = vc,
				101	minOccur = minOccur,
				102	leftContextSize = leftContextSize,
				103	rightContextSize = rightContextSize,
				104	topCollocatesLimit = topCollocatesLimit,
				105	searchHitsSampleLimit = searchHitsSampleLimit,
				106	ignoreCollocateCase = ignoreCollocateCase,
				107	withinSpan = withinSpan,
				108	exactFrequencies = exactFrequencies,
				109	stopwords = stopwords,
				110	seed = seed,
				111	expand = expand,
				112	...) ) %>%
				113	bind_rows()
				114	} else {
				115	set.seed(seed)
				116	candidates <- collocatesQuery(
				117	kco,
				118	node,
				119	vc = vc,
				120	minOccur = minOccur,
				121	leftContextSize = leftContextSize,
				122	rightContextSize = rightContextSize,
				123	searchHitsSampleLimit = searchHitsSampleLimit,
				124	ignoreCollocateCase = ignoreCollocateCase,
				125	stopwords = stopwords,
				126	...
				127	)
				128
				129	if (nrow(candidates) > 0) {
				130	candidates <- candidates %>%
				131	filter(frequency >= minOccur) %>%
				132	head(topCollocatesLimit)
				133	collocationScoreQuery(
				134	kco,
				135	node = node,
				136	collocate = candidates$word,
				137	vc = vc,
				138	leftContextSize = leftContextSize,
				139	rightContextSize = rightContextSize,
				140	observed = if (exactFrequencies) NA else candidates$frequency,
				141	ignoreCollocateCase = ignoreCollocateCase,
				142	withinSpan = withinSpan,
				143	...
				144	) %>%
				145	filter(.$O >= minOccur) %>%
				146	dplyr::arrange(dplyr::desc(logDice))
				147	} else {
				148	tibble()
				149	}
				150	}
				151	}
				152	)
				153
				154	#' @importFrom magrittr debug_pipe
				155	#' @importFrom stringr str_match str_split str_detect
				156	#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
				157	#'
				158	snippet2FreqTable <- function(snippet,
				159	minOccur = 5,
				160	leftContextSize = 5,
				161	rightContextSize = 5,
				162	ignoreCollocateCase = FALSE,
				163	stopwords = c(),
				164	tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+\|")",
				165	oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
				166	verbose = TRUE) {
				167	word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				168	frequency <- NULL
				169
				170	if (length(snippet) < 1) {
				171	dplyr::tibble(word=c(), frequency=c())
				172	} else if (length(snippet) > 1) {
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame]	173	log.info(verbose, paste("Joinging", length(snippet), "kwics\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	174	for (s in snippet) {
				175	oldTable <- snippet2FreqTable(
				176	s,
				177	leftContextSize = leftContextSize,
				178	rightContextSize = rightContextSize,
				179	oldTable = oldTable,
				180	stopwords = stopwords
				181	)
				182	}
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame]	183	log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	184	oldTable %>%
				185	group_by(word) %>%
				186	mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
				187	summarise(frequency=sum(frequency), .groups = "drop") %>%
				188	arrange(desc(frequency))
				189	} else {
				190	stopwordsTable <- dplyr::tibble(word=stopwords)
				191	match <-
				192	str_match(
				193	snippet,
				194	'<span class="context-left">(<span class="more"></span>)?(.[^ ]) </span><span class="match"><mark>.</mark></span><span class="context-right"> ([^<]*)'
				195	)
				196
				197	left <- if(leftContextSize > 0)
				198	tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
				199	else
				200	""
				201	# cat(paste("left:", left, "\n", collapse=" "))
				202
				203	right <- if(rightContextSize > 0)
				204	head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
				205	else
				206	""
				207	# cat(paste("right:", right, "\n", collapse=" "))
				208
				209	if(is.na(left) \|\| is.na(right) \|\| length(left) + length(right) == 0) {
				210	oldTable
				211	} else {
				212	table(c(left, right)) %>%
				213	dplyr::as_tibble(.name_repair = "minimal") %>%
				214	dplyr::rename(word = 1, frequency = 2) %>%
				215	dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
				216	dplyr::anti_join(stopwordsTable, by="word") %>%
				217	dplyr::bind_rows(oldTable)
				218	}
				219	}
				220	}
				221
				222	#' Preliminary synsemantic stopwords function
				223	#'
				224	#' @description
				225	#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
				226	#'
				227	#' Preliminary synsemantic stopwords function to be used in collocation analysis.
				228	#'
				229	#' @details
				230	#' Currently only suitable for German. See stopwords package for other languages.
				231	#'
				232	#' @param ... future arguments for language detection
				233	#'
				234	#' @family collocation analysis functions
				235	#' @return Vector of synsemantic stopwords.
				236	#' @export
				237	synsemanticStopwords <- function(...) {
				238	res <- c(
				239	"der",
				240	"die",
				241	"und",
				242	"in",
				243	"den",
				244	"von",
				245	"mit",
				246	"das",
				247	"zu",
				248	"im",
				249	"ist",
				250	"auf",
				251	"sich",
				252	"Die",
				253	"des",
				254	"dem",
				255	"nicht",
				256	"ein",
				257	"eine",
				258	"es",
				259	"auch",
				260	"an",
				261	"als",
				262	"am",
				263	"aus",
				264	"Der",
				265	"bei",
				266	"er",
				267	"dass",
				268	"sie",
				269	"nach",
				270	"um",
				271	"Das",
				272	"zum",
				273	"noch",
				274	"war",
				275	"einen",
				276	"einer",
				277	"wie",
				278	"einem",
				279	"vor",
				280	"bis",
				281	"\u00fcber",
				282	"so",
				283	"aber",
				284	"Eine",
				285	"diese",
				286	"Diese",
				287	"oder"
				288	)
				289	return(res)
				290	}
				291
				292	collocatesQuery <-
				293	function(kco,
				294	query,
				295	vc = "",
				296	minOccur = 5,
				297	leftContextSize = 5,
				298	rightContextSize = 5,
				299	searchHitsSampleLimit = 20000,
				300	ignoreCollocateCase = FALSE,
				301	stopwords = c(),
				302	...) {
				303	frequency <- NULL
				304	q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
				305	if(q@totalResults == 0) {
				306	tibble(word=c(), frequency=c())
				307	} else {
				308	q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
				309	snippet2FreqTable((q@collectedMatches)$snippet,
				310	minOccur = minOccur,
				311	leftContextSize = leftContextSize,
				312	rightContextSize = rightContextSize,
				313	ignoreCollocateCase = ignoreCollocateCase,
				314	stopwords = stopwords,
				315	verbose = kco@verbose) %>%
				316	mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
				317	filter(frequency >= minOccur)
				318	}
				319	}