Blame - R/collocationAnalysis.R - KorAP/RKorAPClient

blob: 2443f8fe10459fc7a07d80e4d41234d0f3dffa27 [file] [log] [blame]

Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1	setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
				2
				3	#' Collocation analysis
				4	#'
				5	#' @aliases collocationAnalysis
				6	#'
				7	#' @description
				8	#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
				9	#'
				10	#' Performs a collocation analysis for the given node (or query)
				11	#' in the given virtual corpus.
				12	#'
				13	#' @details
				14	#' The collocation analysis is currently implemented on the client side, as some of the
				15	#' functionality is not yet provided by the KorAP backend. Mainly for this reason
				16	#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
				17	#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
				18	#' and look for expression-internal collocates using the focus function (see examples and demo).
				19	#'
				20	#' To increase speed at the cost of accuracy and possible false negatives,
				21	#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
				22	#'
				23	#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
				24	#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
				25	#' user interface.
				26	#'
				27	#' @family collocation analysis functions
				28	#'
				29	#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]
				30	#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
				31	#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
				32	#' @param searchHitsSampleLimit limit the size of the search hits sample
				33	#' @param stopwords vector of stopwords not to be considered as collocates
				34	#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
				35	#' @param seed seed for random page collecting order
				36	#' @param expand if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
				37	#' @param ... more arguments will be passed to \code{\link{collocationScoreQuery}}
				38	#' @inheritParams collocationScoreQuery,KorAPConnection-method
				39	#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
				40	#'
				41	#' @importFrom stringr str_match str_split str_detect
				42	#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
				43	#' @importFrom purrr pmap
				44	#' @importFrom tidyr expand_grid
				45	#'
				46	#' @examples
				47	#' \donttest{
				48	#' # Find top collocates of "Packung" inside and outside the sports domain.
				49	#' new("KorAPConnection", verbose = TRUE) %>%
				50	#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
				51	#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
				52	#' dplyr::filter(logDice >= 5)
				53	#' }
				54	#'
				55	#' \donttest{
				56	#' # Identify the most prominent light verb construction with "in ... setzen".
				57	#' # Note that, currently, the use of focus function disallows exactFrequencies.
				58	#' new("KorAPConnection", verbose = TRUE) %>%
				59	#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
				60	#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
				61	#' }
				62	#'
				63	#' @export
				64	setMethod("collocationAnalysis", "KorAPConnection",
				65	function(kco,
				66	node,
				67	vc = "",
				68	lemmatizeNodeQuery = FALSE,
				69	minOccur = 5,
				70	leftContextSize = 5,
				71	rightContextSize = 5,
				72	topCollocatesLimit = 200,
				73	searchHitsSampleLimit = 20000,
				74	ignoreCollocateCase = FALSE,
				75	withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
				76	exactFrequencies = TRUE,
				77	stopwords = RKorAPClient::synsemanticStopwords(),
				78	seed = 7,
				79	expand = length(vc) != length(node),
				80	...) {
				81	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				82	word <- frequency <- NULL
				83
				84	if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
				85	stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
				86	}
				87
				88	if (lemmatizeNodeQuery) {
				89	node <- lemmatizeWordQuery(node)
				90	}
				91
				92	if (length(node) > 1 \|\| length(vc) > 1) {
				93	grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
				94	purrr::pmap(grid, function(node, vc, ...)
				95	collocationAnalysis(kco,
				96	node =node,
				97	vc = vc,
				98	minOccur = minOccur,
				99	leftContextSize = leftContextSize,
				100	rightContextSize = rightContextSize,
				101	topCollocatesLimit = topCollocatesLimit,
				102	searchHitsSampleLimit = searchHitsSampleLimit,
				103	ignoreCollocateCase = ignoreCollocateCase,
				104	withinSpan = withinSpan,
				105	exactFrequencies = exactFrequencies,
				106	stopwords = stopwords,
				107	seed = seed,
				108	expand = expand,
				109	...) ) %>%
				110	bind_rows()
				111	} else {
				112	set.seed(seed)
				113	candidates <- collocatesQuery(
				114	kco,
				115	node,
				116	vc = vc,
				117	minOccur = minOccur,
				118	leftContextSize = leftContextSize,
				119	rightContextSize = rightContextSize,
				120	searchHitsSampleLimit = searchHitsSampleLimit,
				121	ignoreCollocateCase = ignoreCollocateCase,
				122	stopwords = stopwords,
				123	...
				124	)
				125
				126	if (nrow(candidates) > 0) {
				127	candidates <- candidates %>%
				128	filter(frequency >= minOccur) %>%
				129	head(topCollocatesLimit)
				130	collocationScoreQuery(
				131	kco,
				132	node = node,
				133	collocate = candidates$word,
				134	vc = vc,
				135	leftContextSize = leftContextSize,
				136	rightContextSize = rightContextSize,
				137	observed = if (exactFrequencies) NA else candidates$frequency,
				138	ignoreCollocateCase = ignoreCollocateCase,
				139	withinSpan = withinSpan,
				140	...
				141	) %>%
				142	filter(.$O >= minOccur) %>%
				143	dplyr::arrange(dplyr::desc(logDice))
				144	} else {
				145	tibble()
				146	}
				147	}
				148	}
				149	)
				150
				151	#' @importFrom magrittr debug_pipe
				152	#' @importFrom stringr str_match str_split str_detect
				153	#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
				154	#'
				155	snippet2FreqTable <- function(snippet,
				156	minOccur = 5,
				157	leftContextSize = 5,
				158	rightContextSize = 5,
				159	ignoreCollocateCase = FALSE,
				160	stopwords = c(),
				161	tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+\|")",
				162	oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
				163	verbose = TRUE) {
				164	word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				165	frequency <- NULL
				166
				167	if (length(snippet) < 1) {
				168	dplyr::tibble(word=c(), frequency=c())
				169	} else if (length(snippet) > 1) {
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame^]	170	log.info(verbose, paste("Joinging", length(snippet), "kwics\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	171	for (s in snippet) {
				172	oldTable <- snippet2FreqTable(
				173	s,
				174	leftContextSize = leftContextSize,
				175	rightContextSize = rightContextSize,
				176	oldTable = oldTable,
				177	stopwords = stopwords
				178	)
				179	}
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame^]	180	log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	181	oldTable %>%
				182	group_by(word) %>%
				183	mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
				184	summarise(frequency=sum(frequency), .groups = "drop") %>%
				185	arrange(desc(frequency))
				186	} else {
				187	stopwordsTable <- dplyr::tibble(word=stopwords)
				188	match <-
				189	str_match(
				190	snippet,
				191	'<span class="context-left">(<span class="more"></span>)?(.[^ ]) </span><span class="match"><mark>.</mark></span><span class="context-right"> ([^<]*)'
				192	)
				193
				194	left <- if(leftContextSize > 0)
				195	tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
				196	else
				197	""
				198	# cat(paste("left:", left, "\n", collapse=" "))
				199
				200	right <- if(rightContextSize > 0)
				201	head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
				202	else
				203	""
				204	# cat(paste("right:", right, "\n", collapse=" "))
				205
				206	if(is.na(left) \|\| is.na(right) \|\| length(left) + length(right) == 0) {
				207	oldTable
				208	} else {
				209	table(c(left, right)) %>%
				210	dplyr::as_tibble(.name_repair = "minimal") %>%
				211	dplyr::rename(word = 1, frequency = 2) %>%
				212	dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
				213	dplyr::anti_join(stopwordsTable, by="word") %>%
				214	dplyr::bind_rows(oldTable)
				215	}
				216	}
				217	}
				218
				219	#' Preliminary synsemantic stopwords function
				220	#'
				221	#' @description
				222	#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
				223	#'
				224	#' Preliminary synsemantic stopwords function to be used in collocation analysis.
				225	#'
				226	#' @details
				227	#' Currently only suitable for German. See stopwords package for other languages.
				228	#'
				229	#' @param ... future arguments for language detection
				230	#'
				231	#' @family collocation analysis functions
				232	#' @return Vector of synsemantic stopwords.
				233	#' @export
				234	synsemanticStopwords <- function(...) {
				235	res <- c(
				236	"der",
				237	"die",
				238	"und",
				239	"in",
				240	"den",
				241	"von",
				242	"mit",
				243	"das",
				244	"zu",
				245	"im",
				246	"ist",
				247	"auf",
				248	"sich",
				249	"Die",
				250	"des",
				251	"dem",
				252	"nicht",
				253	"ein",
				254	"eine",
				255	"es",
				256	"auch",
				257	"an",
				258	"als",
				259	"am",
				260	"aus",
				261	"Der",
				262	"bei",
				263	"er",
				264	"dass",
				265	"sie",
				266	"nach",
				267	"um",
				268	"Das",
				269	"zum",
				270	"noch",
				271	"war",
				272	"einen",
				273	"einer",
				274	"wie",
				275	"einem",
				276	"vor",
				277	"bis",
				278	"\u00fcber",
				279	"so",
				280	"aber",
				281	"Eine",
				282	"diese",
				283	"Diese",
				284	"oder"
				285	)
				286	return(res)
				287	}
				288
				289	collocatesQuery <-
				290	function(kco,
				291	query,
				292	vc = "",
				293	minOccur = 5,
				294	leftContextSize = 5,
				295	rightContextSize = 5,
				296	searchHitsSampleLimit = 20000,
				297	ignoreCollocateCase = FALSE,
				298	stopwords = c(),
				299	...) {
				300	frequency <- NULL
				301	q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
				302	if(q@totalResults == 0) {
				303	tibble(word=c(), frequency=c())
				304	} else {
				305	q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
				306	snippet2FreqTable((q@collectedMatches)$snippet,
				307	minOccur = minOccur,
				308	leftContextSize = leftContextSize,
				309	rightContextSize = rightContextSize,
				310	ignoreCollocateCase = ignoreCollocateCase,
				311	stopwords = stopwords,
				312	verbose = kco@verbose) %>%
				313	mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
				314	filter(frequency >= minOccur)
				315	}
				316	}
				317
				318