setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))

#' Collocation analysis
#'
#' @aliases collocationAnalysis
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Performs a collocation analysis for the given node (or query)
#' in the given virtual corpus.
#'
#' @details
#' The collocation analysis is currently implemented on the client side, as some of the
#' functionality is not yet provided by the KorAP backend. Mainly for this reason
#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
#' and look for expression-internal collocates using the focus function (see examples and demo).
#'
#' To increase speed at the cost of accuracy and possible false negatives,
#' you can decrease `searchHitsSampleLimit` and/or `topCollocatesLimit` and/or set
#' `exactFrequencies` to `FALSE` (see the last example below).
#'
#' Note that the analysis currently does not use the tokenization provided by the backend,
#' i.e. the one underlying the corpus itself, but a makeshift one. This can also lead to
#' false negatives and to frequencies that differ from the corresponding ones acquired
#' via the web user interface.
#'
#' @family collocation analysis functions
#'
#' @param lemmatizeNodeQuery if `TRUE`, the node query will be lemmatized, i.e. `x -> [tt/l=x]`
#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
#' @param searchHitsSampleLimit limit the size of the search hits sample
#' @param stopwords vector of stopwords not to be considered as collocates
#' @param exactFrequencies if `FALSE`, extrapolate observed co-occurrence frequencies from frequencies in the search hits sample, otherwise retrieve exact co-occurrence frequencies
#' @param seed seed for random page collecting order
#' @param expand if `TRUE`, `node` and `vc` parameters are expanded to all of their combinations
#' @param ... further arguments will be passed to [collocationScoreQuery()]
#' @inheritParams collocationScoreQuery,KorAPConnection-method
#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
#'
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
#' @importFrom purrr pmap
#' @importFrom tidyr expand_grid
#'
#' @examples
#' \donttest{
#' # Find top collocates of "Packung" inside and outside the sports domain.
#' new("KorAPConnection", verbose = TRUE) %>%
#'   collocationAnalysis("Packung", vc = c("textClass=sport", "textClass!=sport"),
#'     leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20) %>%
#'   dplyr::filter(logDice >= 5)
#' }
#'
#' \donttest{
#' # Identify the most prominent light verb construction with "in ... setzen".
#' # Note that, currently, the use of the focus function requires exactFrequencies = FALSE.
#' new("KorAPConnection", verbose = TRUE) %>%
#'   collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
#'     leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE,
#'     topCollocatesLimit = 20)
#' }
#'
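#' \donttest{
#' # A minimal additional sketch (not from the original examples; "Tee" is an
#' # arbitrary node query): lemmatize the node query ("Tee" -> [tt/l=Tee]) and
#' # trade accuracy for speed with a smaller hits sample and collocate limit,
#' # as described in Details.
#' new("KorAPConnection", verbose = TRUE) %>%
#'   collocationAnalysis("Tee", lemmatizeNodeQuery = TRUE,
#'     searchHitsSampleLimit = 1000, topCollocatesLimit = 10)
#' }
#'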
#' @export
setMethod("collocationAnalysis", "KorAPConnection",
  function(kco,
           node,
           vc = "",
           lemmatizeNodeQuery = FALSE,
           minOccur = 5,
           leftContextSize = 5,
           rightContextSize = 5,
           topCollocatesLimit = 200,
           searchHitsSampleLimit = 20000,
           ignoreCollocateCase = FALSE,
           withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
           exactFrequencies = TRUE,
           stopwords = RKorAPClient::synsemanticStopwords(),
           seed = 7,
           expand = length(vc) != length(node),
           ...) {
    # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
    word <- frequency <- NULL

    # Check is.null before is.na: is.na(NULL) has length zero and would break `&&`
    if (!exactFrequencies && (!is.null(withinSpan) && !is.na(withinSpan) && nchar(withinSpan) > 0)) {
      stop(sprintf("A non-empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
    }

    warnIfNoAccessToken(kco)

    if (lemmatizeNodeQuery) {
      node <- lemmatizeWordQuery(node)
    }

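    # When several nodes and/or virtual corpora are given, run one analysis per
    # combination (full cross product if expand = TRUE, pairwise otherwise) and
    # bind the individual result tibbles into one.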
    if (length(node) > 1 || length(vc) > 1) {
      grid <- if (expand) expand_grid(node = node, vc = vc) else tibble(node = node, vc = vc)
      purrr::pmap(grid, function(node, vc, ...)
        collocationAnalysis(kco,
                            node = node,
                            vc = vc,
                            minOccur = minOccur,
                            leftContextSize = leftContextSize,
                            rightContextSize = rightContextSize,
                            topCollocatesLimit = topCollocatesLimit,
                            searchHitsSampleLimit = searchHitsSampleLimit,
                            ignoreCollocateCase = ignoreCollocateCase,
                            withinSpan = withinSpan,
                            exactFrequencies = exactFrequencies,
                            stopwords = stopwords,
                            seed = seed,
                            expand = expand,
                            ...)) %>%
        bind_rows()
    } else {
      set.seed(seed)
      candidates <- collocatesQuery(
        kco,
        node,
        vc = vc,
        minOccur = minOccur,
        leftContextSize = leftContextSize,
        rightContextSize = rightContextSize,
        searchHitsSampleLimit = searchHitsSampleLimit,
        ignoreCollocateCase = ignoreCollocateCase,
        stopwords = stopwords,
        ...
      )

      if (nrow(candidates) > 0) {
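        # Restrict scoring to sufficiently frequent candidates and to the
        # topCollocatesLimit most frequent collocates found in the sample.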
        candidates <- candidates %>%
          filter(frequency >= minOccur) %>%
          head(topCollocatesLimit)
        collocationScoreQuery(
          kco,
          node = node,
          collocate = candidates$word,
          vc = vc,
          leftContextSize = leftContextSize,
          rightContextSize = rightContextSize,
          observed = if (exactFrequencies) NA else candidates$frequency,
          ignoreCollocateCase = ignoreCollocateCase,
          withinSpan = withinSpan,
          ...
        ) %>%
          filter(.$O >= minOccur) %>%
          dplyr::arrange(dplyr::desc(logDice))
      } else {
        tibble()
      }
    }
  }
)

#' @importFrom magrittr debug_pipe
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr as_tibble tibble rename filter anti_join bind_rows case_when
#'
snippet2FreqTable <- function(snippet,
                              minOccur = 5,
                              leftContextSize = 5,
                              rightContextSize = 5,
                              ignoreCollocateCase = FALSE,
                              stopwords = c(),
                              tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
                              oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
                              verbose = TRUE) {
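  # Internal helper: builds a collocate frequency table from KWIC snippet HTML.
  # A vector of snippets is processed recursively and aggregated; a single
  # snippet is split into left/right context tokens using tokenizeRegex.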
  word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
  frequency <- NULL

  if (length(snippet) < 1) {
    dplyr::tibble(word = c(), frequency = c())
  } else if (length(snippet) > 1) {
    log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
    for (s in snippet) {
      oldTable <- snippet2FreqTable(
        s,
        leftContextSize = leftContextSize,
        rightContextSize = rightContextSize,
        oldTable = oldTable,
        stopwords = stopwords
      )
    }
    log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
    oldTable %>%
      # Lowercase before grouping, so that case variants are actually merged
      mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
      group_by(word) %>%
      summarise(frequency = sum(frequency), .groups = "drop") %>%
      arrange(desc(frequency))
  } else {
    stopwordsTable <- dplyr::tibble(word = stopwords)
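    # Columns 3 and 4 of the match matrix hold the plain text of the left and
    # right contexts captured from the snippet HTML.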
    match <-
      str_match(
        snippet,
        '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
      )

    left <- if (leftContextSize > 0)
      tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
    else
      ""
    # cat(paste("left:", left, "\n", collapse = " "))

    right <- if (rightContextSize > 0)
      head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
    else
      ""
    # cat(paste("right:", right, "\n", collapse = " "))

    if (anyNA(left) || anyNA(right) || length(left) + length(right) == 0) {
      oldTable
    } else {
      table(c(left, right)) %>%
        dplyr::as_tibble(.name_repair = "minimal") %>%
        dplyr::rename(word = 1, frequency = 2) %>%
        dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
        dplyr::anti_join(stopwordsTable, by = "word") %>%
        dplyr::bind_rows(oldTable)
    }
  }
}
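
# Hypothetical usage sketch for the helper above (the snippet string is made up,
# but follows the KWIC markup matched by the regular expression in
# snippet2FreqTable); kept as a comment because the function is internal:
#
# snippet2FreqTable(
#   paste0('<span class="context-left">in der </span>',
#          '<span class="match"><mark>Packung</mark></span>',
#          '<span class="context-right"> enthalten</span>'),
#   leftContextSize = 1, rightContextSize = 1, stopwords = c("in")
# )
# # -> word/frequency tibble with one row each for "der" and "enthalten"
# #    (plus the NA seed row contributed by the default oldTable argument)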

#' Preliminary synsemantic stopwords function
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Preliminary synsemantic stopwords function to be used in collocation analysis.
#'
#' @details
#' Currently only suitable for German. See the stopwords package for other languages.
#'
#' @param ... future arguments for language detection
#'
#' @family collocation analysis functions
#' @return Vector of synsemantic stopwords.
#' @export
synsemanticStopwords <- function(...) {
  res <- c(
    "der",
    "die",
    "und",
    "in",
    "den",
    "von",
    "mit",
    "das",
    "zu",
    "im",
    "ist",
    "auf",
    "sich",
    "Die",
    "des",
    "dem",
    "nicht",
    "ein",
    "eine",
    "es",
    "auch",
    "an",
    "als",
    "am",
    "aus",
    "Der",
    "bei",
    "er",
    "dass",
    "sie",
    "nach",
    "um",
    "Das",
    "zum",
    "noch",
    "war",
    "einen",
    "einer",
    "wie",
    "einem",
    "vor",
    "bis",
    "\u00fcber",
    "so",
    "aber",
    "Eine",
    "diese",
    "Diese",
    "oder"
  )
  return(res)
}
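
# Usage sketch (hypothetical): extend the default German stopword list with
# additional words before passing it to collocationAnalysis():
#
# myStopwords <- c(RKorAPClient::synsemanticStopwords(), "Jahr", "Prozent")
# new("KorAPConnection") %>%
#   collocationAnalysis("Packung", stopwords = myStopwords)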

collocatesQuery <-
  function(kco,
           query,
           vc = "",
           minOccur = 5,
           leftContextSize = 5,
           rightContextSize = 5,
           searchHitsSampleLimit = 20000,
           ignoreCollocateCase = FALSE,
           stopwords = c(),
           ...) {
    frequency <- NULL
    q <- corpusQuery(kco, query, vc, metadataOnly = FALSE, ...)
    if (q@totalResults == 0) {
      tibble(word = c(), frequency = c())
    } else {
      q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
      snippet2FreqTable((q@collectedMatches)$snippet,
                        minOccur = minOccur,
                        leftContextSize = leftContextSize,
                        rightContextSize = rightContextSize,
                        ignoreCollocateCase = ignoreCollocateCase,
                        stopwords = stopwords,
                        verbose = kco@verbose) %>%
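        # Extrapolate sample frequencies to the full result set; if the sample
        # covers all hits, the scaling factor is 1 and frequencies stay exact.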
        mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
        filter(frequency >= minOccur)
    }
  }