Blame - R/collocationAnalysis.R - KorAP/RKorAPClient

blob: bf962f68ff91edd4a2748e4575ad8828fb822f3b [file] [log] [blame]

Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1	setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
				2
				3	#' Collocation analysis
				4	#'
				5	#' @aliases collocationAnalysis
				6	#'
				7	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	8	#' `r lifecycle::badge("experimental")`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	9	#'
				10	#' Performs a collocation analysis for the given node (or query)
				11	#' in the given virtual corpus.
				12	#'
				13	#' @details
				14	#' The collocation analysis is currently implemented on the client side, as some of the
				15	#' functionality is not yet provided by the KorAP backend. Mainly for this reason
				16	#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
				17	#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
				18	#' and look for expression-internal collocates using the focus function (see examples and demo).
				19	#'
				20	#' To increase speed at the cost of accuracy and possible false negatives,
				21	#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
				22	#'
				23	#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
				24	#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
				25	#' user interface.
				26	#'
				27	#' @family collocation analysis functions
				28	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	29	#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	30	#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
				31	#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
				32	#' @param searchHitsSampleLimit limit the size of the search hits sample
				33	#' @param stopwords vector of stopwords not to be considered as collocates
				34	#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
				35	#' @param seed seed for random page collecting order
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	36	#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
				37	#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	38	#' @inheritParams collocationScoreQuery,KorAPConnection-method
				39	#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
				40	#'
				41	#' @importFrom stringr str_match str_split str_detect
				42	#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
				43	#' @importFrom purrr pmap
				44	#' @importFrom tidyr expand_grid
				45	#'
				46	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	47	#' \dontrun{
				48	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	49	#' # Find top collocates of "Packung" inside and outside the sports domain.
				50	#' new("KorAPConnection", verbose = TRUE) %>%
				51	#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
				52	#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
				53	#' dplyr::filter(logDice >= 5)
				54	#' }
				55	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	56	#' \dontrun{
				57	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	58	#' # Identify the most prominent light verb construction with "in ... setzen".
				59	#' # Note that, currently, the use of focus function disallows exactFrequencies.
				60	#' new("KorAPConnection", verbose = TRUE) %>%
				61	#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
				62	#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
				63	#' }
				64	#'
				65	#' @export
				66	setMethod("collocationAnalysis", "KorAPConnection",
				67	function(kco,
				68	node,
				69	vc = "",
				70	lemmatizeNodeQuery = FALSE,
				71	minOccur = 5,
				72	leftContextSize = 5,
				73	rightContextSize = 5,
				74	topCollocatesLimit = 200,
				75	searchHitsSampleLimit = 20000,
				76	ignoreCollocateCase = FALSE,
				77	withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
				78	exactFrequencies = TRUE,
				79	stopwords = RKorAPClient::synsemanticStopwords(),
				80	seed = 7,
				81	expand = length(vc) != length(node),
				82	...) {
				83	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				84	word <- frequency <- NULL
				85
				86	if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
				87	stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
				88	}
				89
Marc Kupietz	581a29b	2021-09-04 20:51:04 +0200	[diff] [blame]	90	warnIfNoAccessToken(kco)
				91
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	92	if (lemmatizeNodeQuery) {
				93	node <- lemmatizeWordQuery(node)
				94	}
				95
				96	if (length(node) > 1 \|\| length(vc) > 1) {
				97	grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
				98	purrr::pmap(grid, function(node, vc, ...)
				99	collocationAnalysis(kco,
				100	node =node,
				101	vc = vc,
				102	minOccur = minOccur,
				103	leftContextSize = leftContextSize,
				104	rightContextSize = rightContextSize,
				105	topCollocatesLimit = topCollocatesLimit,
				106	searchHitsSampleLimit = searchHitsSampleLimit,
				107	ignoreCollocateCase = ignoreCollocateCase,
				108	withinSpan = withinSpan,
				109	exactFrequencies = exactFrequencies,
				110	stopwords = stopwords,
				111	seed = seed,
				112	expand = expand,
				113	...) ) %>%
				114	bind_rows()
				115	} else {
				116	set.seed(seed)
				117	candidates <- collocatesQuery(
				118	kco,
				119	node,
				120	vc = vc,
				121	minOccur = minOccur,
				122	leftContextSize = leftContextSize,
				123	rightContextSize = rightContextSize,
				124	searchHitsSampleLimit = searchHitsSampleLimit,
				125	ignoreCollocateCase = ignoreCollocateCase,
				126	stopwords = stopwords,
				127	...
				128	)
				129
				130	if (nrow(candidates) > 0) {
				131	candidates <- candidates %>%
				132	filter(frequency >= minOccur) %>%
				133	head(topCollocatesLimit)
				134	collocationScoreQuery(
				135	kco,
				136	node = node,
				137	collocate = candidates$word,
				138	vc = vc,
				139	leftContextSize = leftContextSize,
				140	rightContextSize = rightContextSize,
				141	observed = if (exactFrequencies) NA else candidates$frequency,
				142	ignoreCollocateCase = ignoreCollocateCase,
				143	withinSpan = withinSpan,
				144	...
				145	) %>%
				146	filter(.$O >= minOccur) %>%
				147	dplyr::arrange(dplyr::desc(logDice))
				148	} else {
				149	tibble()
				150	}
				151	}
				152	}
				153	)
				154
				155	#' @importFrom magrittr debug_pipe
				156	#' @importFrom stringr str_match str_split str_detect
				157	#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
				158	#'
				159	snippet2FreqTable <- function(snippet,
				160	minOccur = 5,
				161	leftContextSize = 5,
				162	rightContextSize = 5,
				163	ignoreCollocateCase = FALSE,
				164	stopwords = c(),
				165	tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+\|")",
				166	oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
				167	verbose = TRUE) {
				168	word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				169	frequency <- NULL
				170
				171	if (length(snippet) < 1) {
				172	dplyr::tibble(word=c(), frequency=c())
				173	} else if (length(snippet) > 1) {
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame]	174	log.info(verbose, paste("Joinging", length(snippet), "kwics\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	175	for (s in snippet) {
				176	oldTable <- snippet2FreqTable(
				177	s,
				178	leftContextSize = leftContextSize,
				179	rightContextSize = rightContextSize,
				180	oldTable = oldTable,
				181	stopwords = stopwords
				182	)
				183	}
Marc Kupietz	d07bf19	2021-09-04 20:24:44 +0200	[diff] [blame]	184	log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	185	oldTable %>%
				186	group_by(word) %>%
				187	mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
				188	summarise(frequency=sum(frequency), .groups = "drop") %>%
				189	arrange(desc(frequency))
				190	} else {
				191	stopwordsTable <- dplyr::tibble(word=stopwords)
				192	match <-
				193	str_match(
				194	snippet,
				195	'<span class="context-left">(<span class="more"></span>)?(.[^ ]) </span><span class="match"><mark>.</mark></span><span class="context-right"> ([^<]*)'
				196	)
				197
				198	left <- if(leftContextSize > 0)
				199	tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
				200	else
				201	""
				202	# cat(paste("left:", left, "\n", collapse=" "))
				203
				204	right <- if(rightContextSize > 0)
				205	head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
				206	else
				207	""
				208	# cat(paste("right:", right, "\n", collapse=" "))
				209
				210	if(is.na(left) \|\| is.na(right) \|\| length(left) + length(right) == 0) {
				211	oldTable
				212	} else {
				213	table(c(left, right)) %>%
				214	dplyr::as_tibble(.name_repair = "minimal") %>%
				215	dplyr::rename(word = 1, frequency = 2) %>%
				216	dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
				217	dplyr::anti_join(stopwordsTable, by="word") %>%
				218	dplyr::bind_rows(oldTable)
				219	}
				220	}
				221	}
				222
				223	#' Preliminary synsemantic stopwords function
				224	#'
				225	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	226	#' `r lifecycle::badge("experimental")`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	227	#'
				228	#' Preliminary synsemantic stopwords function to be used in collocation analysis.
				229	#'
				230	#' @details
				231	#' Currently only suitable for German. See stopwords package for other languages.
				232	#'
				233	#' @param ... future arguments for language detection
				234	#'
				235	#' @family collocation analysis functions
				236	#' @return Vector of synsemantic stopwords.
				237	#' @export
				238	synsemanticStopwords <- function(...) {
				239	res <- c(
				240	"der",
				241	"die",
				242	"und",
				243	"in",
				244	"den",
				245	"von",
				246	"mit",
				247	"das",
				248	"zu",
				249	"im",
				250	"ist",
				251	"auf",
				252	"sich",
				253	"Die",
				254	"des",
				255	"dem",
				256	"nicht",
				257	"ein",
				258	"eine",
				259	"es",
				260	"auch",
				261	"an",
				262	"als",
				263	"am",
				264	"aus",
				265	"Der",
				266	"bei",
				267	"er",
				268	"dass",
				269	"sie",
				270	"nach",
				271	"um",
				272	"Das",
				273	"zum",
				274	"noch",
				275	"war",
				276	"einen",
				277	"einer",
				278	"wie",
				279	"einem",
				280	"vor",
				281	"bis",
				282	"\u00fcber",
				283	"so",
				284	"aber",
				285	"Eine",
				286	"diese",
				287	"Diese",
				288	"oder"
				289	)
				290	return(res)
				291	}
				292
				293	collocatesQuery <-
				294	function(kco,
				295	query,
				296	vc = "",
				297	minOccur = 5,
				298	leftContextSize = 5,
				299	rightContextSize = 5,
				300	searchHitsSampleLimit = 20000,
				301	ignoreCollocateCase = FALSE,
				302	stopwords = c(),
				303	...) {
				304	frequency <- NULL
				305	q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
				306	if(q@totalResults == 0) {
				307	tibble(word=c(), frequency=c())
				308	} else {
				309	q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
				310	snippet2FreqTable((q@collectedMatches)$snippet,
				311	minOccur = minOccur,
				312	leftContextSize = leftContextSize,
				313	rightContextSize = rightContextSize,
				314	ignoreCollocateCase = ignoreCollocateCase,
				315	stopwords = stopwords,
				316	verbose = kco@verbose) %>%
				317	mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
				318	filter(frequency >= minOccur)
				319	}
				320	}