Blame - R/collocationAnalysis.R - KorAP/RKorAPClient

blob: 4344d1ecb4846676407c545332f53d3562caac3e [file] [log] [blame]

Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1	#' @include logging.R
				2	setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	3
				4	#' Collocation analysis
				5	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	6	#' @family collocation analysis functions
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	7	#' @aliases collocationAnalysis
				8	#'
				9	#' @description
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	10	#'
				11	#' Performs a collocation analysis for the given node (or query)
				12	#' in the given virtual corpus.
				13	#'
				14	#' @details
				15	#' The collocation analysis is currently implemented on the client side, as some of the
				16	#' functionality is not yet provided by the KorAP backend. Mainly for this reason
				17	#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
				18	#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
				19	#' and look for expression-internal collocates using the focus function (see examples and demo).
				20	#'
				21	#' To increase speed at the cost of accuracy and possible false negatives,
				22	#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
				23	#'
Marc Kupietz	e7f0d68	2025-02-19 10:50:59 +0100	[diff] [blame]	24	#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
				25	#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
				26	#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	27	#' user interface.
				28	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	29	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	30	#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	31	#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
				32	#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
				33	#' @param searchHitsSampleLimit limit the size of the search hits sample
				34	#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz	6bd9cad	2024-12-18 15:57:26 +0100	[diff] [blame]	35	#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	36	#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
				37	#' @param seed seed for random page collecting order
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	38	#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz	7d400e0	2021-12-19 16:39:36 +0100	[diff] [blame]	39	#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
				40	#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
Marc Kupietz	2b0b0a1	2025-10-19 14:49:14 +0200	[diff] [blame^]	41	#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls (only applied when \code{maxRecurse > 0})
				42	#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively (only applied when \code{maxRecurse > 0})
Marc Kupietz	7d400e0	2021-12-19 16:39:36 +0100	[diff] [blame]	43	#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame]	44	#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz	de679ea	2025-10-19 13:14:51 +0200	[diff] [blame]	45	#' @param queryMissingScores if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold
Marc Kupietz	4cbb547	2025-10-19 12:15:25 +0200	[diff] [blame]	46	#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	47	#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	48	#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	49	#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	50	#' @return
				51	#' A tibble where each row represents a candidate collocate for the requested node.
				52	#' Columns include (depending on the selected association measures):
				53	#'
				54	#' \itemize{
				55	#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
				56	#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
				57	#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
				58	#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
				59	#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
				60	#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
				61	#' \item Summary columns describing the strongest labels per measure (\code{winner_}, \code{runner_up_}, \code{loser_}, and \code{max_delta_}).
				62	#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
				63	#' }
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	64	#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	65	#' @importFrom purrr pmap
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	66	#' @importFrom tidyr expand_grid pivot_wider
				67	#' @importFrom rlang sym
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	68	#'
				69	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	70	#' \dontrun{
				71	#'
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	72	#' # Find top collocates of "Packung" inside and outside the sports domain.
				73	#' KorAPConnection(verbose = TRUE) \|>
				74	#' collocationAnalysis("Packung",
				75	#' vc = c("textClass=sport", "textClass!=sport"),
				76	#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
				77	#' ) \|>
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	78	#' dplyr::filter(logDice >= 5)
				79	#' }
				80	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	81	#' \dontrun{
				82	#'
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	83	#' # Identify the most prominent light verb construction with "in ... setzen".
				84	#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz	4cd066d	2025-02-28 15:48:23 +0100	[diff] [blame]	85	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	86	#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	87	#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
				88	#' )
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	89	#' }
				90	#'
				91	#' @export
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	92	setMethod(
				93	"collocationAnalysis", "KorAPConnection",
				94	function(kco,
				95	node,
				96	vc = "",
				97	lemmatizeNodeQuery = FALSE,
				98	minOccur = 5,
				99	leftContextSize = 5,
				100	rightContextSize = 5,
				101	topCollocatesLimit = 200,
				102	searchHitsSampleLimit = 20000,
				103	ignoreCollocateCase = FALSE,
				104	withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
				105	exactFrequencies = TRUE,
				106	stopwords = append(RKorAPClient::synsemanticStopwords(), node),
				107	seed = 7,
				108	expand = length(vc) != length(node),
				109	maxRecurse = 0,
				110	addExamples = FALSE,
				111	thresholdScore = "logDice",
				112	threshold = 2.0,
				113	localStopwords = c(),
				114	collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz	de679ea	2025-10-19 13:14:51 +0200	[diff] [blame]	115	queryMissingScores = FALSE,
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	116	missingScoreQuantile = 0.05,
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	117	vcLabel = NA_character_,
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	118	...) {
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	119	word <- frequency <- O <- NULL
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	120
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	121	if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	122	stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
				123	}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	124
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	125	warnIfNotAuthorized(kco)
Marc Kupietz	581a29b	2021-09-04 20:51:04 +0200	[diff] [blame]	126
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	127	if (lemmatizeNodeQuery) {
				128	node <- lemmatizeWordQuery(node)
				129	}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	130
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	131	vcNames <- names(vc)
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	132	if (is.null(vcNames)) {
				133	vcNames <- rep(NA_character_, length(vc))
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	134	}
				135
				136	label_lookup <- NULL
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	137	if (!is.null(names(vc)) && length(vc) > 0) {
				138	raw_names <- names(vc)
				139	if (any(!is.na(raw_names) & raw_names != "")) {
				140	label_lookup <- stats::setNames(raw_names, vc)
				141	}
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	142	}
				143
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	144	result <- if (length(node) > 1 \|\| length(vc) > 1) {
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	145	grid <- if (expand) {
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	146	tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
				147	tmp_grid$vc <- vc[tmp_grid$idx]
				148	tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
				149	tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	150	} else {
				151	tibble(node = node, vc = vc, vcLabel = vcNames)
				152	}
				153
				154	multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	155	collocationAnalysis(kco,
				156	node = node,
				157	vc = vc,
				158	minOccur = minOccur,
				159	leftContextSize = leftContextSize,
				160	rightContextSize = rightContextSize,
				161	topCollocatesLimit = topCollocatesLimit,
				162	searchHitsSampleLimit = searchHitsSampleLimit,
				163	ignoreCollocateCase = ignoreCollocateCase,
				164	withinSpan = withinSpan,
				165	exactFrequencies = exactFrequencies,
				166	stopwords = stopwords,
				167	addExamples = TRUE,
				168	localStopwords = localStopwords,
				169	seed = seed,
				170	expand = expand,
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	171	missingScoreQuantile = missingScoreQuantile,
Marc Kupietz	de679ea	2025-10-19 13:14:51 +0200	[diff] [blame]	172	queryMissingScores = queryMissingScores,
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	173	collocateFilterRegex = collocateFilterRegex,
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	174	vcLabel = vcLabel,
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	175	...
				176	)
				177	}) \|>
Marc Kupietz	e31322e	2025-10-17 18:55:36 +0200	[diff] [blame]	178	bind_rows()
				179
				180	if (!"vc" %in% names(multi_result) \|\| nrow(multi_result) == 0) {
				181	multi_result
				182	} else {
Marc Kupietz	de679ea	2025-10-19 13:14:51 +0200	[diff] [blame]	183	if (queryMissingScores) {
				184	multi_result <- backfill_missing_scores(
				185	multi_result,
				186	grid = grid,
				187	kco = kco,
				188	ignoreCollocateCase = ignoreCollocateCase,
				189	...
				190	)
				191	}
				192
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	193	if (!"label" %in% names(multi_result)) {
				194	multi_result$label <- NA_character_
				195	}
				196
				197	if (!is.null(label_lookup)) {
				198	override <- unname(label_lookup[multi_result$vc])
				199	missing_idx <- is.na(multi_result$label) \| multi_result$label == ""
				200	if (any(missing_idx)) {
				201	multi_result$label[missing_idx] <- override[missing_idx]
				202	}
				203	}
				204
				205	missing_idx <- is.na(multi_result$label) \| multi_result$label == ""
				206	if (any(missing_idx)) {
				207	multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
				208	}
				209
Marc Kupietz	e31322e	2025-10-17 18:55:36 +0200	[diff] [blame]	210	multi_result \|>
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	211	add_multi_vc_comparisons(
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	212	missingScoreQuantile = missingScoreQuantile
				213	)
Marc Kupietz	e31322e	2025-10-17 18:55:36 +0200	[diff] [blame]	214	}
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	215	} else {
Marc Kupietz	e34a8be	2025-10-17 20:13:42 +0200	[diff] [blame]	216	if ((is.na(vcLabel) \|\| vcLabel == "") && length(vcNames) >= 1) {
				217	vcLabel <- vcNames[1]
				218	}
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	219
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	220	set.seed(seed)
				221	candidates <- collocatesQuery(
				222	kco,
				223	node,
				224	vc = vc,
				225	minOccur = minOccur,
				226	leftContextSize = leftContextSize,
				227	rightContextSize = rightContextSize,
				228	searchHitsSampleLimit = searchHitsSampleLimit,
				229	ignoreCollocateCase = ignoreCollocateCase,
				230	stopwords = append(stopwords, localStopwords),
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	231	collocateFilterRegex = collocateFilterRegex,
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	232	...
				233	)
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	234
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	235	if (nrow(candidates) > 0) {
				236	candidates <- candidates \|>
				237	filter(frequency >= minOccur) \|>
				238	slice_head(n = topCollocatesLimit)
				239	collocationScoreQuery(
				240	kco,
				241	node = node,
				242	collocate = candidates$word,
				243	vc = vc,
				244	leftContextSize = leftContextSize,
				245	rightContextSize = rightContextSize,
				246	observed = if (exactFrequencies) NA else candidates$frequency,
				247	ignoreCollocateCase = ignoreCollocateCase,
				248	withinSpan = withinSpan,
				249	...
				250	) \|>
				251	filter(O >= minOccur) \|>
				252	dplyr::arrange(dplyr::desc(logDice))
				253	} else {
				254	tibble()
				255	}
				256	}
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	257
				258	if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
				259	result$label <- rep(vcLabel, nrow(result))
				260	}
				261
				262	threshold_col <- thresholdScore
				263	if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
				264	threshold_values <- result[[threshold_col]]
				265	eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
				266	if (length(eligible_idx) > 0) {
				267	recurseWith <- result[eligible_idx, , drop = FALSE]
				268	result <- collocationAnalysis(
				269	kco,
				270	node = paste0("(", buildCollocationQuery(
				271	removeWithinSpan(recurseWith$node, withinSpan),
				272	recurseWith$collocate,
				273	leftContextSize = leftContextSize,
				274	rightContextSize = rightContextSize,
				275	withinSpan = ""
				276	), ")"),
				277	vc = vc,
				278	minOccur = minOccur,
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	279	leftContextSize = leftContextSize,
				280	rightContextSize = rightContextSize,
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	281	withinSpan = withinSpan,
				282	maxRecurse = maxRecurse - 1,
				283	stopwords = stopwords,
				284	localStopwords = recurseWith$collocate,
				285	exactFrequencies = exactFrequencies,
				286	searchHitsSampleLimit = searchHitsSampleLimit,
				287	topCollocatesLimit = topCollocatesLimit,
				288	addExamples = FALSE,
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	289	missingScoreQuantile = missingScoreQuantile,
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	290	collocateFilterRegex = collocateFilterRegex,
Marc Kupietz	de679ea	2025-10-19 13:14:51 +0200	[diff] [blame]	291	queryMissingScores = queryMissingScores,
Marc Kupietz	2b0b0a1	2025-10-19 14:49:14 +0200	[diff] [blame^]	292	thresholdScore = thresholdScore,
				293	threshold = threshold,
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	294	vcLabel = vcLabel
				295	) \|>
Marc Kupietz	2b0b0a1	2025-10-19 14:49:14 +0200	[diff] [blame^]	296	bind_rows(result)
				297
				298	if (threshold_col %in% names(result)) {
				299	threshold_values <- result[[threshold_col]]
				300	keep_idx <- is.na(threshold_values) \| threshold_values >= threshold
				301	result <- result[keep_idx, , drop = FALSE]
				302	}
				303
				304	result <- result \|>
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	305	filter(O >= minOccur) \|>
				306	dplyr::arrange(dplyr::desc(logDice))
				307	}
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	308	}
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	309
				310	if (addExamples && nrow(result) > 0) {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	311	result$query <- buildCollocationQuery(
				312	result$node,
				313	result$collocate,
				314	leftContextSize = leftContextSize,
				315	rightContextSize = rightContextSize,
				316	withinSpan = withinSpan
				317	)
				318	result$example <- findExample(
				319	kco,
				320	query = result$query,
				321	vc = result$vc
				322	)
				323	}
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	324
Marc Kupietz	0a29263	2025-10-19 14:04:36 +0200	[diff] [blame]	325	if (!is.null(withinSpan) && !is.na(withinSpan) && nzchar(withinSpan) &&
				326	nrow(result) > 0 &&
				327	"webUIRequestUrl" %in% names(result) &&
				328	"query" %in% names(result)) {
				329	candidate_rows <- which(!is.na(result$node) &
				330	!grepl("focus\\(", result$node, perl = TRUE) &
				331	!is.na(result$query) & nzchar(result$query))
				332
				333	if (length(candidate_rows) > 0) {
				334	focused_queries <- vapply(
				335	result$query[candidate_rows],
				336	inject_focus_into_query,
				337	character(1)
				338	)
				339
				340	changed <- focused_queries != result$query[candidate_rows]
				341	if (any(changed)) {
				342	indices <- candidate_rows[changed]
				343	vc_values <- as.character(result$vc)
				344	vc_values[is.na(vc_values)] <- ""
				345
				346	result$webUIRequestUrl[indices] <- mapply(
				347	function(new_query, vc_value) {
				348	buildWebUIRequestUrlFromString(
				349	kco@KorAPUrl,
				350	new_query,
				351	vc = vc_value,
				352	ql = "poliqarp"
				353	)
				354	},
				355	focused_queries[changed],
				356	vc_values[indices],
				357	USE.NAMES = FALSE
				358	)
				359	}
				360	}
				361	}
				362
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	363	result
				364	}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	365	)
				366
Marc Kupietz	76b0559	2021-12-19 16:26:15 +0100	[diff] [blame]	367	# #' @export
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	368	removeWithinSpan <- function(query, withinSpan) {
				369	if (withinSpan == "") {
				370	return(query)
				371	}
				372	needle <- sprintf("^\$contains\\(<%s>, ?(.*)\${2}$", withinSpan)
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	373	res <- gsub(needle, "\\1", query)
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	374	needle <- sprintf("^contains\$<%s>, ?(.*)\$$", withinSpan)
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	375	res <- gsub(needle, "\\1", res)
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	376	return(res)
				377	}
				378
Marc Kupietz	de679ea	2025-10-19 13:14:51 +0200	[diff] [blame]	379	backfill_missing_scores <- function(result,
				380	grid,
				381	kco,
				382	ignoreCollocateCase,
				383	...) {
				384	if (!"vc" %in% names(result) \|\| !"node" %in% names(result) \|\| !"collocate" %in% names(result)) {
				385	return(result)
				386	}
				387
				388	if (nrow(result) == 0) {
				389	return(result)
				390	}
				391
				392	distinct_pairs <- dplyr::distinct(result, node, collocate)
				393	if (nrow(distinct_pairs) == 0) {
				394	return(result)
				395	}
				396
				397	collocates_by_node <- split(as.character(distinct_pairs$collocate), distinct_pairs$node)
				398	if (length(collocates_by_node) == 0) {
				399	return(result)
				400	}
				401
				402	required_combinations <- unique(as.data.frame(grid[, c("node", "vc", "vcLabel")], drop = FALSE))
				403	for (i in seq_len(nrow(required_combinations))) {
				404	node_value <- required_combinations$node[i]
				405	vc_value <- required_combinations$vc[i]
				406
				407	collocate_pool <- collocates_by_node[[node_value]]
				408	if (is.null(collocate_pool) \|\| length(collocate_pool) == 0) {
				409	next
				410	}
				411
				412	existing_idx <- result$node == node_value & result$vc == vc_value
				413	existing_collocates <- unique(as.character(result$collocate[existing_idx]))
				414	missing_collocates <- setdiff(unique(collocate_pool), existing_collocates)
				415	missing_collocates <- missing_collocates[!is.na(missing_collocates) & nzchar(missing_collocates)]
				416
				417	if (length(missing_collocates) == 0) {
				418	next
				419	}
				420
				421	context_rows <- result[result$node == node_value & result$vc == vc_value, , drop = FALSE]
				422	if (nrow(context_rows) == 0) {
				423	context_rows <- result[result$node == node_value, , drop = FALSE]
				424	}
				425
				426	left_size <- context_rows$leftContextSize[!is.na(context_rows$leftContextSize)][1]
				427	if (is.na(left_size) \|\| length(left_size) == 0) {
				428	left_size <- result$leftContextSize[!is.na(result$leftContextSize)][1]
				429	}
				430	if (is.na(left_size) \|\| length(left_size) == 0) {
				431	left_size <- 5
				432	}
				433
				434	right_size <- context_rows$rightContextSize[!is.na(context_rows$rightContextSize)][1]
				435	if (is.na(right_size) \|\| length(right_size) == 0) {
				436	right_size <- result$rightContextSize[!is.na(result$rightContextSize)][1]
				437	}
				438	if (is.na(right_size) \|\| length(right_size) == 0) {
				439	right_size <- 5
				440	}
				441
				442	within_span_value <- ""
				443	if ("query" %in% names(context_rows)) {
				444	query_candidate <- context_rows$query[!is.na(context_rows$query) & nzchar(context_rows$query)][1]
				445	if (!is.na(query_candidate) && nzchar(query_candidate)) {
				446	match_one <- regexec("^\\(*contains\\(<([^>]+)>,", query_candidate)
				447	matches <- regmatches(query_candidate, match_one)
				448	if (length(matches) >= 1 && length(matches[[1]]) >= 2) {
				449	within_span_value <- matches[[1]][2]
				450	}
				451	}
				452	}
				453
				454	new_rows <- collocationScoreQuery(
				455	kco,
				456	node = node_value,
				457	collocate = missing_collocates,
				458	vc = vc_value,
				459	leftContextSize = left_size,
				460	rightContextSize = right_size,
				461	ignoreCollocateCase = ignoreCollocateCase,
				462	withinSpan = within_span_value,
				463	...
				464	)
				465
				466	if (nrow(new_rows) == 0) {
				467	next
				468	}
				469
				470	if (!is.null(required_combinations$vcLabel[i]) && !is.na(required_combinations$vcLabel[i]) && required_combinations$vcLabel[i] != "" && "label" %in% names(new_rows)) {
				471	new_rows$label <- required_combinations$vcLabel[i]
				472	}
				473
				474	result <- dplyr::bind_rows(result, new_rows)
				475	}
				476
				477	result
				478	}
				479
Marc Kupietz	0a29263	2025-10-19 14:04:36 +0200	[diff] [blame]	480	inject_focus_into_query <- function(query) {
				481	if (is.null(query) \|\| is.na(query)) {
				482	return(query)
				483	}
				484
				485	trimmed <- trimws(query)
				486	if (!nzchar(trimmed)) {
				487	return(query)
				488	}
				489
				490	if (!grepl("^contains\\(<[^>]+>", trimmed, perl = TRUE)) {
				491	return(query)
				492	}
				493
				494	if (grepl("focus\\(", trimmed, perl = TRUE)) {
				495	return(query)
				496	}
				497
				498	pattern <- "^contains\$<([^>]+)>\\s,\\s\\((.)\$\\)\\s$"
				499	matches <- regexec(pattern, trimmed, perl = TRUE)
				500	components <- regmatches(trimmed, matches)
				501	if (length(components) == 0 \|\| length(components[[1]]) < 3) {
				502	return(query)
				503	}
				504
				505	span <- components[[1]][2]
				506	inner <- components[[1]][3]
				507	parts <- strsplit(inner, "\\\|", perl = TRUE)[[1]]
				508	parts <- trimws(parts)
				509	parts <- parts[nzchar(parts)]
				510
				511	if (length(parts) == 0) {
				512	return(query)
				513	}
				514
				515	focused <- paste0("focus({", parts, "})")
				516	combined <- paste(focused, collapse = " \| ")
				517
				518	sprintf("contains(<%s>, (%s))", span, combined)
				519	}
				520
Marc Kupietz	77852b2	2025-10-19 11:35:34 +0200	[diff] [blame]	521	add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	522	label <- node <- collocate <- NULL
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	523
				524	if (!"label" %in% names(result) \|\| dplyr::n_distinct(result$label) < 2) {
				525	return(result)
				526	}
				527
				528	numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
				529	non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
				530	score_cols <- setdiff(numeric_cols, non_score_cols)
				531
				532	if (length(score_cols) == 0) {
				533	return(result)
				534	}
				535
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	536	compute_score_floor <- function(values) {
Marc Kupietz	4cbb547	2025-10-19 12:15:25 +0200	[diff] [blame]	537	# Estimate a conservative floor so missing scores can be imputed without favoring any label
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	538	finite_values <- values[is.finite(values)]
				539	if (length(finite_values) == 0) {
				540	return(0)
				541	}
				542
				543	prob <- min(max(missingScoreQuantile, 0), 0.5)
Marc Kupietz	4cbb547	2025-10-19 12:15:25 +0200	[diff] [blame]	544	# Use a lower quantile as the anchor to stay near the weakest attested scores
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	545	q_val <- suppressWarnings(stats::quantile(finite_values,
				546	probs = prob,
				547	names = FALSE,
				548	type = 7
				549	))
				550
				551	if (!is.finite(q_val)) {
				552	q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
				553	}
				554
				555	min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
				556	if (!is.finite(min_val)) {
				557	min_val <- 0
				558	}
				559
				560	spread_candidates <- c(
				561	suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
				562	stats::sd(finite_values, na.rm = TRUE),
				563	abs(q_val) * 0.1,
				564	abs(min_val - q_val)
				565	)
				566	spread_candidates <- spread_candidates[is.finite(spread_candidates)]
				567
				568	spread <- 0
				569	if (length(spread_candidates) > 0) {
				570	spread <- max(spread_candidates)
				571	}
				572	if (!is.finite(spread) \|\| spread == 0) {
				573	spread <- max(abs(q_val), abs(min_val), 1e-06)
				574	}
				575
Marc Kupietz	4cbb547	2025-10-19 12:15:25 +0200	[diff] [blame]	576	# Step away from the anchor by a robust spread estimate to avoid ties with real scores
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	577	candidate <- q_val - spread
				578	if (!is.finite(candidate)) {
				579	candidate <- min_val
				580	}
				581
				582	floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
				583	if (!is.finite(floor_value)) {
				584	floor_value <- min_val
				585	}
				586	if (!is.finite(floor_value)) {
				587	floor_value <- 0
				588	}
				589
				590	floor_value
				591	}
				592
				593	score_replacements <- stats::setNames(
				594	vapply(score_cols, function(col) {
				595	compute_score_floor(result[[col]])
				596	}, numeric(1)),
				597	score_cols
				598	)
				599
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	600	comparison <- result \|>
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	601	dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) \|>
				602	tidyr::pivot_wider(
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	603	names_from = label,
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	604	values_from = dplyr::all_of(score_cols),
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	605	names_glue = "{.value}_{make.names(label)}",
				606	values_fn = dplyr::first
				607	)
				608
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	609	raw_labels <- unique(result$label)
				610	labels <- make.names(raw_labels)
				611	label_map <- stats::setNames(raw_labels, labels)
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	612
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	613	rank_data <- result \|>
				614	dplyr::distinct(node, collocate)
				615
				616	for (i in seq_along(raw_labels)) {
				617	raw_lab <- raw_labels[i]
				618	safe_lab <- labels[i]
				619	label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
				620	if (nrow(label_df) == 0) {
				621	next
				622	}
				623	label_df <- dplyr::distinct(label_df)
				624	rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
				625	for (col in score_cols) {
				626	rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	627	percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	628	values <- label_df[[col]]
				629	ranks <- rep(NA_real_, length(values))
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	630	percentiles <- rep(NA_real_, length(values))
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	631	valid_idx <- which(!is.na(values))
				632	if (length(valid_idx) > 0) {
				633	ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	634	total <- length(valid_idx)
				635	percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	636	}
				637	rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	638	rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	639	}
				640	rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
				641	}
				642
				643	comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
				644
				645	rank_replacements <- numeric(0)
				646	rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
				647	if (length(rank_column_names) > 0) {
				648	rank_replacements <- stats::setNames(
				649	vapply(rank_column_names, function(col) {
				650	col_values <- comparison[[col]]
				651	valid_values <- col_values[!is.na(col_values)]
				652	if (length(valid_values) == 0) {
				653	nrow(comparison) + 1
				654	} else {
				655	suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
				656	}
				657	}, numeric(1)),
				658	rank_column_names
				659	)
				660	}
				661
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	662	percentile_replacements <- numeric(0)
				663	percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
				664	if (length(percentile_column_names) > 0) {
				665	percentile_replacements <- stats::setNames(
				666	rep(0, length(percentile_column_names)),
				667	percentile_column_names
				668	)
				669	}
				670
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	671	collapse_label_values <- function(indices, safe_labels_vec) {
				672	if (length(indices) == 0) {
				673	return(NA_character_)
				674	}
				675	labs <- label_map[safe_labels_vec[indices]]
				676	fallback <- safe_labels_vec[indices]
				677	labs[is.na(labs) \| labs == ""] <- fallback[is.na(labs) \| labs == ""]
				678	labs <- labs[!is.na(labs) & labs != ""]
				679	if (length(labs) == 0) {
				680	return(NA_character_)
				681	}
				682	paste(unique(labs), collapse = ", ")
				683	}
				684
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	685	if (length(labels) == 2) {
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	686	fill_scores <- function(x, y, measure_col) {
				687	replacement <- score_replacements[[measure_col]]
				688	fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
				689	if (!is.finite(fallback_min)) {
				690	fallback_min <- 0
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	691	}
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	692	if (!is.null(replacement) && is.finite(replacement)) {
				693	replacement <- min(replacement, fallback_min)
				694	} else {
				695	replacement <- fallback_min
				696	}
				697	if (!is.finite(replacement)) {
				698	replacement <- 0
				699	}
				700	if (any(is.na(x))) {
				701	x[is.na(x)] <- replacement
				702	}
				703	if (any(is.na(y))) {
				704	y[is.na(y)] <- replacement
				705	}
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	706	list(x = x, y = y)
				707	}
				708
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	709	fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
				710	replacement_left <- percentile_replacements[[left_pct_col]]
				711	if (is.null(replacement_left) \|\| !is.finite(replacement_left)) {
				712	replacement_left <- 0
				713	}
				714	replacement_right <- percentile_replacements[[right_pct_col]]
				715	if (is.null(replacement_right) \|\| !is.finite(replacement_right)) {
				716	replacement_right <- 0
				717	}
				718	if (any(is.na(x))) {
				719	x[is.na(x)] <- replacement_left
				720	}
				721	if (any(is.na(y))) {
				722	y[is.na(y)] <- replacement_right
				723	}
				724	list(x = x, y = y)
				725	}
				726
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	727	fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
				728	fallback <- nrow(comparison) + 1
				729	replacement_left <- rank_replacements[[left_rank_col]]
				730	if (is.null(replacement_left) \|\| !is.finite(replacement_left)) {
				731	replacement_left <- fallback
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	732	}
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	733	replacement_right <- rank_replacements[[right_rank_col]]
				734	if (is.null(replacement_right) \|\| !is.finite(replacement_right)) {
				735	replacement_right <- fallback
				736	}
				737	if (any(is.na(x))) {
				738	x[is.na(x)] <- replacement_left
				739	}
				740	if (any(is.na(y))) {
				741	y[is.na(y)] <- replacement_right
				742	}
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	743	list(x = x, y = y)
				744	}
				745
				746	left_label <- labels[1]
				747	right_label <- labels[2]
				748
				749	for (col in score_cols) {
				750	left_col <- paste0(col, "_", left_label)
				751	right_col <- paste0(col, "_", right_label)
				752	if (!all(c(left_col, right_col) %in% names(comparison))) {
				753	next
				754	}
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	755	filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	756	comparison[[left_col]] <- filled$x
				757	comparison[[right_col]] <- filled$y
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	758	comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	759	rank_left <- paste0("rank_", left_label, "_", col)
				760	rank_right <- paste0("rank_", right_label, "_", col)
				761	if (all(c(rank_left, rank_right) %in% names(comparison))) {
				762	filled_rank <- fill_ranks(
				763	comparison[[rank_left]],
				764	comparison[[rank_right]],
				765	rank_left,
				766	rank_right
				767	)
				768	comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
				769	}
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	770	pct_left <- paste0("percentile_rank_", left_label, "_", col)
				771	pct_right <- paste0("percentile_rank_", right_label, "_", col)
				772	if (all(c(pct_left, pct_right) %in% names(comparison))) {
				773	filled_pct <- fill_percentiles(
				774	comparison[[pct_left]],
				775	comparison[[pct_right]],
				776	pct_left,
				777	pct_right
				778	)
				779	comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
				780	}
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	781	}
				782	}
				783
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	784	for (col in score_cols) {
				785	value_cols <- paste0(col, "_", labels)
				786	existing <- value_cols %in% names(comparison)
				787	if (!any(existing)) {
				788	next
				789	}
				790	value_cols <- value_cols[existing]
				791	safe_labels <- labels[existing]
				792
				793	score_values <- comparison[, value_cols, drop = FALSE]
				794
				795	winner_label_col <- paste0("winner_", col)
				796	winner_value_col <- paste0("winner_", col, "_value")
				797	runner_label_col <- paste0("runner_up_", col)
				798	runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	799	loser_label_col <- paste0("loser_", col)
				800	loser_value_col <- paste0("loser_", col, "_value")
				801	max_delta_col <- paste0("max_delta_", col)
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	802
				803	if (nrow(score_values) == 0) {
				804	comparison[[winner_label_col]] <- character(0)
				805	comparison[[winner_value_col]] <- numeric(0)
				806	comparison[[runner_label_col]] <- character(0)
				807	comparison[[runner_value_col]] <- numeric(0)
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	808	comparison[[loser_label_col]] <- character(0)
				809	comparison[[loser_value_col]] <- numeric(0)
				810	comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	811	next
				812	}
				813
				814	score_matrix <- as.matrix(score_values)
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	815	storage.mode(score_matrix) <- "numeric"
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	816
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	817	n_rows <- nrow(score_matrix)
				818	winner_labels <- rep(NA_character_, n_rows)
				819	winner_values <- rep(NA_real_, n_rows)
				820	runner_labels <- rep(NA_character_, n_rows)
				821	runner_values <- rep(NA_real_, n_rows)
				822	loser_labels <- rep(NA_character_, n_rows)
				823	loser_values <- rep(NA_real_, n_rows)
				824	max_deltas <- rep(NA_real_, n_rows)
				825
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	826	if (n_rows > 0) {
				827	for (i in seq_len(n_rows)) {
				828	numeric_row <- as.numeric(score_matrix[i, ])
				829	if (all(is.na(numeric_row))) {
				830	next
				831	}
				832
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	833	replacement <- score_replacements[[col]]
				834	fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
				835	if (!is.finite(fallback_min)) {
				836	fallback_min <- 0
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	837	}
Marc Kupietz	9894a37	2025-10-18 14:51:29 +0200	[diff] [blame]	838	if (!is.null(replacement) && is.finite(replacement)) {
				839	replacement <- min(replacement, fallback_min)
				840	} else {
				841	replacement <- fallback_min
				842	}
				843	if (!is.finite(replacement)) {
				844	replacement <- 0
				845	}
				846	if (any(is.na(numeric_row))) {
				847	numeric_row[is.na(numeric_row)] <- replacement
				848	}
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	849	score_matrix[i, ] <- numeric_row
				850
				851	max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
				852	max_idx <- which(numeric_row == max_val)
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	853	winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	854	winner_values[i] <- max_val
				855
				856	unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
				857	if (length(unique_vals) >= 2) {
				858	runner_val <- unique_vals[2]
				859	runner_idx <- which(numeric_row == runner_val)
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	860	runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	861	runner_values[i] <- runner_val
				862	}
				863
				864	min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
				865	min_idx <- which(numeric_row == min_val)
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	866	loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	867	loser_values[i] <- min_val
				868
				869	if (is.finite(max_val) && is.finite(min_val)) {
				870	max_deltas[i] <- max_val - min_val
				871	}
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	872	}
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	873	}
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	874
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	875	comparison[, value_cols] <- score_matrix
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	876	comparison[[winner_label_col]] <- winner_labels
				877	comparison[[winner_value_col]] <- winner_values
				878	comparison[[runner_label_col]] <- runner_labels
				879	comparison[[runner_value_col]] <- runner_values
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	880	comparison[[loser_label_col]] <- loser_labels
				881	comparison[[loser_value_col]] <- loser_values
				882	comparison[[max_delta_col]] <- max_deltas
Marc Kupietz	5e35d7a	2025-10-17 21:21:22 +0200	[diff] [blame]	883	}
				884
Marc Kupietz	28a2984	2025-10-18 12:25:09 +0200	[diff] [blame]	885	for (col in score_cols) {
				886	rank_cols <- paste0("rank_", labels, "_", col)
				887	existing <- rank_cols %in% names(comparison)
				888	if (!any(existing)) {
				889	next
				890	}
				891	rank_cols <- rank_cols[existing]
				892	safe_labels <- labels[existing]
				893	rank_values <- comparison[, rank_cols, drop = FALSE]
				894
				895	winner_rank_label_col <- paste0("winner_rank_", col)
				896	winner_rank_value_col <- paste0("winner_rank_", col, "_value")
				897	runner_rank_label_col <- paste0("runner_up_rank_", col)
				898	runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
				899	loser_rank_label_col <- paste0("loser_rank_", col)
				900	loser_rank_value_col <- paste0("loser_rank_", col, "_value")
				901	max_delta_rank_col <- paste0("max_delta_rank_", col)
				902
				903	if (nrow(rank_values) == 0) {
				904	comparison[[winner_rank_label_col]] <- character(0)
				905	comparison[[winner_rank_value_col]] <- numeric(0)
				906	comparison[[runner_rank_label_col]] <- character(0)
				907	comparison[[runner_rank_value_col]] <- numeric(0)
				908	comparison[[loser_rank_label_col]] <- character(0)
				909	comparison[[loser_rank_value_col]] <- numeric(0)
				910	comparison[[max_delta_rank_col]] <- numeric(0)
				911	next
				912	}
				913
				914	rank_matrix <- as.matrix(rank_values)
				915	storage.mode(rank_matrix) <- "numeric"
				916
				917	n_rows <- nrow(rank_matrix)
				918	winner_labels <- rep(NA_character_, n_rows)
				919	winner_values <- rep(NA_real_, n_rows)
				920	runner_labels <- rep(NA_character_, n_rows)
				921	runner_values <- rep(NA_real_, n_rows)
				922	loser_labels <- rep(NA_character_, n_rows)
				923	loser_values <- rep(NA_real_, n_rows)
				924	max_deltas <- rep(NA_real_, n_rows)
				925
				926	for (i in seq_len(n_rows)) {
				927	numeric_row <- as.numeric(rank_matrix[i, ])
				928	if (all(is.na(numeric_row))) {
				929	next
				930	}
				931
				932	if (length(rank_cols) > 0) {
				933	replacement_vec <- rank_replacements[rank_cols]
				934	replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
				935	missing_idx <- which(is.na(numeric_row))
				936	if (length(missing_idx) > 0) {
				937	numeric_row[missing_idx] <- replacement_vec[missing_idx]
				938	}
				939	}
				940
				941	valid_idx <- seq_along(numeric_row)
				942	valid_values <- numeric_row[valid_idx]
				943	min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
				944	min_positions <- valid_idx[which(valid_values == min_val)]
				945	winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
				946	winner_values[i] <- min_val
				947
				948	ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
				949	if (length(ordered_vals) >= 2) {
				950	runner_val <- ordered_vals[2]
				951	runner_positions <- valid_idx[which(valid_values == runner_val)]
				952	runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
				953	runner_values[i] <- runner_val
				954	}
				955
				956	max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
				957	max_positions <- valid_idx[which(valid_values == max_val)]
				958	loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
				959	loser_values[i] <- max_val
				960
				961	if (is.finite(max_val) && is.finite(min_val)) {
				962	max_deltas[i] <- max_val - min_val
				963	}
				964	}
				965
				966	comparison[[winner_rank_label_col]] <- winner_labels
				967	comparison[[winner_rank_value_col]] <- winner_values
				968	comparison[[runner_rank_label_col]] <- runner_labels
				969	comparison[[runner_rank_value_col]] <- runner_values
				970	comparison[[loser_rank_label_col]] <- loser_labels
				971	comparison[[loser_rank_value_col]] <- loser_values
				972	comparison[[max_delta_rank_col]] <- max_deltas
				973	}
				974
Marc Kupietz	130a2a2	2025-10-18 16:09:23 +0200	[diff] [blame]	975	for (col in score_cols) {
				976	pct_cols <- paste0("percentile_rank_", labels, "_", col)
				977	existing <- pct_cols %in% names(comparison)
				978	if (!any(existing)) {
				979	next
				980	}
				981	pct_cols <- pct_cols[existing]
				982	safe_labels <- labels[existing]
				983	pct_values <- comparison[, pct_cols, drop = FALSE]
				984
				985	winner_pct_label_col <- paste0("winner_percentile_rank_", col)
				986	winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
				987	runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
				988	runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
				989	loser_pct_label_col <- paste0("loser_percentile_rank_", col)
				990	loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
				991	max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
				992
				993	if (nrow(pct_values) == 0) {
				994	comparison[[winner_pct_label_col]] <- character(0)
				995	comparison[[winner_pct_value_col]] <- numeric(0)
				996	comparison[[runner_pct_label_col]] <- character(0)
				997	comparison[[runner_pct_value_col]] <- numeric(0)
				998	comparison[[loser_pct_label_col]] <- character(0)
				999	comparison[[loser_pct_value_col]] <- numeric(0)
				1000	comparison[[max_delta_pct_col]] <- numeric(0)
				1001	next
				1002	}
				1003
				1004	pct_matrix <- as.matrix(pct_values)
				1005	storage.mode(pct_matrix) <- "numeric"
				1006
				1007	n_rows <- nrow(pct_matrix)
				1008	winner_labels <- rep(NA_character_, n_rows)
				1009	winner_values <- rep(NA_real_, n_rows)
				1010	runner_labels <- rep(NA_character_, n_rows)
				1011	runner_values <- rep(NA_real_, n_rows)
				1012	loser_labels <- rep(NA_character_, n_rows)
				1013	loser_values <- rep(NA_real_, n_rows)
				1014	max_deltas <- rep(NA_real_, n_rows)
				1015
				1016	if (n_rows > 0) {
				1017	for (i in seq_len(n_rows)) {
				1018	numeric_row <- as.numeric(pct_matrix[i, ])
				1019	if (all(is.na(numeric_row))) {
				1020	next
				1021	}
				1022
				1023	if (any(is.na(numeric_row))) {
				1024	numeric_row[is.na(numeric_row)] <- 0
				1025	}
				1026	pct_matrix[i, ] <- numeric_row
				1027
				1028	max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
				1029	max_idx <- which(numeric_row == max_val)
				1030	winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
				1031	winner_values[i] <- max_val
				1032
				1033	unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
				1034	if (length(unique_vals) >= 2) {
				1035	runner_val <- unique_vals[2]
				1036	runner_idx <- which(numeric_row == runner_val)
				1037	runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
				1038	runner_values[i] <- runner_val
				1039	}
				1040
				1041	min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
				1042	min_idx <- which(numeric_row == min_val)
				1043	loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
				1044	loser_values[i] <- min_val
				1045
				1046	if (is.finite(max_val) && is.finite(min_val)) {
				1047	max_deltas[i] <- max_val - min_val
				1048	}
				1049	}
				1050	}
				1051
				1052	comparison[, pct_cols] <- pct_matrix
				1053	comparison[[winner_pct_label_col]] <- winner_labels
				1054	comparison[[winner_pct_value_col]] <- winner_values
				1055	comparison[[runner_pct_label_col]] <- runner_labels
				1056	comparison[[runner_pct_value_col]] <- runner_values
				1057	comparison[[loser_pct_label_col]] <- loser_labels
				1058	comparison[[loser_pct_value_col]] <- loser_values
				1059	comparison[[max_delta_pct_col]] <- max_deltas
				1060	}
				1061
Marc Kupietz	c4540a2	2025-10-14 17:39:53 +0200	[diff] [blame]	1062	dplyr::left_join(result, comparison, by = c("node", "collocate"))
				1063	}
				1064
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1065	#' @importFrom magrittr debug_pipe
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1066	#' @importFrom stringr str_detect
				1067	#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
				1068	#'
				1069	matches2FreqTable <- function(matches,
				1070	index = 0,
				1071	minOccur = 5,
				1072	leftContextSize = 5,
				1073	rightContextSize = 5,
				1074	ignoreCollocateCase = FALSE,
				1075	stopwords = c(),
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1076	collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1077	oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
				1078	verbose = TRUE) {
				1079	word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				1080	frequency <- NULL
				1081
				1082	if (nrow(matches) < 1) {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1083	dplyr::tibble(word = c(), frequency = c())
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1084	} else if (index == 0) {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1085	if (!"tokens" %in% colnames(matches) \|\| !is.list(matches$tokens)) {
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1086	log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1087	return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
				1088	ignoreCollocateCase = ignoreCollocateCase,
				1089	stopwords = stopwords, oldTable = oldTable, verbose = verbose
				1090	))
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1091	}
				1092	log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietz	a25fbd9	2025-10-14 17:38:09 +0200	[diff] [blame]	1093	for (i in seq_len(nrow(matches))) {
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1094	oldTable <- matches2FreqTable(
				1095	matches,
				1096	i,
				1097	leftContextSize = leftContextSize,
				1098	rightContextSize = rightContextSize,
				1099	collocateFilterRegex = collocateFilterRegex,
				1100	oldTable = oldTable,
				1101	stopwords = stopwords
				1102	)
				1103	}
				1104	log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1105	oldTable \|>
Marc Kupietz	4cd066d	2025-02-28 15:48:23 +0100	[diff] [blame]	1106	group_by(word) \|>
				1107	mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) \|>
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1108	summarise(frequency = sum(frequency), .groups = "drop") \|>
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1109	arrange(desc(frequency))
				1110	} else {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1111	stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1112
				1113	left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
				1114
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1115	# cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1116
				1117	right <- head(unlist(matches$tokens$right[index]), rightContextSize)
				1118
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1119	# cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1120
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1121	if (length(left) + length(right) == 0) {
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1122	oldTable
				1123	} else {
Marc Kupietz	4cd066d	2025-02-28 15:48:23 +0100	[diff] [blame]	1124	table(c(left, right)) \|>
				1125	dplyr::as_tibble(.name_repair = "minimal") \|>
				1126	dplyr::rename(word = 1, frequency = 2) \|>
				1127	dplyr::filter(str_detect(word, collocateFilterRegex)) \|>
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1128	dplyr::anti_join(stopwordsTable, by = "word") \|>
Marc Kupietz	2b17b21	2023-08-27 17:47:26 +0200	[diff] [blame]	1129	dplyr::bind_rows(oldTable)
				1130	}
				1131	}
				1132	}
				1133
				1134	#' @importFrom magrittr debug_pipe
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1135	#' @importFrom stringr str_match str_split str_detect
				1136	#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
				1137	#'
				1138	snippet2FreqTable <- function(snippet,
				1139	minOccur = 5,
				1140	leftContextSize = 5,
				1141	rightContextSize = 5,
				1142	ignoreCollocateCase = FALSE,
				1143	stopwords = c(),
				1144	tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+\|")",
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1145	collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1146	oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
				1147	verbose = TRUE) {
				1148	word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
				1149	frequency <- NULL
				1150
				1151	if (length(snippet) < 1) {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1152	dplyr::tibble(word = c(), frequency = c())
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1153	} else if (length(snippet) > 1) {
Marc Kupietz	a47d150	2023-04-18 15:26:47 +0200	[diff] [blame]	1154	log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1155	for (s in snippet) {
				1156	oldTable <- snippet2FreqTable(
				1157	s,
				1158	leftContextSize = leftContextSize,
				1159	rightContextSize = rightContextSize,
Marc Kupietz	47d0d2b	2021-12-19 16:38:52 +0100	[diff] [blame]	1160	collocateFilterRegex = collocateFilterRegex,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1161	oldTable = oldTable,
				1162	stopwords = stopwords
				1163	)
				1164	}
Marc Kupietz	a47d150	2023-04-18 15:26:47 +0200	[diff] [blame]	1165	log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1166	oldTable \|>
Marc Kupietz	4cd066d	2025-02-28 15:48:23 +0100	[diff] [blame]	1167	group_by(word) \|>
				1168	mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) \|>
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1169	summarise(frequency = sum(frequency), .groups = "drop") \|>
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1170	arrange(desc(frequency))
				1171	} else {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1172	stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1173	match <-
				1174	str_match(
				1175	snippet,
				1176	'<span class="context-left">(<span class="more"></span>)?(.[^ ]) </span><span class="match"><mark>.</mark></span><span class="context-right"> ([^<]*)'
				1177	)
				1178
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1179	left <- if (leftContextSize > 0) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1180	tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1181	} else {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1182	""
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1183	}
				1184	# cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1185
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1186	right <- if (rightContextSize > 0) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1187	head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1188	} else {
				1189	""
				1190	}
				1191	# cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1192
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1193	if (is.na(left[1]) \|\| is.na(right[1]) \|\| length(left) + length(right) == 0) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1194	oldTable
				1195	} else {
Marc Kupietz	4cd066d	2025-02-28 15:48:23 +0100	[diff] [blame]	1196	table(c(left, right)) \|>
				1197	dplyr::as_tibble(.name_repair = "minimal") \|>
				1198	dplyr::rename(word = 1, frequency = 2) \|>
				1199	dplyr::filter(str_detect(word, collocateFilterRegex)) \|>
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1200	dplyr::anti_join(stopwordsTable, by = "word") \|>
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1201	dplyr::bind_rows(oldTable)
				1202	}
				1203	}
				1204	}
				1205
				1206	#' Preliminary synsemantic stopwords function
				1207	#'
				1208	#' @description
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	1209	#' `r lifecycle::badge("experimental")`
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1210	#'
				1211	#' Preliminary synsemantic stopwords function to be used in collocation analysis.
				1212	#'
				1213	#' @details
				1214	#' Currently only suitable for German. See stopwords package for other languages.
				1215	#'
				1216	#' @param ... future arguments for language detection
				1217	#'
				1218	#' @family collocation analysis functions
				1219	#' @return Vector of synsemantic stopwords.
				1220	#' @export
				1221	synsemanticStopwords <- function(...) {
Marc Kupietz	c79155b	2025-10-19 13:42:55 +0200	[diff] [blame]	1222	base <- c(
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1223	"der",
				1224	"die",
				1225	"und",
				1226	"in",
				1227	"den",
				1228	"von",
				1229	"mit",
				1230	"das",
				1231	"zu",
				1232	"im",
				1233	"ist",
				1234	"auf",
				1235	"sich",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1236	"des",
				1237	"dem",
				1238	"nicht",
				1239	"ein",
				1240	"eine",
				1241	"es",
				1242	"auch",
				1243	"an",
				1244	"als",
				1245	"am",
				1246	"aus",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1247	"bei",
				1248	"er",
				1249	"dass",
				1250	"sie",
				1251	"nach",
				1252	"um",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1253	"zum",
				1254	"noch",
				1255	"war",
				1256	"einen",
				1257	"einer",
				1258	"wie",
				1259	"einem",
				1260	"vor",
				1261	"bis",
				1262	"\u00fcber",
				1263	"so",
				1264	"aber",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1265	"diese",
Marc Kupietz	c79155b	2025-10-19 13:42:55 +0200	[diff] [blame]	1266	"oder"
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1267	)
Marc Kupietz	c79155b	2025-10-19 13:42:55 +0200	[diff] [blame]	1268
				1269	lower <- unique(tolower(base))
				1270	capitalized <- paste0(toupper(substr(lower, 1, 1)), substring(lower, 2))
				1271
				1272	unique(c(lower, capitalized))
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1273	}
				1274
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	1275
Marc Kupietz	76b0559	2021-12-19 16:26:15 +0100	[diff] [blame]	1276	# #' @export
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	1277	findExample <-
				1278	function(kco,
				1279	query,
				1280	vc = "",
				1281	matchOnly = TRUE) {
				1282	out <- character(length = length(query))
				1283
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1284	if (length(vc) < length(query)) {
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	1285	vc <- rep(vc, length(query))
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1286	}
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	1287
				1288	for (i in seq_along(query)) {
				1289	q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietz	b811ffb	2021-12-07 10:34:10 +0100	[diff] [blame]	1290	if (q@totalResults > 0) {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1291	q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietz	b811ffb	2021-12-07 10:34:10 +0100	[diff] [blame]	1292	example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1293	out[i] <- if (matchOnly) {
				1294	gsub(".<mark>(.+)</mark>.", "\\1", example)
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	1295	} else {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1296	stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	1297	}
Marc Kupietz	b811ffb	2021-12-07 10:34:10 +0100	[diff] [blame]	1298	} else {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1299	out[i] <- ""
Marc Kupietz	b811ffb	2021-12-07 10:34:10 +0100	[diff] [blame]	1300	}
Marc Kupietz	5a336b6	2021-11-27 17:51:35 +0100	[diff] [blame]	1301	}
				1302	out
				1303	}
				1304
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1305	collocatesQuery <-
				1306	function(kco,
				1307	query,
				1308	vc = "",
				1309	minOccur = 5,
				1310	leftContextSize = 5,
				1311	rightContextSize = 5,
				1312	searchHitsSampleLimit = 20000,
				1313	ignoreCollocateCase = FALSE,
				1314	stopwords = c(),
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	1315	collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1316	...) {
				1317	frequency <- NULL
				1318	q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1319	if (q@totalResults == 0) {
				1320	tibble(word = c(), frequency = c())
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1321	} else {
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1322	q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
				1323	matches2FreqTable(q@collectedMatches,
				1324	0,
				1325	minOccur = minOccur,
				1326	leftContextSize = leftContextSize,
				1327	rightContextSize = rightContextSize,
				1328	ignoreCollocateCase = ignoreCollocateCase,
				1329	stopwords = stopwords,
Marc Kupietz	b2862d4	2025-10-18 10:17:49 +0200	[diff] [blame]	1330	collocateFilterRegex = collocateFilterRegex,
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	1331	...,
				1332	verbose = kco@verbose
				1333	) \|>
Marc Kupietz	4cd066d	2025-02-28 15:48:23 +0100	[diff] [blame]	1334	mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) \|>
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1335	filter(frequency >= minOccur)
				1336	}
				1337	}