Add full collocation analysis (client-side-only implementation)
Resolves #2
Change-Id: Ib01d89a72b44ff06816b21532b7ea709a4e837b0
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
new file mode 100644
index 0000000..0df5bea
--- /dev/null
+++ b/R/collocationAnalysis.R
@@ -0,0 +1,318 @@
+setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
+
+#' Collocation analysis
+#'
+#' @aliases collocationAnalysis
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Performs a collocation analysis for the given node (or query)
+#' in the given virtual corpus.
+#'
+#' @details
+#' The collocation analysis is currently implemented on the client side, as some of the
+#' required functionality is not yet provided by the KorAP backend. Chiefly for this
+#' reason it is very slow (several minutes, up to hours), but in return very flexible.
+#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex
+#' node queries, and look for expression-internal collocates using the focus function
+#' (see examples and demo).
+#'
+#' To increase speed at the cost of accuracy and possible false negatives, you can
+#' decrease \code{searchHitsSampleLimit} and/or \code{topCollocatesLimit} and/or set
+#' \code{exactFrequencies} to \code{FALSE} (see the last example below).
+#'
+#' Note that the analysis currently does not use the tokenization provided by the
+#' backend, i.e. by the corpus itself, but a rather makeshift client-side one. This can
+#' also lead to false negatives and to frequencies that differ from corresponding ones
+#' acquired via the web user interface.
+#'
+#' @family collocation analysis functions
+#'
+#' @param lemmatizeNodeQuery if TRUE, the node query will be lemmatized, i.e. \code{x} becomes \code{[tt/l=x]}
+#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
+#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
+#' @param searchHitsSampleLimit limit the size of the search hits sample
+#' @param stopwords vector of stopwords not to be considered as collocates
+#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from their frequencies in the search hits sample; otherwise retrieve exact co-occurrence frequencies
+#' @param seed seed for random page collecting order
+#' @param expand if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
+#' @param ... more arguments will be passed to \code{\link{collocationScoreQuery}}
+#' @inheritParams collocationScoreQuery,KorAPConnection-method
+#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
+#'
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
+#' @importFrom purrr pmap
+#' @importFrom tidyr expand_grid
+#'
+#' @examples
+#' \donttest{
+#' # Find top collocates of "Packung" inside and outside the sports domain.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
+#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
+#' dplyr::filter(logDice >= 5)
+#' }
+#'
+#' \donttest{
+#' # Identify the most prominent light verb construction with "in ... setzen".
+#' # Note that the use of the focus function currently requires exactFrequencies = FALSE.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
+#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
+#' }
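+#'
+#' \donttest{
+#' # Sketch of a faster but less exact analysis, as discussed in the details
+#' # section: a smaller hit sample and extrapolated frequencies trade accuracy,
+#' # and possibly recall, for speed.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationAnalysis("Packung", leftContextSize=1, rightContextSize=1,
+#'     searchHitsSampleLimit=1000, topCollocatesLimit=10, exactFrequencies=FALSE)
+#' }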
+#'
+#' @export
+setMethod("collocationAnalysis", "KorAPConnection",
+ function(kco,
+ node,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ topCollocatesLimit = 200,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
+ exactFrequencies = TRUE,
+ stopwords = RKorAPClient::synsemanticStopwords(),
+ seed = 7,
+ expand = length(vc) != length(node),
+ ...) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ word <- frequency <- NULL
+
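+    # Extrapolated frequencies cannot respect span boundaries, hence a
+    # non-empty withinSpan requires exact frequency counting.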
+    if (!exactFrequencies && !is.null(withinSpan) && !is.na(withinSpan) && nchar(withinSpan) > 0) {
+      stop(sprintf("A non-empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
+ }
+
+ if (lemmatizeNodeQuery) {
+ node <- lemmatizeWordQuery(node)
+ }
+
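+    # With multiple nodes and/or virtual corpora, recurse over each pair
+    # (or, with expand=TRUE, over the full cross product) and bind the
+    # individual result tibbles into one.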
+ if (length(node) > 1 || length(vc) > 1) {
+ grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
+ purrr::pmap(grid, function(node, vc, ...)
+ collocationAnalysis(kco,
+                              node = node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ topCollocatesLimit = topCollocatesLimit,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ exactFrequencies = exactFrequencies,
+ stopwords = stopwords,
+ seed = seed,
+ expand = expand,
+ ...) ) %>%
+ bind_rows()
+ } else {
+ set.seed(seed)
+ candidates <- collocatesQuery(
+ kco,
+ node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords,
+ ...
+ )
+
+ if (nrow(candidates) > 0) {
+ candidates <- candidates %>%
+ filter(frequency >= minOccur) %>%
+ head(topCollocatesLimit)
+ collocationScoreQuery(
+ kco,
+ node = node,
+ collocate = candidates$word,
+ vc = vc,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ observed = if (exactFrequencies) NA else candidates$frequency,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ ...
+ ) %>%
+ filter(.$O >= minOccur) %>%
+ dplyr::arrange(dplyr::desc(logDice))
+ } else {
+ tibble()
+ }
+ }
+ }
+)
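+
+# Minimal sketch of the helper called by lemmatizeNodeQuery above; it is not
+# defined elsewhere in this file, so this assumes exactly the mapping
+# documented for the parameter (x -> [tt/l=x]).
+lemmatizeWordQuery <- function(w) {
+  paste0('[tt/l=', w, ']')
+}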
+
+#' @importFrom magrittr debug_pipe
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr as_tibble tibble rename filter anti_join bind_rows case_when group_by mutate summarise arrange desc
+#'
+snippet2FreqTable <- function(snippet,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+                              tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
+ oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
+ verbose = TRUE) {
+ word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ frequency <- NULL
+
+ if (length(snippet) < 1) {
+ dplyr::tibble(word=c(), frequency=c())
+ } else if (length(snippet) > 1) {
+    log.info(verbose, paste("Joining", length(snippet), "kwics"))
+ for (s in snippet) {
+ oldTable <- snippet2FreqTable(
+ s,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ oldTable = oldTable,
+ stopwords = stopwords
+ )
+ }
+ log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens"))
+    oldTable %>%
+      # lower-case first (if requested), then aggregate, so that case variants are merged
+      mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
+      group_by(word) %>%
+      summarise(frequency = sum(frequency), .groups = "drop") %>%
+      arrange(desc(frequency))
+ } else {
+ stopwordsTable <- dplyr::tibble(word=stopwords)
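+    # Extract contexts from the KWIC snippet HTML: match column 3 (capture
+    # group 2) holds the left context, column 4 (capture group 3) the right.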
+ match <-
+ str_match(
+ snippet,
+ '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
+ )
+
+ left <- if(leftContextSize > 0)
+ tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
+ else
+ ""
+
+ right <- if(rightContextSize > 0)
+ head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
+ else
+ ""
+
+    if (any(is.na(left)) || any(is.na(right)) || length(left) + length(right) == 0) {
+ oldTable
+ } else {
+ table(c(left, right)) %>%
+ dplyr::as_tibble(.name_repair = "minimal") %>%
+ dplyr::rename(word = 1, frequency = 2) %>%
+ dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
+ dplyr::anti_join(stopwordsTable, by="word") %>%
+ dplyr::bind_rows(oldTable)
+ }
+ }
+}
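+
+# Illustration (not run): tokenizing a single KWIC snippet. The snippet HTML
+# below is a made-up example mirroring the format matched above, not actual
+# backend output.
+#
+# snippet2FreqTable(
+#   paste0('<span class="context-left">was in der </span>',
+#          '<span class="match"><mark>Packung</mark></span>',
+#          '<span class="context-right"> enthalten ist</span>'),
+#   leftContextSize = 1, rightContextSize = 1, stopwords = c("der"))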
+
+#' Preliminary synsemantic stopwords function
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Preliminary synsemantic stopwords function to be used in collocation analysis.
+#'
+#' @details
+#' Currently only suitable for German. See the \code{stopwords} package for other languages.
+#'
+#' @param ... future arguments for language detection
+#'
+#' @family collocation analysis functions
+#' @return Vector of synsemantic stopwords.
+#' @export
+synsemanticStopwords <- function(...) {
+ res <- c(
+ "der",
+ "die",
+ "und",
+ "in",
+ "den",
+ "von",
+ "mit",
+ "das",
+ "zu",
+ "im",
+ "ist",
+ "auf",
+ "sich",
+ "Die",
+ "des",
+ "dem",
+ "nicht",
+ "ein",
+ "eine",
+ "es",
+ "auch",
+ "an",
+ "als",
+ "am",
+ "aus",
+ "Der",
+ "bei",
+ "er",
+ "dass",
+ "sie",
+ "nach",
+ "um",
+ "Das",
+ "zum",
+ "noch",
+ "war",
+ "einen",
+ "einer",
+ "wie",
+ "einem",
+ "vor",
+ "bis",
+ "\u00fcber",
+ "so",
+ "aber",
+ "Eine",
+ "diese",
+ "Diese",
+ "oder"
+ )
+ return(res)
+}
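+
+# Usage sketch: the default stopword list can be extended with custom entries
+# when calling collocationAnalysis (kco and the added words here are arbitrary
+# examples):
+#
+# collocationAnalysis(kco, "Packung",
+#   stopwords = c(RKorAPClient::synsemanticStopwords(), "dpa", "Foto"))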
+
+collocatesQuery <-
+ function(kco,
+ query,
+ vc = "",
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+ ...) {
+ frequency <- NULL
+    q <- corpusQuery(kco, query, vc, metadataOnly = FALSE, ...)
+ if(q@totalResults == 0) {
+ tibble(word=c(), frequency=c())
+ } else {
+ q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
+ snippet2FreqTable((q@collectedMatches)$snippet,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords,
+ verbose = kco@verbose) %>%
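+      # Extrapolate frequencies observed in the sample to the whole virtual
+      # corpus: scale by totalResults over the number of hits actually fetched.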
+ mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
+ filter(frequency >= minOccur)
+ }
+ }
+
+