Add full collocation analysis (client-side-only implementation)

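A typical call runs entirely from R, for example (the query and
parameter values below are illustrative):

    new("KorAPConnection", verbose = TRUE) %>%
      collocationAnalysis("Ameisenplage", vc = "textClass=sport",
                          leftContextSize = 1, rightContextSize = 1,
                          topCollocatesLimit = 20)
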
Resolves #2

Change-Id: Ib01d89a72b44ff06816b21532b7ea709a4e837b0
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 0b07106..5ef4846 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -66,15 +66,23 @@
 setGeneric("fetchNext", function(kqo, ...)  standardGeneric("fetchNext") )
 setGeneric("fetchRest", function(kqo, ...)  standardGeneric("fetchRest") )
 setGeneric("frequencyQuery", function(kco, ...)  standardGeneric("frequencyQuery") )
-setGeneric("collocationScoreQuery", function(kco, ...)  standardGeneric("collocationScoreQuery") )
 
 maxResultsPerPage <- 50
 
 ## quiets concerns of R CMD check re: the .'s that appear in pipelines
 if(getRversion() >= "2.15.1")  utils::globalVariables(c("."))
 
+#' Corpus query
+#'
 #' \bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server
 #'
+#' @rdname KorAPQuery-class
+#' @aliases corpusQuery
+#'
+#' @importFrom urltools url_encode
+#' @importFrom purrr pmap
+#' @importFrom dplyr bind_rows
+#'
 #' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
 #' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
 #' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
@@ -125,7 +133,6 @@
 #' @references
 #' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
 #'
-#' @aliases corpusQuery
 #' @export
 setMethod("corpusQuery", "KorAPConnection",
           function(kco,
@@ -151,25 +158,19 @@
                    expand = length(vc) != length(query),
           as.df = FALSE) {
   if (length(query) > 1 || length(vc) > 1) {
-
-    grid <- {
-      if (expand)
-        expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc) }
-        return(
-             do.call(rbind,
-                     Map(function(q, cq) corpusQuery(kco, query=q, vc=cq, ql=ql,
-                                                     verbose=verbose, as.df = TRUE), grid$query, grid$vc)) %>%
-               remove_rownames()
-           )
-    } else {
+    grid <- if (expand) expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc)
+    purrr::pmap(grid, function(query, vc, ...)
+      corpusQuery(kco, query=query, vc=vc, ql=ql, verbose=verbose, as.df = TRUE)) %>%
+      bind_rows()
+  } else {
       contentFields <- c("snippet")
       if (metadataOnly) {
         fields <- fields[!fields %in% contentFields]
       }
       request <-
         paste0('?q=',
-               URLencode(enc2utf8(query), reserved = TRUE),
-               if (vc != '') paste0('&cq=', URLencode(enc2utf8(vc), reserved = TRUE)) else '', '&ql=', ql)
+               url_encode(enc2utf8(query)),
+               ifelse(vc != '', paste0('&cq=', url_encode(enc2utf8(vc))), ''), '&ql=', ql)
       webUIRequestUrl <- paste0(kco@KorAPUrl, request)
       requestUrl <- paste0(
         kco@apiUrl,
@@ -182,10 +183,11 @@
       log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
                  "")
       res = apiCall(kco, paste0(requestUrl, '&count=0'))
+      log.info(verbose, ": ", res$meta$totalResults, " hits")
       if(!is.null(res$meta$cached))
         log.info(verbose, " [cached]\n")
       else
-        log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
+        log.info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
       if (as.df)
         data.frame(
           query = query,
@@ -218,6 +220,7 @@
 #' @param offset start offset for query results to fetch
 #' @param maxFetch maximum number of query results to fetch
 #' @param verbose print progress information if true
+#' @param randomizePageOrder fetch result pages in pseudo-random order if true. Use \code{\link{set.seed}} to set the seed for reproducible results.
 #' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
 #'
 #' @examples
@@ -232,7 +235,11 @@
 #' @rdname KorAPQuery-class
 #' @importFrom dplyr rowwise bind_rows select summarise n
 #' @export
-setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
+setMethod("fetchNext", "KorAPQuery", function(kqo,
+                                              offset = kqo@nextStartIndex,
+                                              maxFetch = maxResultsPerPage,
+                                              verbose = kqo@korapConnection@verbose,
+                                              randomizePageOrder = FALSE) {
   if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
     return(kqo)
   }
@@ -242,8 +249,14 @@
   pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
   collectedMatches <- kqo@collectedMatches
 
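+  # Pre-compute a pseudo-random (set.seed-able) order of result pages as 0-based page indices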
+  if (randomizePageOrder) {
+    pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
+  }
+
   repeat {
-    query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', offset + results, '&cutoff=true')
+    page <- length(collectedMatches[, 1]) %/% maxResultsPerPage + 1
+    currentOffset <- ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
+    query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage), '&offset=', currentOffset, '&cutoff=true')
     res <- apiCall(kqo@korapConnection, query)
     if (length(res$matches) == 0) {
       break
@@ -270,11 +283,22 @@
       collectedMatches <- rbind(collectedMatches, currentMatches)
     }
     if (verbose) {
-      cat(paste0("Retrieved page ", page, "/", ceiling((kqo@totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
+      cat(paste0(
+        "Retrieved page ",
+        ceiling(length(collectedMatches[, 1]) / res$meta$itemsPerPage),
+        "/",
+        if (!is.na(maxFetch) && maxFetch < kqo@totalResults)
+          sprintf("%d (%d)", ceiling(maxFetch / res$meta$itemsPerPage), ceiling(kqo@totalResults / res$meta$itemsPerPage))
+        else
+          sprintf("%d", ceiling(kqo@totalResults / res$meta$itemsPerPage)),
+        ' in ',
+        res$meta$benchmark,
+        '\n'
+      ))
     }
     page <- page + 1
     results <- results + res$meta$itemsPerPage
-    if (offset + results >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
+    if (length(collectedMatches[,1]) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
       break
     }
   }
@@ -305,8 +329,8 @@
 #' @aliases fetchAll
 #' @rdname KorAPQuery-class
 #' @export
-setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
-  return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
+setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+  return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
 })
 
 #' Fetches the remaining results of a KorAP query.
@@ -320,8 +344,8 @@
 #' @aliases fetchRest
 #' @rdname KorAPQuery-class
 #' @export
-setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
-  return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
+setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+  return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
 })
 
 #' Query relative frequency of search term(s)
@@ -357,6 +381,56 @@
       ci(conf.level = conf.level)
 })
 
+
+#' buildWebUIRequestUrl
+#'
+#' \bold{\code{buildWebUIRequestUrl}} builds a KorAP web UI request URL from the given query parameters.
+#'
+#' @rdname KorAPQuery-class
+#' @importFrom urltools url_encode
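+#'
+#' @examples
+#' \donttest{
+#' # A minimal sketch; the query and virtual corpus are illustrative:
+#' kco <- new("KorAPConnection")
+#' buildWebUIRequestUrl(kco, query = "Ameisenplage", vc = "pubDate since 2014")
+#' }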
+#' @export
+buildWebUIRequestUrl <- function(kco,
+                                 query = if (missing(KorAPUrl))
+                                   stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
+                                 else
+                                   httr::parse_url(KorAPUrl)$query$q,
+                                 vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
+                                 KorAPUrl,
+                                 metadataOnly = TRUE,
+                                 ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
+                                 fields = c(
+                                   "corpusSigle",
+                                   "textSigle",
+                                   "pubDate",
+                                   "pubPlace",
+                                   "availability",
+                                   "textClass",
+                                   "snippet"
+                                 ),
+                                 accessRewriteFatal = TRUE) {
+  request <-
+    paste0(
+      '?q=',
+      urltools::url_encode(enc2utf8(as.character(query))),
+      ifelse(vc != '',
+        paste0('&cq=', urltools::url_encode(enc2utf8(vc))),
+        ''),
+      '&ql=',
+      ql
+    )
+  webUIRequestUrl <- paste0(kco@KorAPUrl, request)
+  requestUrl <- paste0(
+    kco@apiUrl,
+    'search',
+    request,
+    '&fields=',
+    paste(fields, collapse = ","),
+    if (metadataOnly)
+      '&access-rewrite-disabled=true'
+    else
+      ''
+  )
+  webUIRequestUrl
+}
+
 #´ format()
 #' @rdname KorAPQuery-class
 #' @param x KorAPQuery object
@@ -388,118 +462,3 @@
   format(object)
 })
 
-
-
-lemmatizeWordQuery <- function(w) {
-  paste0('[tt/l=', w, ']')
-}
-
-#' Query frequencies of a node and a collocate and calculate collocation association scores
-#'
-#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
-#'
-#' @aliases collocationScoreQuery
-#' @rdname KorAPQuery-class
-#'
-#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
-#' @param node               target word
-#' @param collocate          collocate of target word
-#' @param vc                 string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
-#' @param lemmatizeNodeQuery      logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param leftContextSize    size of the left context window
-#' @param rightContextSize   size of the right context window
-#' @param scoreFunctions     named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
-#' @param smoothingConstant  smoothing constant will be added to all observed values
-#'
-#' @return tibble with query KorAP web request URL, all observed values and association scores
-#'
-#' @examples
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#'   collocationScoreQuery("Grund", "triftiger")
-#' }
-#'
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Grund", c("guter", "triftiger"),
-#'    scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
-#' }
-#'
-#' \donttest{
-#' library(highcharter)
-#' library(tidyr)
-#' new("KorAPConnection", verbose = TRUE) %>%
-#'   collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
-#'                         lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#'                          pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
-#'   hchart(type="spline", hcaes(label, score, group=measure)) %>%
-#'   hc_add_onclick_korap_search()
-#' }
-#'
-#' @importFrom tidyr pivot_longer
-#' @export
-setMethod("collocationScoreQuery", "KorAPConnection",
-          function(kco,
-                   node,
-                   collocate,
-                   vc = "",
-                   lemmatizeNodeQuery = FALSE,
-                   lemmatizeCollocateQuery = FALSE,
-                   leftContextSize = 5,
-                   rightContextSize = 5,
-                   scoreFunctions = defaultAssociationScoreFunctions(),
-                   smoothingConstant = .5
-                   ) {
-            # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
-            O1 <- O2 <- O <- N <- E <- w <- 0
-
-            if (leftContextSize <= 0 && rightContextSize <= 0) {
-              stop("At least one of leftContextSize and rightContextSize must be > 0",
-                   call. = FALSE)
-            }
-
-            if (lemmatizeNodeQuery) {
-              node <- lemmatizeWordQuery(node)
-            }
-
-            if (lemmatizeCollocateQuery) {
-              collocate <- lemmatizeWordQuery(collocate)
-            }
-
-            query <- ""
-
-            if (leftContextSize > 0) {
-              query <-
-                paste0(collocate,
-                       if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
-                       node,
-                       if (rightContextSize > 0)  " | ")
-            }
-
-            if (rightContextSize > 0) {
-              query <-
-                paste0(query, node,
-                       if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
-            }
-
-
-            tibble(
-              node = node,
-              collocate = collocate,
-              label = queryStringToLabel(vc),
-              vc = vc,
-              webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
-              w = leftContextSize + rightContextSize,
-              leftContextSize,
-              rightContextSize,
-              N  = frequencyQuery(kco, node, vc)$total + smoothingConstant,
-              O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
-              O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
-              O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
-              E = w * as.double(O1) * O2 / N
-            ) %>%
-              mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
-
-          })
diff --git a/R/association-scores.R b/R/association-scores.R
index 31ab994..fccba98 100644
--- a/R/association-scores.R
+++ b/R/association-scores.R
@@ -9,6 +9,7 @@
 #'
 #' @return              association score
 #' @name association-score-functions
+#'
 #' @description
 #' Functions to calculate different collocation association scores between
 #' a node (target word) and words in a window around the it.
@@ -18,6 +19,8 @@
 
 #' @rdname association-score-functions
 #'
+#' @family collocation analysis functions
+#'
 #' @export
 #'
 #' @examples
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
new file mode 100644
index 0000000..0df5bea
--- /dev/null
+++ b/R/collocationAnalysis.R
@@ -0,0 +1,318 @@
+setGeneric("collocationAnalysis", function(kco, ...)  standardGeneric("collocationAnalysis") )
+
+#' Collocation analysis
+#'
+#' @aliases collocationAnalysis
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Performs a collocation analysis for the given node (or query)
+#' in the given virtual corpus.
+#'
+#' @details
+#' The collocation analysis is currently implemented on the client side, as some of the
+#' required functionality is not yet provided by the KorAP backend. Mainly for this reason
+#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
+#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
+#' and look for expression-internal collocates using the focus function (see examples and demo).
+#'
+#' To increase speed at the cost of accuracy and possible false negatives, you can decrease
+#' \code{searchHitsSampleLimit} and/or \code{topCollocatesLimit} and/or set
+#' \code{exactFrequencies} to \code{FALSE} (see the last example below).
+#'
+#' Note that the tokenization currently used is not the one provided by the backend, i.e. by the
+#' corpus itself, but a makeshift one. This can also lead to false negatives and to frequencies
+#' that differ from corresponding ones acquired via the web user interface.
+#'
+#' @family collocation analysis functions
+#'
+#' @param lemmatizeNodeQuery     if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]
+#' @param minOccur               minimum absolute number of observed co-occurrences to consider a collocate candidate
+#' @param topCollocatesLimit     limit analysis to the n most frequent collocates in the search hits sample
+#' @param searchHitsSampleLimit  limit the size of the search hits sample
+#' @param stopwords              vector of stopwords not to be considered as collocates
+#' @param exactFrequencies       if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
+#' @param seed                   seed for the random order in which search result pages are fetched
+#' @param expand                 if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
+#' @param ...                    more arguments will be passed to \code{\link{collocationScoreQuery}}
+#' @inheritParams collocationScoreQuery,KorAPConnection-method
+#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
+#'
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
+#' @importFrom purrr pmap
+#' @importFrom tidyr expand_grid
+#'
+#' @examples
+#' \donttest{
+#'  # Find top collocates of "Packung" inside and outside the sports domain.
+#'  new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
+#'                       leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
+#'   dplyr::filter(logDice >= 5)
+#' }
+#'
+#' \donttest{
+#' # Identify the most prominent light verb construction with "in ... setzen".
+#' # Note that the use of the focus function currently requires exactFrequencies = FALSE.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
+#'     leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
+#' }
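+#'
+#' \donttest{
+#' # Trade accuracy for speed (see Details): sample fewer hits, consider fewer collocate
+#' # candidates, and extrapolate frequencies instead of counting them exactly.
+#' # The query and limits below are illustrative.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationAnalysis("Ameisenplage", leftContextSize=2, rightContextSize=2,
+#'                       searchHitsSampleLimit=1000, topCollocatesLimit=10,
+#'                       exactFrequencies=FALSE)
+#' }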
+#'
+#' @export
+setMethod("collocationAnalysis", "KorAPConnection",
+          function(kco,
+                   node,
+                   vc = "",
+                   lemmatizeNodeQuery = FALSE,
+                   minOccur = 5,
+                   leftContextSize = 5,
+                   rightContextSize = 5,
+                   topCollocatesLimit = 200,
+                   searchHitsSampleLimit = 20000,
+                   ignoreCollocateCase = FALSE,
+                   withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
+                   exactFrequencies = TRUE,
+                   stopwords = RKorAPClient::synsemanticStopwords(),
+                   seed = 7,
+                   expand = length(vc) != length(node),
+                   ...) {
+            # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+            word <- frequency <- NULL
+
+            if (!exactFrequencies && !is.null(withinSpan) && !is.na(withinSpan) && nchar(withinSpan) > 0) {
+              stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
+            }
+
+            if (lemmatizeNodeQuery) {
+              node <- lemmatizeWordQuery(node)
+            }
+
+            if (length(node) > 1 || length(vc) > 1) {
+              grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
+              purrr::pmap(grid, function(node, vc, ...)
+                        collocationAnalysis(kco,
+                                            node = node,
+                                            vc = vc,
+                                            minOccur = minOccur,
+                                            leftContextSize = leftContextSize,
+                                            rightContextSize = rightContextSize,
+                                            topCollocatesLimit = topCollocatesLimit,
+                                            searchHitsSampleLimit = searchHitsSampleLimit,
+                                            ignoreCollocateCase = ignoreCollocateCase,
+                                            withinSpan = withinSpan,
+                                            exactFrequencies = exactFrequencies,
+                                            stopwords = stopwords,
+                                            seed = seed,
+                                            expand = expand,
+                                            ...) ) %>%
+                bind_rows()
+            } else {
+              set.seed(seed)
+              candidates <- collocatesQuery(
+                kco,
+                node,
+                vc = vc,
+                minOccur = minOccur,
+                leftContextSize = leftContextSize,
+                rightContextSize = rightContextSize,
+                searchHitsSampleLimit = searchHitsSampleLimit,
+                ignoreCollocateCase = ignoreCollocateCase,
+                stopwords = stopwords,
+                ...
+              )
+
+              if (nrow(candidates) > 0) {
+                candidates <- candidates %>%
+                  filter(frequency >= minOccur) %>%
+                  head(topCollocatesLimit)
+                collocationScoreQuery(
+                  kco,
+                  node = node,
+                  collocate = candidates$word,
+                  vc = vc,
+                  leftContextSize = leftContextSize,
+                  rightContextSize = rightContextSize,
+                  observed = if (exactFrequencies) NA else candidates$frequency,
+                  ignoreCollocateCase = ignoreCollocateCase,
+                  withinSpan = withinSpan,
+                  ...
+                ) %>%
+                  filter(.$O >= minOccur) %>%
+                  dplyr::arrange(dplyr::desc(logDice))
+              } else {
+                tibble()
+              }
+            }
+          }
+)
+
+#' @importFrom magrittr debug_pipe
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
+#'
+snippet2FreqTable <- function(snippet,
+                              minOccur = 5,
+                              leftContextSize = 5,
+                              rightContextSize = 5,
+                              ignoreCollocateCase = FALSE,
+                              stopwords = c(),
+                              tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
+                              oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
+                              verbose = TRUE) {
+  word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+  frequency <- NULL
+
+  if (length(snippet) < 1) {
+    dplyr::tibble(word=c(), frequency=c())
+  } else if (length(snippet) > 1) {
+    log.info(verbose, paste("Joining", length(snippet), "KWICs"))
+    for (s in snippet) {
+      oldTable <- snippet2FreqTable(
+        s,
+        leftContextSize = leftContextSize,
+        rightContextSize = rightContextSize,
+        oldTable = oldTable,
+        stopwords = stopwords
+      )
+    }
+    log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens"))
+    oldTable  %>%
+      group_by(word) %>%
+      mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
+      summarise(frequency=sum(frequency), .groups = "drop") %>%
+      arrange(desc(frequency))
+  } else {
+    stopwordsTable <- dplyr::tibble(word=stopwords)
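+    # Split the KWIC HTML snippet into left context, match, and right context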
+    match <-
+      str_match(
+        snippet,
+        '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
+      )
+
+    left <- if(leftContextSize > 0)
+      tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
+    else
+      ""
+#    cat(paste("left:", left, "\n", collapse=" "))
+
+    right <- if(rightContextSize > 0)
+      head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
+    else
+        ""
+#    cat(paste("right:", right, "\n", collapse=" "))
+
+    if (any(is.na(left)) || any(is.na(right)) || length(left) + length(right) == 0) {
+      oldTable
+    } else {
+      table(c(left, right)) %>%
+        dplyr::as_tibble(.name_repair = "minimal") %>%
+        dplyr::rename(word = 1, frequency = 2) %>%
+        dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
+        dplyr::anti_join(stopwordsTable, by="word")  %>%
+        dplyr::bind_rows(oldTable)
+    }
+  }
+}
+
+#' Preliminary synsemantic stopwords function
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Preliminary synsemantic stopwords function to be used in collocation analysis.
+#'
+#' @details
+#' Currently only suitable for German. See the \code{stopwords} package for other languages.
+#'
+#' @param ... future arguments for language detection
+#'
+#' @family collocation analysis functions
+#' @return Vector of synsemantic stopwords.
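+#'
+#' @examples
+#' # Inspect the list; it is the default stopword list of collocationAnalysis:
+#' head(synsemanticStopwords())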
+#' @export
+synsemanticStopwords <- function(...) {
+  res <- c(
+    "der",
+    "die",
+    "und",
+    "in",
+    "den",
+    "von",
+    "mit",
+    "das",
+    "zu",
+    "im",
+    "ist",
+    "auf",
+    "sich",
+    "Die",
+    "des",
+    "dem",
+    "nicht",
+    "ein",
+    "eine",
+    "es",
+    "auch",
+    "an",
+    "als",
+    "am",
+    "aus",
+    "Der",
+    "bei",
+    "er",
+    "dass",
+    "sie",
+    "nach",
+    "um",
+    "Das",
+    "zum",
+    "noch",
+    "war",
+    "einen",
+    "einer",
+    "wie",
+    "einem",
+    "vor",
+    "bis",
+    "\u00fcber",
+    "so",
+    "aber",
+    "Eine",
+    "diese",
+    "Diese",
+    "oder"
+  )
+  return(res)
+}
+
+collocatesQuery <-
+  function(kco,
+           query,
+           vc = "",
+           minOccur = 5,
+           leftContextSize = 5,
+           rightContextSize = 5,
+           searchHitsSampleLimit = 20000,
+           ignoreCollocateCase = FALSE,
+           stopwords = c(),
+           ...) {
+    frequency <- NULL
+    q <- corpusQuery(kco, query, vc, metadataOnly = FALSE, ...)
+    if(q@totalResults == 0) {
+      tibble(word=c(), frequency=c())
+    } else {
+      q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
+      snippet2FreqTable((q@collectedMatches)$snippet,
+                        minOccur = minOccur,
+                        leftContextSize = leftContextSize,
+                        rightContextSize = rightContextSize,
+                        ignoreCollocateCase = ignoreCollocateCase,
+                        stopwords = stopwords,
+                        verbose = kco@verbose) %>%
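+        # Extrapolate frequencies observed in the sample to the full set of search hits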
+        mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
+        filter(frequency >= minOccur)
+    }
+  }
+
+
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
new file mode 100644
index 0000000..8ec9bf2
--- /dev/null
+++ b/R/collocationScoreQuery.R
@@ -0,0 +1,175 @@
+setGeneric("collocationScoreQuery", function(kco, ...)  standardGeneric("collocationScoreQuery") )
+
+## quiets concerns of R CMD check re: the .'s that appear in pipelines
+if(getRversion() >= "2.15.1")  utils::globalVariables(c("."))
+
+
+#' Query frequencies of a node and a collocate and calculate collocation association scores
+#'
+#' @aliases collocationScoreQuery
+#'
+#' @description
+#' Computes various collocation association scores
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
+#'
+#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")})
+#' @param node               target word
+#' @param collocate          collocate of target word
+#' @param vc                 string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
+#' @param lemmatizeNodeQuery      logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param leftContextSize    size of the left context window
+#' @param rightContextSize   size of the right context window
+#' @param scoreFunctions     named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
+#' @param smoothingConstant  smoothing constant will be added to all observed values
+#' @param observed           if collocation frequencies are already known (or estimated from a sample) they can be passed as a vector here, otherwise: NA
+#' @param ignoreCollocateCase     logical, set to TRUE if collocate case should be ignored
+#' @param withinSpan         KorAP span specification for collocations to be searched within
+#'
+#' @return tibble with query KorAP web request URL, all observed values and association scores
+#'
+#' @family collocation analysis functions
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Grund", "triftiger")
+#' }
+#'
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", c("guter", "triftiger"),
+#'    scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+#' }
+#'
+#' \donttest{
+#' library(highcharter)
+#' library(tidyr)
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#'                         lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#'                          pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
+#'   hchart(type="spline", hcaes(label, score, group=measure)) %>%
+#'   hc_add_onclick_korap_search()
+#' }
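+#'
+#' \donttest{
+#' # If the co-occurrence frequency is already known (e.g. estimated from a sample),
+#' # it can be passed via the observed parameter; the value below is illustrative.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#'   collocationScoreQuery("Grund", "triftiger", observed = 4000)
+#' }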
+#'
+#' @importFrom tidyr pivot_longer
+#' @export
+setMethod("collocationScoreQuery", "KorAPConnection",
+          function(kco,
+                   node,
+                   collocate,
+                   vc = "",
+                   lemmatizeNodeQuery = FALSE,
+                   lemmatizeCollocateQuery = FALSE,
+                   leftContextSize = 5,
+                   rightContextSize = 5,
+                   scoreFunctions = defaultAssociationScoreFunctions(),
+                   smoothingConstant = .5,
+                   observed = NA,
+                   ignoreCollocateCase = FALSE,
+                   withinSpan = "base/s=s"
+          ) {
+            # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+            O1 <- O2 <- O <- N <- E <- w <- 0
+
+            query <- buildCollocationQuery(node,
+                                           collocate,
+                                           lemmatizeNodeQuery,
+                                           lemmatizeCollocateQuery,
+                                           leftContextSize,
+                                           rightContextSize,
+                                           ignoreCollocateCase,
+                                           withinSpan)
+
+            tibble(
+              node = node,
+              collocate = collocate,
+              label = queryStringToLabel(vc),
+              vc = vc,
+              webUIRequestUrl = if (is.na(observed[1]))
+                frequencyQuery(kco, query, vc)$webUIRequestUrl
+              else
+                buildWebUIRequestUrl(
+                  kco,
+                  buildCollocationQuery(
+                    node,
+                    collocate,
+                    lemmatizeNodeQuery,
+                    lemmatizeCollocateQuery,
+                    leftContextSize,
+                    rightContextSize,
+                    ignoreCollocateCase,
+                    withinSpan
+                  ),
+                  vc
+                ),
+              w = leftContextSize + rightContextSize,
+              leftContextSize,
+              rightContextSize,
+              N  = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+              O = as.double( if(is.na(observed[1])) frequencyQuery(kco, query, vc)$totalResults else observed) + smoothingConstant,
+              O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
+              O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+              E = w * as.double(O1) * O2 / N
+            ) %>%
+              mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
+
+          })
+
+buildCollocationQuery <- function(node,
+                                  collocate,
+                                  lemmatizeNodeQuery = FALSE,
+                                  lemmatizeCollocateQuery = FALSE,
+                                  leftContextSize = 5,
+                                  rightContextSize = 5,
+                                  ignoreCollocateCase = FALSE,
+                                  withinSpan = "base/s=s") {
+  if (leftContextSize <= 0 && rightContextSize <= 0) {
+    stop(sprintf("At least one of leftContextSize (=%d) and rightContextSize (=%d) must be > 0", leftContextSize, rightContextSize),
+         call. = FALSE)
+  }
+
+  if (lemmatizeNodeQuery) {
+    node <- lemmatizeWordQuery(node)
+  }
+
+  if (ignoreCollocateCase) {
+    collocate <- ignoreCollocateCaseWordQuery(collocate)
+  }
+
+  if (lemmatizeCollocateQuery) {
+    collocate <- lemmatizeWordQuery(collocate)
+  }
+
+  query <- ""
+
+  if (leftContextSize > 0) {
+    query <-
+      paste0(collocate,
+             if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
+             node,
+             if (rightContextSize > 0)  " | ")
+  }
+
+  if (rightContextSize > 0) {
+    query <-
+      paste0(query, node,
+             if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
+  }
+
+  if(!is.null(withinSpan) && !is.na(withinSpan) && nchar(withinSpan) > 0) {
+    query <- sprintf("contains(<%s>, (%s))", withinSpan, query)
+  }
+
+  query
+}
+
+ignoreCollocateCaseWordQuery <- function(w) {
+  paste0(w, '/i')
+}
+
+lemmatizeWordQuery <- function(w) {
+  paste0('[tt/l=', w, ']')
+}