Add recursion to collocationAnalysis

Change-Id: Ied5ab55f31f34048552ec0f0789b0b0175b2e4e0
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index bf962f6..2117da9 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -79,6 +79,9 @@
                    stopwords = RKorAPClient::synsemanticStopwords(),
                    seed = 7,
                    expand = length(vc) != length(node),
+                   maxRecurse = 0,
+                   addExamples = TRUE,
+                   localStopwords = c(),
                    ...) {
             # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
             word <- frequency <- NULL
@@ -93,7 +96,7 @@
               node <- lemmatizeWordQuery(node)
             }
 
-            if (length(node) > 1 || length(vc) > 1) {
+            result <- if (length(node) > 1 || length(vc) > 1) {
               grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
               purrr::pmap(grid, function(node, vc, ...)
                         collocationAnalysis(kco,
@@ -108,6 +111,8 @@
                                             withinSpan = withinSpan,
                                             exactFrequencies = exactFrequencies,
                                             stopwords = stopwords,
+                                            addExamples = TRUE,
+                                            localStopwords = localStopwords,
                                             seed = seed,
                                             expand = expand,
                                             ...) ) %>%
@@ -123,7 +128,7 @@
                 rightContextSize = rightContextSize,
                 searchHitsSampleLimit = searchHitsSampleLimit,
                 ignoreCollocateCase = ignoreCollocateCase,
-                stopwords = stopwords,
+                stopwords = append(stopwords, localStopwords),
                 ...
               )
 
@@ -149,9 +154,66 @@
                 tibble()
               }
             }
+            if (maxRecurse > 0 & any( result$logDice >= 2) ) {
+              recurseWith <- result %>%
+                filter(logDice >= 2)
+              result <- collocationAnalysis(
+                kco,
+                node = paste0("(", buildCollocationQuery(
+                  removeWithinSpan(recurseWith$node, withinSpan),
+                  recurseWith$collocate,
+                  leftContextSize = leftContextSize,
+                  rightContextSize = rightContextSize,
+                  withinSpan = ""
+                ), ")"),
+                vc = vc,
+                minOccur = minOccur,
+                leftContextSize = leftContextSize,
+                rightContextSize = rightContextSize,
+                withinSpan = withinSpan,
+                maxRecurse = maxRecurse - 1,
+                stopwords = stopwords,
+                localStopwords = recurseWith$collocate,
+                exactFrequencies = exactFrequencies,
+                searchHitsSampleLimit = searchHitsSampleLimit,
+                topCollocatesLimit = topCollocatesLimit,
+                addExamples = FALSE
+              ) %>%
+                bind_rows(result) %>%
+                filter(logDice >= 2) %>%
+                filter(.$O >= minOccur) %>%
+                dplyr::arrange(dplyr::desc(logDice))
+            }
+            if (addExamples && length(result) > 0) {
+              result$xquery <-buildCollocationQuery(
+                result$node,
+                result$collocate,
+                leftContextSize = leftContextSize,
+                rightContextSize = rightContextSize,
+                withinSpan = ""
+              )
+              result$example <- findExample(
+                kco,
+                query = result$xquery,
+                vc = result$vc
+              )
+            }
+            result
           }
 )
 
+#' @export
+removeWithinSpan <- function(query, withinSpan) {
+  if (withinSpan == "") {
+    return(query)
+  }
+  needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
+  res <- gsub(needle, '\\1', query)
+  needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
+  res <- gsub(needle, '\\1', res)
+  return(res)
+}
+
 #' @importFrom magrittr debug_pipe
 #' @importFrom stringr str_match str_split str_detect
 #' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
@@ -290,6 +352,31 @@
   return(res)
 }
 
+
+#' @export
+findExample <-
+  function(kco,
+           query,
+           vc = "",
+           matchOnly = TRUE) {
+    out <- character(length = length(query))
+
+    if (length(vc) < length(query))
+      vc <- rep(vc, length(query))
+
+    for (i in seq_along(query)) {
+      q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
+      q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
+      example <- as.character((q@collectedMatches)$snippet[1])
+      out[i] <- if(matchOnly) {
+          gsub('.*<mark>(.+)</mark>.*', '\\1', example)
+        } else {
+          stringr::str_replace(example, '<[^>]*>', '')
+        }
+    }
+    out
+  }
+
 collocatesQuery <-
   function(kco,
            query,