Add recursion to collocationAnalysis

Change-Id: Ied5ab55f31f34048552ec0f0789b0b0175b2e4e0
diff --git a/NAMESPACE b/NAMESPACE
index 9420cc0..47a5ace 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,11 +4,13 @@
 export("%>%")
 export(as_tibble)
 export(bind_cols)
+export(buildCollocationQuery)
 export(buildWebUIRequestUrl)
 export(ci)
 export(complete)
 export(defaultAssociationScoreFunctions)
 export(expand_grid)
+export(findExample)
 export(geom_freq_by_year_ci)
 export(group_by)
 export(hc_add_onclick_korap_search)
@@ -23,6 +25,7 @@
 export(percent)
 export(pmi)
 export(queryStringToLabel)
+export(removeWithinSpan)
 export(select)
 export(summarise)
 export(synsemanticStopwords)
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index bf962f6..2117da9 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -79,6 +79,9 @@
                    stopwords = RKorAPClient::synsemanticStopwords(),
                    seed = 7,
                    expand = length(vc) != length(node),
+                   maxRecurse = 0,
+                   addExamples = TRUE,
+                   localStopwords = c(),
                    ...) {
             # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
             word <- frequency <- NULL
@@ -93,7 +96,7 @@
               node <- lemmatizeWordQuery(node)
             }
 
-            if (length(node) > 1 || length(vc) > 1) {
+            result <- if (length(node) > 1 || length(vc) > 1) {
               grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
               purrr::pmap(grid, function(node, vc, ...)
                         collocationAnalysis(kco,
@@ -108,6 +111,8 @@
                                             withinSpan = withinSpan,
                                             exactFrequencies = exactFrequencies,
                                             stopwords = stopwords,
+                                            addExamples = TRUE,
+                                            localStopwords = localStopwords,
                                             seed = seed,
                                             expand = expand,
                                             ...) ) %>%
@@ -123,7 +128,7 @@
                 rightContextSize = rightContextSize,
                 searchHitsSampleLimit = searchHitsSampleLimit,
                 ignoreCollocateCase = ignoreCollocateCase,
-                stopwords = stopwords,
+                stopwords = append(stopwords, localStopwords),
                 ...
               )
 
@@ -149,9 +154,66 @@
                 tibble()
               }
             }
+            if (maxRecurse > 0 & any( result$logDice >= 2) ) {
+              recurseWith <- result %>%
+                filter(logDice >= 2)
+              result <- collocationAnalysis(
+                kco,
+                node = paste0("(", buildCollocationQuery(
+                  removeWithinSpan(recurseWith$node, withinSpan),
+                  recurseWith$collocate,
+                  leftContextSize = leftContextSize,
+                  rightContextSize = rightContextSize,
+                  withinSpan = ""
+                ), ")"),
+                vc = vc,
+                minOccur = minOccur,
+                leftContextSize = leftContextSize,
+                rightContextSize = rightContextSize,
+                withinSpan = withinSpan,
+                maxRecurse = maxRecurse - 1,
+                stopwords = stopwords,
+                localStopwords = recurseWith$collocate,
+                exactFrequencies = exactFrequencies,
+                searchHitsSampleLimit = searchHitsSampleLimit,
+                topCollocatesLimit = topCollocatesLimit,
+                addExamples = FALSE
+              ) %>%
+                bind_rows(result) %>%
+                filter(logDice >= 2) %>%
+                filter(.$O >= minOccur) %>%
+                dplyr::arrange(dplyr::desc(logDice))
+            }
+            if (addExamples && length(result) > 0) {
+              result$xquery <-buildCollocationQuery(
+                result$node,
+                result$collocate,
+                leftContextSize = leftContextSize,
+                rightContextSize = rightContextSize,
+                withinSpan = ""
+              )
+              result$example <- findExample(
+                kco,
+                query = result$xquery,
+                vc = result$vc
+              )
+            }
+            result
           }
 )
 
+#' @export
+removeWithinSpan <- function(query, withinSpan) {
+  if (withinSpan == "") {
+    return(query)
+  }
+  needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
+  res <- gsub(needle, '\\1', query)
+  needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
+  res <- gsub(needle, '\\1', res)
+  return(res)
+}
+
 #' @importFrom magrittr debug_pipe
 #' @importFrom stringr str_match str_split str_detect
 #' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
@@ -290,6 +352,31 @@
   return(res)
 }
 
+
+#' @export
+findExample <-
+  function(kco,
+           query,
+           vc = "",
+           matchOnly = TRUE) {
+    out <- character(length = length(query))
+
+    if (length(vc) < length(query))
+      vc <- rep(vc, length(query))
+
+    for (i in seq_along(query)) {
+      q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
+      q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
+      example <- as.character((q@collectedMatches)$snippet[1])
+      out[i] <- if(matchOnly) {
+          gsub('.*<mark>(.+)</mark>.*', '\\1', example)
+        } else {
+          stringr::str_replace(example, '<[^>]*>', '')
+        }
+    }
+    out
+  }
+
 collocatesQuery <-
   function(kco,
            query,
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
index 53d214a..1284155 100644
--- a/R/collocationScoreQuery.R
+++ b/R/collocationScoreQuery.R
@@ -96,7 +96,7 @@
                 buildWebUIRequestUrl(
                   kco,
                   buildCollocationQuery(
-                    node,
+                    removeWithinSpan(node, withinSpan),
                     collocate,
                     lemmatizeNodeQuery,
                     lemmatizeCollocateQuery,
@@ -120,6 +120,7 @@
 
           })
 
+#' @export
 buildCollocationQuery <- function(                   node,
                                                      collocate,
                                                      lemmatizeNodeQuery = FALSE,
diff --git a/demo/Rmd/ca.Rmd b/demo/Rmd/ca.Rmd
new file mode 100644
index 0000000..4e530e8
--- /dev/null
+++ b/demo/Rmd/ca.Rmd
@@ -0,0 +1,44 @@
+---
+title: "Kookkurrenzanalyse zu aufmerksam"
+output:
+  html_document:
+    css: style.css
+    keep_md: yes
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE, warnings = FALSE)
+library(RKorAPClient)
+library(kableExtra)
+library(DT)
+library(tidyverse)
+kco <- new("KorAPConnection", verbose=T)
+if (!exists('ca')) {
+ca <- kco %>%
+  collocationAnalysis(
+    "aufmerksam",
+    leftContextSize = 2,
+    rightContextSize = 2,
+    exactFrequencies = TRUE,
+    searchHitsSampleLimit = 1000,
+    topCollocatesLimit = 10,
+    withinSpan = "",
+    maxRecurse=1
+  )
+}
+```
+
+```{r ca}
+ca %>%
+  mutate(Beispiel=sprintf('<a href="%s">%s</a>', webUIRequestUrl, example)) %>%
+  select(Beispiel, logDice, pmi, ll) %>%
+  head(50) %>%
+  datatable(escape = F,
+            extensions = c('Buttons'),
+            options = list(
+              buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
+              pageLength = 25,
+              dom = 'ftpB'
+            )) %>%
+    formatRound(columns=~logDice + pmi + ll,digits=2)
+```
diff --git a/demo/Rmd/style.css b/demo/Rmd/style.css
new file mode 100644
index 0000000..93df502
--- /dev/null
+++ b/demo/Rmd/style.css
@@ -0,0 +1,50 @@
+@import url('//code.cdn.mozilla.net/fonts/fira.css');
+@import url('//korap.ids-mannheim.de/font/libertinus.css');
+
+h1, h2, h3, h4, h5, h6 {
+    font-family: 'Fira Sans',sans-serif;
+    line-height: 1.2;
+    font-weight: 500;
+}
+
+.title, .subtitle {
+  text-transform: uppercase;
+  text-align: center;
+}
+
+.date, .author {
+  text-align: center;
+}
+
+body {
+    font-family: 'Fira Sans', sans-serif;
+    font-size: 18px;
+    font-weight: 400;
+    font-variant-ligatures: common-ligatures;
+}
+
+
+
+p {
+  hyphens: auto;
+	text-align: justify;
+  overflow-wrap: break-word;
+}
+
+.footnotes {
+  font-family: 'Fira Sans Condensed', sans-serif;
+  font-weight: 400;
+  font-size: 14px;
+  line-height: 1.5;
+}
+
+#TOC > ul {
+  font-family: 'Fira Sans', sans-serif;
+}
+
+.caption {
+  font-family: 'Fira Sans Condensed', sans-serif;
+  font-weight: 400;
+  font-size: 16px;
+	text-align: center;
+}
diff --git a/demo/recursiveCA.R b/demo/recursiveCA.R
new file mode 100644
index 0000000..b121c49
--- /dev/null
+++ b/demo/recursiveCA.R
@@ -0,0 +1,26 @@
+library(RKorAPClient)
+library(tidyverse)
+library(knitr)
+new("KorAPConnection", verbose = TRUE) %>%
+  collocationAnalysis(
+    "aufmerksam",
+    leftContextSize = 2,
+    rightContextSize = 2,
+    exactFrequencies = TRUE,
+    searchHitsSampleLimit = 1000,
+    topCollocatesLimit = 10,
+#    withinSpan = "",
+    maxRecurse = 1,
+    addExamples = T
+  ) %>%
+#  mutate(LVC = sprintf("[aufmerksam %s](%s)", collocate, webUIRequestUrl)) %>%
+  { . ->> cax } %>%
+  mutate(LVC = sprintf("[%s](%s)", example, webUIRequestUrl)) %>%
+  { . ->> ca } %>%
+  select(LVC, logDice, pmi, ll) %>%
+  head(50) %>%
+  kable(format="pipe", digits=2)  %>%
+  cat(file="/tmp/aufmerksam.md", sep="\n")
+
+rmarkdown::render("/tmp/aufmerksam.md")
+browseURL("/tmp/aufmerksam.html")
diff --git a/man/collocationAnalysis-KorAPConnection-method.Rd b/man/collocationAnalysis-KorAPConnection-method.Rd
index 8ee0418..347909e 100644
--- a/man/collocationAnalysis-KorAPConnection-method.Rd
+++ b/man/collocationAnalysis-KorAPConnection-method.Rd
@@ -21,6 +21,9 @@
   stopwords = RKorAPClient::synsemanticStopwords(),
   seed = 7,
   expand = length(vc) != length(node),
+  maxRecurse = 0,
+  addExamples = TRUE,
+  localStopwords = c(),
   ...
 )
 }