Add recursion to collocationAnalysis
Change-Id: Ied5ab55f31f34048552ec0f0789b0b0175b2e4e0
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index bf962f6..2117da9 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -79,6 +79,9 @@
stopwords = RKorAPClient::synsemanticStopwords(),
seed = 7,
expand = length(vc) != length(node),
+ maxRecurse = 0,
+ addExamples = TRUE,
+ localStopwords = c(),
...) {
# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
word <- frequency <- NULL
@@ -93,7 +96,7 @@
node <- lemmatizeWordQuery(node)
}
- if (length(node) > 1 || length(vc) > 1) {
+ result <- if (length(node) > 1 || length(vc) > 1) {
grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
purrr::pmap(grid, function(node, vc, ...)
collocationAnalysis(kco,
@@ -108,6 +111,8 @@
withinSpan = withinSpan,
exactFrequencies = exactFrequencies,
stopwords = stopwords,
+ addExamples = TRUE,
+ localStopwords = localStopwords,
seed = seed,
expand = expand,
...) ) %>%
@@ -123,7 +128,7 @@
rightContextSize = rightContextSize,
searchHitsSampleLimit = searchHitsSampleLimit,
ignoreCollocateCase = ignoreCollocateCase,
- stopwords = stopwords,
+ stopwords = append(stopwords, localStopwords),
...
)
@@ -149,9 +154,66 @@
tibble()
}
}
+ if (maxRecurse > 0 & any( result$logDice >= 2) ) {
+ recurseWith <- result %>%
+ filter(logDice >= 2)
+ result <- collocationAnalysis(
+ kco,
+ node = paste0("(", buildCollocationQuery(
+ removeWithinSpan(recurseWith$node, withinSpan),
+ recurseWith$collocate,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = ""
+ ), ")"),
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = withinSpan,
+ maxRecurse = maxRecurse - 1,
+ stopwords = stopwords,
+ localStopwords = recurseWith$collocate,
+ exactFrequencies = exactFrequencies,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ topCollocatesLimit = topCollocatesLimit,
+ addExamples = FALSE
+ ) %>%
+ bind_rows(result) %>%
+ filter(logDice >= 2) %>%
+ filter(.$O >= minOccur) %>%
+ dplyr::arrange(dplyr::desc(logDice))
+ }
+ if (addExamples && length(result) > 0) {
+ result$xquery <-buildCollocationQuery(
+ result$node,
+ result$collocate,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = ""
+ )
+ result$example <- findExample(
+ kco,
+ query = result$xquery,
+ vc = result$vc
+ )
+ }
+ result
}
)
+#' @export
+removeWithinSpan <- function(query, withinSpan) {
+ if (withinSpan == "") {
+ return(query)
+ }
+ needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
+ res <- gsub(needle, '\\1', query)
+ needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
+ res <- gsub(needle, '\\1', res)
+ return(res)
+}
+
#' @importFrom magrittr debug_pipe
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
@@ -290,6 +352,31 @@
return(res)
}
+
+#' @export
+findExample <-
+ function(kco,
+ query,
+ vc = "",
+ matchOnly = TRUE) {
+ out <- character(length = length(query))
+
+ if (length(vc) < length(query))
+ vc <- rep(vc, length(query))
+
+ for (i in seq_along(query)) {
+ q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
+ q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
+ example <- as.character((q@collectedMatches)$snippet[1])
+ out[i] <- if(matchOnly) {
+ gsub('.*<mark>(.+)</mark>.*', '\\1', example)
+ } else {
+ stringr::str_replace(example, '<[^>]*>', '')
+ }
+ }
+ out
+ }
+
collocatesQuery <-
function(kco,
query,