Add recursion to collocationAnalysis
Change-Id: Ied5ab55f31f34048552ec0f0789b0b0175b2e4e0
diff --git a/NAMESPACE b/NAMESPACE
index 9420cc0..47a5ace 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,11 +4,13 @@
export("%>%")
export(as_tibble)
export(bind_cols)
+export(buildCollocationQuery)
export(buildWebUIRequestUrl)
export(ci)
export(complete)
export(defaultAssociationScoreFunctions)
export(expand_grid)
+export(findExample)
export(geom_freq_by_year_ci)
export(group_by)
export(hc_add_onclick_korap_search)
@@ -23,6 +25,7 @@
export(percent)
export(pmi)
export(queryStringToLabel)
+export(removeWithinSpan)
export(select)
export(summarise)
export(synsemanticStopwords)
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index bf962f6..2117da9 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -79,6 +79,9 @@
stopwords = RKorAPClient::synsemanticStopwords(),
seed = 7,
expand = length(vc) != length(node),
+ maxRecurse = 0,
+ addExamples = TRUE,
+ localStopwords = c(),
...) {
# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
word <- frequency <- NULL
@@ -93,7 +96,7 @@
node <- lemmatizeWordQuery(node)
}
- if (length(node) > 1 || length(vc) > 1) {
+ result <- if (length(node) > 1 || length(vc) > 1) {
grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
purrr::pmap(grid, function(node, vc, ...)
collocationAnalysis(kco,
@@ -108,6 +111,8 @@
withinSpan = withinSpan,
exactFrequencies = exactFrequencies,
stopwords = stopwords,
+ addExamples = TRUE,
+ localStopwords = localStopwords,
seed = seed,
expand = expand,
...) ) %>%
@@ -123,7 +128,7 @@
rightContextSize = rightContextSize,
searchHitsSampleLimit = searchHitsSampleLimit,
ignoreCollocateCase = ignoreCollocateCase,
- stopwords = stopwords,
+ stopwords = append(stopwords, localStopwords),
...
)
@@ -149,9 +154,66 @@
tibble()
}
}
+ if (maxRecurse > 0 & any( result$logDice >= 2) ) {
+ recurseWith <- result %>%
+ filter(logDice >= 2)
+ result <- collocationAnalysis(
+ kco,
+ node = paste0("(", buildCollocationQuery(
+ removeWithinSpan(recurseWith$node, withinSpan),
+ recurseWith$collocate,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = ""
+ ), ")"),
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = withinSpan,
+ maxRecurse = maxRecurse - 1,
+ stopwords = stopwords,
+ localStopwords = recurseWith$collocate,
+ exactFrequencies = exactFrequencies,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ topCollocatesLimit = topCollocatesLimit,
+ addExamples = FALSE
+ ) %>%
+ bind_rows(result) %>%
+ filter(logDice >= 2) %>%
+ filter(.$O >= minOccur) %>%
+ dplyr::arrange(dplyr::desc(logDice))
+ }
+ if (addExamples && length(result) > 0) {
+ result$xquery <-buildCollocationQuery(
+ result$node,
+ result$collocate,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = ""
+ )
+ result$example <- findExample(
+ kco,
+ query = result$xquery,
+ vc = result$vc
+ )
+ }
+ result
}
)
+#' @export
+removeWithinSpan <- function(query, withinSpan) {
+ if (withinSpan == "") {
+ return(query)
+ }
+ needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
+ res <- gsub(needle, '\\1', query)
+ needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
+ res <- gsub(needle, '\\1', res)
+ return(res)
+}
+
#' @importFrom magrittr debug_pipe
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
@@ -290,6 +352,31 @@
return(res)
}
+
+#' @export
+findExample <-
+ function(kco,
+ query,
+ vc = "",
+ matchOnly = TRUE) {
+ out <- character(length = length(query))
+
+ if (length(vc) < length(query))
+ vc <- rep(vc, length(query))
+
+ for (i in seq_along(query)) {
+ q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
+ q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
+ example <- as.character((q@collectedMatches)$snippet[1])
+ out[i] <- if(matchOnly) {
+ gsub('.*<mark>(.+)</mark>.*', '\\1', example)
+ } else {
+ stringr::str_replace(example, '<[^>]*>', '')
+ }
+ }
+ out
+ }
+
collocatesQuery <-
function(kco,
query,
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
index 53d214a..1284155 100644
--- a/R/collocationScoreQuery.R
+++ b/R/collocationScoreQuery.R
@@ -96,7 +96,7 @@
buildWebUIRequestUrl(
kco,
buildCollocationQuery(
- node,
+ removeWithinSpan(node, withinSpan),
collocate,
lemmatizeNodeQuery,
lemmatizeCollocateQuery,
@@ -120,6 +120,7 @@
})
+#' @export
buildCollocationQuery <- function( node,
collocate,
lemmatizeNodeQuery = FALSE,
diff --git a/demo/Rmd/ca.Rmd b/demo/Rmd/ca.Rmd
new file mode 100644
index 0000000..4e530e8
--- /dev/null
+++ b/demo/Rmd/ca.Rmd
@@ -0,0 +1,44 @@
+---
+title: "Kookkurrenzanalyse zu aufmerksam"
+output:
+ html_document:
+ css: style.css
+ keep_md: yes
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE, warnings = FALSE)
+library(RKorAPClient)
+library(kableExtra)
+library(DT)
+library(tidyverse)
+kco <- new("KorAPConnection", verbose=T)
+if (!exists('ca')) {
+ca <- kco %>%
+ collocationAnalysis(
+ "aufmerksam",
+ leftContextSize = 2,
+ rightContextSize = 2,
+ exactFrequencies = TRUE,
+ searchHitsSampleLimit = 1000,
+ topCollocatesLimit = 10,
+ withinSpan = "",
+ maxRecurse=1
+ )
+}
+```
+
+```{r ca}
+ca %>%
+ mutate(Beispiel=sprintf('<a href="%s">%s</a>', webUIRequestUrl, example)) %>%
+ select(Beispiel, logDice, pmi, ll) %>%
+ head(50) %>%
+ datatable(escape = F,
+ extensions = c('Buttons'),
+ options = list(
+ buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
+ pageLength = 25,
+ dom = 'ftpB'
+ )) %>%
+ formatRound(columns=~logDice + pmi + ll,digits=2)
+```
diff --git a/demo/Rmd/style.css b/demo/Rmd/style.css
new file mode 100644
index 0000000..93df502
--- /dev/null
+++ b/demo/Rmd/style.css
@@ -0,0 +1,50 @@
+@import url('//code.cdn.mozilla.net/fonts/fira.css');
+@import url('//korap.ids-mannheim.de/font/libertinus.css');
+
+h1, h2, h3, h4, h5, h6 {
+ font-family: 'Fira Sans',sans-serif;
+ line-height: 1.2;
+ font-weight: 500;
+}
+
+.title, .subtitle {
+ text-transform: uppercase;
+ text-align: center;
+}
+
+.date, .author {
+ text-align: center;
+}
+
+body {
+ font-family: 'Fira Sans', sans-serif;
+ font-size: 18px;
+ font-weight: 400;
+ font-variant-ligatures: common-ligatures;
+}
+
+
+
+p {
+ hyphens: auto;
+ text-align: justify;
+ overflow-wrap: break-word;
+}
+
+.footnotes {
+ font-family: 'Fira Sans Condensed', sans-serif;
+ font-weight: 400;
+ font-size: 14px;
+ line-height: 1.5;
+}
+
+#TOC > ul {
+ font-family: 'Fira Sans', sans-serif;
+}
+
+.caption {
+ font-family: 'Fira Sans Condensed', sans-serif;
+ font-weight: 400;
+ font-size: 16px;
+ text-align: center;
+}
diff --git a/demo/recursiveCA.R b/demo/recursiveCA.R
new file mode 100644
index 0000000..b121c49
--- /dev/null
+++ b/demo/recursiveCA.R
@@ -0,0 +1,26 @@
+library(RKorAPClient)
+library(tidyverse)
+library(knitr)
+new("KorAPConnection", verbose = TRUE) %>%
+ collocationAnalysis(
+ "aufmerksam",
+ leftContextSize = 2,
+ rightContextSize = 2,
+ exactFrequencies = TRUE,
+ searchHitsSampleLimit = 1000,
+ topCollocatesLimit = 10,
+# withinSpan = "",
+ maxRecurse = 1,
+ addExamples = T
+ ) %>%
+# mutate(LVC = sprintf("[aufmerksam %s](%s)", collocate, webUIRequestUrl)) %>%
+ { . ->> cax } %>%
+ mutate(LVC = sprintf("[%s](%s)", example, webUIRequestUrl)) %>%
+ { . ->> ca } %>%
+ select(LVC, logDice, pmi, ll) %>%
+ head(50) %>%
+ kable(format="pipe", digits=2) %>%
+ cat(file="/tmp/aufmerksam.md", sep="\n")
+
+rmarkdown::render("/tmp/aufmerksam.md")
+browseURL("/tmp/aufmerksam.html")
diff --git a/man/collocationAnalysis-KorAPConnection-method.Rd b/man/collocationAnalysis-KorAPConnection-method.Rd
index 8ee0418..347909e 100644
--- a/man/collocationAnalysis-KorAPConnection-method.Rd
+++ b/man/collocationAnalysis-KorAPConnection-method.Rd
@@ -21,6 +21,9 @@
stopwords = RKorAPClient::synsemanticStopwords(),
seed = 7,
expand = length(vc) != length(node),
+ maxRecurse = 0,
+ addExamples = TRUE,
+ localStopwords = c(),
...
)
}