Add full collocation analysis (client side only implementation)
Resolves #2
Change-Id: Ib01d89a72b44ff06816b21532b7ea709a4e837b0
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 0b07106..5ef4846 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -66,15 +66,23 @@
setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
-setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
maxResultsPerPage <- 50
## quiets concerns of R CMD check re: the .'s that appear in pipelines
if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
+#' Corpus query
+#'
#' \bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server
#'
+#' @rdname KorAPQuery-class
+#' @aliases corpusQuery
+#'
+#' @importFrom urltools url_encode
+#' @importFrom purrr pmap
+#' @importFrom dplyr bind_rows
+#'
#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
@@ -125,7 +133,6 @@
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
-#' @aliases corpusQuery
#' @export
setMethod("corpusQuery", "KorAPConnection",
function(kco,
@@ -151,25 +158,19 @@
expand = length(vc) != length(query),
as.df = FALSE) {
if (length(query) > 1 || length(vc) > 1) {
-
- grid <- {
- if (expand)
- expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc) }
- return(
- do.call(rbind,
- Map(function(q, cq) corpusQuery(kco, query=q, vc=cq, ql=ql,
- verbose=verbose, as.df = TRUE), grid$query, grid$vc)) %>%
- remove_rownames()
- )
- } else {
+ grid <- if (expand) expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc)
+ purrr::pmap(grid, function(query, vc, ...)
+ corpusQuery(kco, query=query, vc=vc, ql=ql, verbose=verbose, as.df = TRUE)) %>%
+ bind_rows()
+ } else {
contentFields <- c("snippet")
if (metadataOnly) {
fields <- fields[!fields %in% contentFields]
}
request <-
paste0('?q=',
- URLencode(enc2utf8(query), reserved = TRUE),
- if (vc != '') paste0('&cq=', URLencode(enc2utf8(vc), reserved = TRUE)) else '', '&ql=', ql)
+ url_encode(enc2utf8(query)),
+ ifelse (vc != '', paste0('&cq=', url_encode(enc2utf8(vc))), ''), '&ql=', ql)
webUIRequestUrl <- paste0(kco@KorAPUrl, request)
requestUrl <- paste0(
kco@apiUrl,
@@ -182,10 +183,11 @@
log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
"")
res = apiCall(kco, paste0(requestUrl, '&count=0'))
+ log.info(verbose, ": ", res$meta$totalResults, " hits")
if(!is.null(res$meta$cached))
log.info(verbose, " [cached]\n")
else
- log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
+ log.info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
if (as.df)
data.frame(
query = query,
@@ -218,6 +220,7 @@
#' @param offset start offset for query results to fetch
#' @param maxFetch maximum number of query results to fetch
#' @param verbose print progress information if true
+#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use \code{\link{set.seed}} to set seed for reproducible results.
#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
#'
#' @examples
@@ -232,7 +235,11 @@
#' @rdname KorAPQuery-class
#' @importFrom dplyr rowwise bind_rows select summarise n
#' @export
-setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
+setMethod("fetchNext", "KorAPQuery", function(kqo,
+ offset = kqo@nextStartIndex,
+ maxFetch = maxResultsPerPage,
+ verbose = kqo@korapConnection@verbose,
+ randomizePageOrder = FALSE) {
if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
return(kqo)
}
@@ -242,8 +249,14 @@
pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
collectedMatches <- kqo@collectedMatches
+ if (randomizePageOrder) {
+ pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
+ }
+
repeat {
- query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', offset + results, '&cutoff=true')
+ page = length(collectedMatches[,1]) %/% maxResultsPerPage + 1
+ currentOffset = ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
+ query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', currentOffset, '&cutoff=true')
res <- apiCall(kqo@korapConnection, query)
if (length(res$matches) == 0) {
break
@@ -270,11 +283,22 @@
collectedMatches <- rbind(collectedMatches, currentMatches)
}
if (verbose) {
- cat(paste0("Retrieved page ", page, "/", ceiling((kqo@totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
+ cat(paste0(
+ "Retrieved page ",
+ ceiling(length(collectedMatches[, 1]) / res$meta$itemsPerPage),
+ "/",
+ if (!is.na(maxFetch) && maxFetch < kqo@totalResults)
+ sprintf("%d (%d)", ceiling(maxFetch / res$meta$itemsPerPage), ceiling(kqo@totalResults / res$meta$itemsPerPage))
+ else
+ sprintf("%d", ceiling(kqo@totalResults / res$meta$itemsPerPage)),
+ ' in ',
+ res$meta$benchmark,
+ '\n'
+ ))
}
page <- page + 1
results <- results + res$meta$itemsPerPage
- if (offset + results >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
+ if (length(collectedMatches[,1]) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
break
}
}
@@ -305,8 +329,8 @@
#' @aliases fetchAll
#' @rdname KorAPQuery-class
#' @export
-setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
- return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
+setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+ return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
})
#' Fetches the remaining results of a KorAP query.
@@ -320,8 +344,8 @@
#' @aliases fetchRest
#' @rdname KorAPQuery-class
#' @export
-setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
- return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
+setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+ return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
})
#' Query relative frequency of search term(s)
@@ -357,6 +381,56 @@
ci(conf.level = conf.level)
})
+
+#' buildWebUIRequestUrl
+#'
+#' @rdname KorAPQuery-class
+#' @importFrom urltools url_encode
+#' @export
+buildWebUIRequestUrl <- function(kco,
+ query = if (missing(KorAPUrl))
+ stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
+ else
+ httr::parse_url(KorAPUrl)$query$q,
+ vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
+ KorAPUrl,
+ metadataOnly = TRUE,
+ ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
+ fields = c(
+ "corpusSigle",
+ "textSigle",
+ "pubDate",
+ "pubPlace",
+ "availability",
+ "textClass",
+ "snippet"
+ ),
+ accessRewriteFatal = TRUE) {
+ request <-
+ paste0(
+ '?q=',
+ urltools::url_encode(enc2utf8(as.character(query))),
+ ifelse(vc != '',
+ paste0('&cq=', urltools::url_encode(enc2utf8(vc))),
+ ''),
+ '&ql=',
+ ql
+ )
+ webUIRequestUrl <- paste0(kco@KorAPUrl, request)
+ requestUrl <- paste0(
+ kco@apiUrl,
+ 'search',
+ request,
+ '&fields=',
+ paste(fields, collapse = ","),
+ if (metadataOnly)
+ '&access-rewrite-disabled=true'
+ else
+ ''
+ )
+ webUIRequestUrl
+}
+
#´ format()
#' @rdname KorAPQuery-class
#' @param x KorAPQuery object
@@ -388,118 +462,3 @@
format(object)
})
-
-
-lemmatizeWordQuery <- function(w) {
- paste0('[tt/l=', w, ']')
-}
-
-#' Query frequencies of a node and a collocate and calculate collocation association scores
-#'
-#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
-#'
-#' @aliases collocationScoreQuery
-#' @rdname KorAPQuery-class
-#'
-#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
-#' @param node target word
-#' @param collocate collocate of target word
-#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
-#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param leftContextSize size of the left context window
-#' @param rightContextSize size of the right context window
-#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
-#' @param smoothingConstant smoothing constant will be added to all observed values
-#'
-#' @return tibble with query KorAP web request URL, all observed values and association scores
-#'
-#' @examples
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Grund", "triftiger")
-#' }
-#'
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Grund", c("guter", "triftiger"),
-#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
-#' }
-#'
-#' \donttest{
-#' library(highcharter)
-#' library(tidyr)
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
-#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
-#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
-#' hc_add_onclick_korap_search()
-#' }
-#'
-#' @importFrom tidyr pivot_longer
-#' @export
-setMethod("collocationScoreQuery", "KorAPConnection",
- function(kco,
- node,
- collocate,
- vc = "",
- lemmatizeNodeQuery = FALSE,
- lemmatizeCollocateQuery = FALSE,
- leftContextSize = 5,
- rightContextSize = 5,
- scoreFunctions = defaultAssociationScoreFunctions(),
- smoothingConstant = .5
- ) {
- # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
- O1 <- O2 <- O <- N <- E <- w <- 0
-
- if (leftContextSize <= 0 && rightContextSize <= 0) {
- stop("At least one of leftContextSize and rightContextSize must be > 0",
- call. = FALSE)
- }
-
- if (lemmatizeNodeQuery) {
- node <- lemmatizeWordQuery(node)
- }
-
- if (lemmatizeCollocateQuery) {
- collocate <- lemmatizeWordQuery(collocate)
- }
-
- query <- ""
-
- if (leftContextSize > 0) {
- query <-
- paste0(collocate,
- if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
- node,
- if (rightContextSize > 0) " | ")
- }
-
- if (rightContextSize > 0) {
- query <-
- paste0(query, node,
- if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
- }
-
-
- tibble(
- node = node,
- collocate = collocate,
- label = queryStringToLabel(vc),
- vc = vc,
- webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
- w = leftContextSize + rightContextSize,
- leftContextSize,
- rightContextSize,
- N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
- O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
- O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
- O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
- E = w * as.double(O1) * O2 / N
- ) %>%
- mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
-
- })
diff --git a/R/association-scores.R b/R/association-scores.R
index 31ab994..fccba98 100644
--- a/R/association-scores.R
+++ b/R/association-scores.R
@@ -9,6 +9,7 @@
#'
#' @return association score
#' @name association-score-functions
+#'
#' @description
#' Functions to calculate different collocation association scores between
#' a node (target word) and words in a window around the it.
@@ -18,6 +19,8 @@
#' @rdname association-score-functions
#'
+#' @family collocation analysis functions
+#'
#' @export
#'
#' @examples
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
new file mode 100644
index 0000000..0df5bea
--- /dev/null
+++ b/R/collocationAnalysis.R
@@ -0,0 +1,318 @@
+setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
+
+#' Collocation analysis
+#'
+#' @aliases collocationAnalysis
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Performs a collocation analysis for the given node (or query)
+#' in the given virtual corpus.
+#'
+#' @details
+#' The collocation analysis is currently implemented on the client side, as some of the
+#' functionality is not yet provided by the KorAP backend. Mainly for this reason
+#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
+#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
+#' and look for expression-internal collocates using the focus function (see examples and demo).
+#'
+#' To increase speed at the cost of accuracy and possible false negatives,
+#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
+#'
+#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
+#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
+#' user interface.
+#'
+#' @family collocation analysis functions
+#'
+#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]
+#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
+#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
+#' @param searchHitsSampleLimit limit the size of the search hits sample
+#' @param stopwords vector of stopwords not to be considered as collocates
+#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
+#' @param seed seed for random page collecting order
+#' @param expand if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
+#' @param ... more arguments will be passed to \code{\link{collocationScoreQuery}}
+#' @inheritParams collocationScoreQuery,KorAPConnection-method
+#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
+#'
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
+#' @importFrom purrr pmap
+#' @importFrom tidyr expand_grid
+#'
+#' @examples
+#' \donttest{
+#' # Find top collocates of "Packung" inside and outside the sports domain.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
+#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
+#' dplyr::filter(logDice >= 5)
+#' }
+#'
+#' \donttest{
+#' # Identify the most prominent light verb construction with "in ... setzen".
+#' # Note that, currently, the use of focus function disallows exactFrequencies.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
+#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
+#' }
+#'
+#' @export
+setMethod("collocationAnalysis", "KorAPConnection",
+ function(kco,
+ node,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ topCollocatesLimit = 200,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
+ exactFrequencies = TRUE,
+ stopwords = RKorAPClient::synsemanticStopwords(),
+ seed = 7,
+ expand = length(vc) != length(node),
+ ...) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ word <- frequency <- NULL
+
+ if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
+ stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
+ }
+
+ if (lemmatizeNodeQuery) {
+ node <- lemmatizeWordQuery(node)
+ }
+
+ if (length(node) > 1 || length(vc) > 1) {
+ grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
+ purrr::pmap(grid, function(node, vc, ...)
+ collocationAnalysis(kco,
+ node =node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ topCollocatesLimit = topCollocatesLimit,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ exactFrequencies = exactFrequencies,
+ stopwords = stopwords,
+ seed = seed,
+ expand = expand,
+ ...) ) %>%
+ bind_rows()
+ } else {
+ set.seed(seed)
+ candidates <- collocatesQuery(
+ kco,
+ node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords,
+ ...
+ )
+
+ if (nrow(candidates) > 0) {
+ candidates <- candidates %>%
+ filter(frequency >= minOccur) %>%
+ head(topCollocatesLimit)
+ collocationScoreQuery(
+ kco,
+ node = node,
+ collocate = candidates$word,
+ vc = vc,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ observed = if (exactFrequencies) NA else candidates$frequency,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ ...
+ ) %>%
+ filter(.$O >= minOccur) %>%
+ dplyr::arrange(dplyr::desc(logDice))
+ } else {
+ tibble()
+ }
+ }
+ }
+)
+
+#' @importFrom magrittr debug_pipe
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
+#'
+snippet2FreqTable <- function(snippet,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+ tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|")",
+ oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
+ verbose = TRUE) {
+ word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ frequency <- NULL
+
+ if (length(snippet) < 1) {
+ dplyr::tibble(word=c(), frequency=c())
+ } else if (length(snippet) > 1) {
+ log.info(verbose, paste("Joinging", length(snippet), "kwics"))
+ for (s in snippet) {
+ oldTable <- snippet2FreqTable(
+ s,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ oldTable = oldTable,
+ stopwords = stopwords
+ )
+ }
+ log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens"))
+ oldTable %>%
+ group_by(word) %>%
+ mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
+ summarise(frequency=sum(frequency), .groups = "drop") %>%
+ arrange(desc(frequency))
+ } else {
+ stopwordsTable <- dplyr::tibble(word=stopwords)
+ match <-
+ str_match(
+ snippet,
+ '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
+ )
+
+ left <- if(leftContextSize > 0)
+ tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
+ else
+ ""
+# cat(paste("left:", left, "\n", collapse=" "))
+
+ right <- if(rightContextSize > 0)
+ head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
+ else
+ ""
+# cat(paste("right:", right, "\n", collapse=" "))
+
+ if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
+ oldTable
+ } else {
+ table(c(left, right)) %>%
+ dplyr::as_tibble(.name_repair = "minimal") %>%
+ dplyr::rename(word = 1, frequency = 2) %>%
+ dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
+ dplyr::anti_join(stopwordsTable, by="word") %>%
+ dplyr::bind_rows(oldTable)
+ }
+ }
+}
+
+#' Preliminary synsemantic stopwords function
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Preliminary synsemantic stopwords function to be used in collocation analysis.
+#'
+#' @details
+#' Currently only suitable for German. See stopwords package for other languages.
+#'
+#' @param ... future arguments for language detection
+#'
+#' @family collocation analysis functions
+#' @return Vector of synsemantic stopwords.
+#' @export
+synsemanticStopwords <- function(...) {
+ res <- c(
+ "der",
+ "die",
+ "und",
+ "in",
+ "den",
+ "von",
+ "mit",
+ "das",
+ "zu",
+ "im",
+ "ist",
+ "auf",
+ "sich",
+ "Die",
+ "des",
+ "dem",
+ "nicht",
+ "ein",
+ "eine",
+ "es",
+ "auch",
+ "an",
+ "als",
+ "am",
+ "aus",
+ "Der",
+ "bei",
+ "er",
+ "dass",
+ "sie",
+ "nach",
+ "um",
+ "Das",
+ "zum",
+ "noch",
+ "war",
+ "einen",
+ "einer",
+ "wie",
+ "einem",
+ "vor",
+ "bis",
+ "\u00fcber",
+ "so",
+ "aber",
+ "Eine",
+ "diese",
+ "Diese",
+ "oder"
+ )
+ return(res)
+}
+
+collocatesQuery <-
+ function(kco,
+ query,
+ vc = "",
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+ ...) {
+ frequency <- NULL
+ q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
+ if(q@totalResults == 0) {
+ tibble(word=c(), frequency=c())
+ } else {
+ q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
+ snippet2FreqTable((q@collectedMatches)$snippet,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords,
+ verbose = kco@verbose) %>%
+ mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
+ filter(frequency >= minOccur)
+ }
+ }
+
+
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
new file mode 100644
index 0000000..8ec9bf2
--- /dev/null
+++ b/R/collocationScoreQuery.R
@@ -0,0 +1,175 @@
+setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
+
+## quiets concerns of R CMD check re: the .'s that appear in pipelines
+if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
+
+
+#' Query frequencies of a node and a collocate and calculate collocation association scores
+#'
+#' @aliases collocationScoreQuery
+#'
+#' @description
+#' Computes various collocation association scores
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
+#'
+#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
+#' @param node target word
+#' @param collocate collocate of target word
+#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
+#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param leftContextSize size of the left context window
+#' @param rightContextSize size of the right context window
+#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
+#' @param smoothingConstant smoothing constant will be added to all observed values
+#' @param observed if collocation frequencies are already known (or estimated from a sample) they can be passed as a vector here, otherwise: NA
+#' @param ignoreCollocateCase logical, set to TRUE if collocate case should be ignored
+#' @param withinSpan KorAP span specification for collocations to be searched within
+#'
+#' @return tibble with query KorAP web request URL, all observed values and association scores
+#'
+#' @family collocation analysis functions
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", "triftiger")
+#' }
+#'
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", c("guter", "triftiger"),
+#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+#' }
+#'
+#' \donttest{
+#' library(highcharter)
+#' library(tidyr)
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
+#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
+#' hc_add_onclick_korap_search()
+#' }
+#'
+#' @importFrom tidyr pivot_longer
+#' @export
+setMethod("collocationScoreQuery", "KorAPConnection",
+ function(kco,
+ node,
+ collocate,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ lemmatizeCollocateQuery = FALSE,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = .5,
+ observed = NA,
+ ignoreCollocateCase = FALSE,
+ withinSpan = "base/s=s"
+ ) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ O1 <- O2 <- O <- N <- E <- w <- 0
+
+ query <- buildCollocationQuery(node,
+ collocate,
+ lemmatizeNodeQuery,
+ lemmatizeCollocateQuery,
+ leftContextSize,
+ rightContextSize,
+ ignoreCollocateCase,
+ withinSpan)
+
+ tibble(
+ node = node,
+ collocate = collocate,
+ label = queryStringToLabel(vc),
+ vc = vc,
+ webUIRequestUrl = if (is.na(observed[1]))
+ frequencyQuery(kco, query, vc)$webUIRequestUrl
+ else
+ buildWebUIRequestUrl(
+ kco,
+ buildCollocationQuery(
+ node,
+ collocate,
+ lemmatizeNodeQuery,
+ lemmatizeCollocateQuery,
+ leftContextSize,
+ rightContextSize,
+ ignoreCollocateCase,
+ withinSpan
+ ),
+ vc
+ ),
+ w = leftContextSize + rightContextSize,
+ leftContextSize,
+ rightContextSize,
+ N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+ O = as.double( if(is.na(observed[1])) frequencyQuery(kco, query, vc)$totalResults else observed) + smoothingConstant,
+ O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
+ O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+ E = w * as.double(O1) * O2 / N
+ ) %>%
+ mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
+
+ })
+
+buildCollocationQuery <- function( node,
+ collocate,
+ lemmatizeNodeQuery = FALSE,
+ lemmatizeCollocateQuery = FALSE,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ ignoreCollocateCase = FALSE,
+ withinSpan = "base/s=s"
+) {
+ if (leftContextSize <= 0 && rightContextSize <= 0) {
+ stop(sprintf("At least one of leftContextSize (=%d) and rightContextSize (=%d) must be > 0", leftContextSize, rightContextSize),
+ call. = FALSE)
+ }
+
+ if (lemmatizeNodeQuery) {
+ node <- lemmatizeWordQuery(node)
+ }
+
+ if (ignoreCollocateCase) {
+ collocate <- ignoreCollocateCaseWordQuery(collocate)
+ }
+
+ if (lemmatizeCollocateQuery) {
+ collocate <- lemmatizeWordQuery(collocate)
+ }
+
+ query <- ""
+
+ if (leftContextSize > 0) {
+ query <-
+ paste0(collocate,
+ if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
+ node,
+ if (rightContextSize > 0) " | ")
+ }
+
+ if (rightContextSize > 0) {
+ query <-
+ paste0(query, node,
+ if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
+ }
+
+ if(!is.null(withinSpan) && !is.na(withinSpan) && nchar(withinSpan) > 0) {
+ query <- sprintf("contains(<%s>, (%s))", withinSpan, query)
+ }
+
+ query
+}
+
+ignoreCollocateCaseWordQuery <- function(w) {
+ paste0(w, '/i')
+}
+
+lemmatizeWordQuery <- function(w) {
+ paste0('[tt/l=', w, ']')
+}