Add full collocation analysis (client side only implementation)
Resolves #2
Change-Id: Ib01d89a72b44ff06816b21532b7ea709a4e837b0
diff --git a/DESCRIPTION b/DESCRIPTION
index 340b22a..2053a09 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -39,6 +39,7 @@
magrittr,
tidyr,
dplyr,
+ lifecycle,
lubridate,
highcharter,
jsonlite,
@@ -46,7 +47,10 @@
utils,
httr,
methods,
- PTXQC
+ PTXQC,
+ purrr,
+ stringr,
+ urltools
Suggests:
testthat
Collate:
@@ -56,6 +60,8 @@
'KorAPQuery.R'
'association-scores.R'
'ci.R'
+ 'collocationAnalysis.R'
+ 'collocationScoreQuery.R'
'highcharter-helper.R'
'misc.R'
'reexports.R'
diff --git a/NAMESPACE b/NAMESPACE
index 6017fe9..9420cc0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@
export("%>%")
export(as_tibble)
export(bind_cols)
+export(buildWebUIRequestUrl)
export(ci)
export(complete)
export(defaultAssociationScoreFunctions)
@@ -24,6 +25,7 @@
export(queryStringToLabel)
export(select)
export(summarise)
+export(synsemanticStopwords)
export(tidy)
export(year)
exportClasses(KorAPConnection)
@@ -32,6 +34,7 @@
exportMethods(apiCall)
exportMethods(clearAccessToken)
exportMethods(clearCache)
+exportMethods(collocationAnalysis)
exportMethods(collocationScoreQuery)
exportMethods(corpusQuery)
exportMethods(corpusStats)
@@ -52,9 +55,15 @@
importFrom(PTXQC,lcsCount)
importFrom(broom,tidy)
importFrom(dplyr,.data)
+importFrom(dplyr,anti_join)
+importFrom(dplyr,arrange)
+importFrom(dplyr,as_tibble)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
+importFrom(dplyr,case_when)
+importFrom(dplyr,desc)
importFrom(dplyr,enquo)
+importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,if_else)
importFrom(dplyr,mutate)
@@ -62,8 +71,10 @@
importFrom(dplyr,rename)
importFrom(dplyr,rowwise)
importFrom(dplyr,select)
+importFrom(dplyr,slice_head)
importFrom(dplyr,starts_with)
importFrom(dplyr,summarise)
+importFrom(dplyr,tibble)
importFrom(ggplot2,GeomPoint)
importFrom(ggplot2,aes)
importFrom(ggplot2,element_text)
@@ -78,7 +89,12 @@
importFrom(jsonlite,fromJSON)
importFrom(lubridate,year)
importFrom(magrittr,"%>%")
+importFrom(magrittr,debug_pipe)
+importFrom(purrr,pmap)
importFrom(stats,prop.test)
+importFrom(stringr,str_detect)
+importFrom(stringr,str_match)
+importFrom(stringr,str_split)
importFrom(tibble,add_column)
importFrom(tibble,as_tibble)
importFrom(tibble,remove_rownames)
@@ -87,3 +103,4 @@
importFrom(tidyr,complete)
importFrom(tidyr,expand_grid)
importFrom(tidyr,pivot_longer)
+importFrom(urltools,url_encode)
diff --git a/NEWS.md b/NEWS.md
index 88a6648..70f6ad1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+## New Features
+- experimental new `collocationAnalysis` method (client-sided)
+- new parameter `randomizePageOrder` to fetch result pages in randomized order
+- new parameter `ignoreCollocateCase` in `collocationScoreQuery`
+- new parameter `withinSpan` (default: `base/s=s`) in `collocationScoreQuery`
+- number of hits logged during queries, if `verbose=TRUE`
+
## Bug Fixes
- fixed umlaut queries on windows
- fixed retrieval of access token when multiple access tokens are stored
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 0b07106..5ef4846 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -66,15 +66,23 @@
setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
-setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
maxResultsPerPage <- 50
## quiets concerns of R CMD check re: the .'s that appear in pipelines
if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
+#' Corpus query
+#'
#' \bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server
#'
+#' @rdname KorAPQuery-class
+#' @aliases corpusQuery
+#'
+#' @importFrom urltools url_encode
+#' @importFrom purrr pmap
+#' @importFrom dplyr bind_rows
+#'
#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
@@ -125,7 +133,6 @@
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
-#' @aliases corpusQuery
#' @export
setMethod("corpusQuery", "KorAPConnection",
function(kco,
@@ -151,25 +158,19 @@
expand = length(vc) != length(query),
as.df = FALSE) {
if (length(query) > 1 || length(vc) > 1) {
-
- grid <- {
- if (expand)
- expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc) }
- return(
- do.call(rbind,
- Map(function(q, cq) corpusQuery(kco, query=q, vc=cq, ql=ql,
- verbose=verbose, as.df = TRUE), grid$query, grid$vc)) %>%
- remove_rownames()
- )
- } else {
+ grid <- if (expand) expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc)
+ purrr::pmap(grid, function(query, vc, ...)
+ corpusQuery(kco, query=query, vc=vc, ql=ql, verbose=verbose, as.df = TRUE)) %>%
+ bind_rows()
+ } else {
contentFields <- c("snippet")
if (metadataOnly) {
fields <- fields[!fields %in% contentFields]
}
request <-
paste0('?q=',
- URLencode(enc2utf8(query), reserved = TRUE),
- if (vc != '') paste0('&cq=', URLencode(enc2utf8(vc), reserved = TRUE)) else '', '&ql=', ql)
+ url_encode(enc2utf8(query)),
+ ifelse (vc != '', paste0('&cq=', url_encode(enc2utf8(vc))), ''), '&ql=', ql)
webUIRequestUrl <- paste0(kco@KorAPUrl, request)
requestUrl <- paste0(
kco@apiUrl,
@@ -182,10 +183,11 @@
log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
"")
res = apiCall(kco, paste0(requestUrl, '&count=0'))
+ log.info(verbose, ": ", res$meta$totalResults, " hits")
if(!is.null(res$meta$cached))
log.info(verbose, " [cached]\n")
else
- log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
+ log.info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
if (as.df)
data.frame(
query = query,
@@ -218,6 +220,7 @@
#' @param offset start offset for query results to fetch
#' @param maxFetch maximum number of query results to fetch
#' @param verbose print progress information if true
+#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use \code{\link{set.seed}} to set seed for reproducible results.
#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
#'
#' @examples
@@ -232,7 +235,11 @@
#' @rdname KorAPQuery-class
#' @importFrom dplyr rowwise bind_rows select summarise n
#' @export
-setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
+setMethod("fetchNext", "KorAPQuery", function(kqo,
+ offset = kqo@nextStartIndex,
+ maxFetch = maxResultsPerPage,
+ verbose = kqo@korapConnection@verbose,
+ randomizePageOrder = FALSE) {
if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
return(kqo)
}
@@ -242,8 +249,14 @@
pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
collectedMatches <- kqo@collectedMatches
+ if (randomizePageOrder) {
+ pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
+ }
+
repeat {
- query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', offset + results, '&cutoff=true')
+ page = length(collectedMatches[,1]) %/% maxResultsPerPage + 1
+ currentOffset = ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
+ query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', currentOffset, '&cutoff=true')
res <- apiCall(kqo@korapConnection, query)
if (length(res$matches) == 0) {
break
@@ -270,11 +283,22 @@
collectedMatches <- rbind(collectedMatches, currentMatches)
}
if (verbose) {
- cat(paste0("Retrieved page ", page, "/", ceiling((kqo@totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
+ cat(paste0(
+ "Retrieved page ",
+ ceiling(length(collectedMatches[, 1]) / res$meta$itemsPerPage),
+ "/",
+ if (!is.na(maxFetch) && maxFetch < kqo@totalResults)
+ sprintf("%d (%d)", ceiling(maxFetch / res$meta$itemsPerPage), ceiling(kqo@totalResults / res$meta$itemsPerPage))
+ else
+ sprintf("%d", ceiling(kqo@totalResults / res$meta$itemsPerPage)),
+ ' in ',
+ res$meta$benchmark,
+ '\n'
+ ))
}
page <- page + 1
results <- results + res$meta$itemsPerPage
- if (offset + results >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
+ if (length(collectedMatches[,1]) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
break
}
}
@@ -305,8 +329,8 @@
#' @aliases fetchAll
#' @rdname KorAPQuery-class
#' @export
-setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
- return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
+setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+ return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
})
#' Fetches the remaining results of a KorAP query.
@@ -320,8 +344,8 @@
#' @aliases fetchRest
#' @rdname KorAPQuery-class
#' @export
-setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
- return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
+setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+ return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
})
#' Query relative frequency of search term(s)
@@ -357,6 +381,56 @@
ci(conf.level = conf.level)
})
+
+#' buildWebUIRequestUrl
+#'
+#' @rdname KorAPQuery-class
+#' @importFrom urltools url_encode
+#' @export
+buildWebUIRequestUrl <- function(kco,
+ query = if (missing(KorAPUrl))
+ stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
+ else
+ httr::parse_url(KorAPUrl)$query$q,
+ vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
+ KorAPUrl,
+ metadataOnly = TRUE,
+ ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
+ fields = c(
+ "corpusSigle",
+ "textSigle",
+ "pubDate",
+ "pubPlace",
+ "availability",
+ "textClass",
+ "snippet"
+ ),
+ accessRewriteFatal = TRUE) {
+ request <-
+ paste0(
+ '?q=',
+ urltools::url_encode(enc2utf8(as.character(query))),
+ ifelse(vc != '',
+ paste0('&cq=', urltools::url_encode(enc2utf8(vc))),
+ ''),
+ '&ql=',
+ ql
+ )
+ webUIRequestUrl <- paste0(kco@KorAPUrl, request)
+ requestUrl <- paste0(
+ kco@apiUrl,
+ 'search',
+ request,
+ '&fields=',
+ paste(fields, collapse = ","),
+ if (metadataOnly)
+ '&access-rewrite-disabled=true'
+ else
+ ''
+ )
+ webUIRequestUrl
+}
+
#´ format()
#' @rdname KorAPQuery-class
#' @param x KorAPQuery object
@@ -388,118 +462,3 @@
format(object)
})
-
-
-lemmatizeWordQuery <- function(w) {
- paste0('[tt/l=', w, ']')
-}
-
-#' Query frequencies of a node and a collocate and calculate collocation association scores
-#'
-#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
-#'
-#' @aliases collocationScoreQuery
-#' @rdname KorAPQuery-class
-#'
-#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
-#' @param node target word
-#' @param collocate collocate of target word
-#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
-#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param leftContextSize size of the left context window
-#' @param rightContextSize size of the right context window
-#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
-#' @param smoothingConstant smoothing constant will be added to all observed values
-#'
-#' @return tibble with query KorAP web request URL, all observed values and association scores
-#'
-#' @examples
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Grund", "triftiger")
-#' }
-#'
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Grund", c("guter", "triftiger"),
-#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
-#' }
-#'
-#' \donttest{
-#' library(highcharter)
-#' library(tidyr)
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
-#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
-#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
-#' hc_add_onclick_korap_search()
-#' }
-#'
-#' @importFrom tidyr pivot_longer
-#' @export
-setMethod("collocationScoreQuery", "KorAPConnection",
- function(kco,
- node,
- collocate,
- vc = "",
- lemmatizeNodeQuery = FALSE,
- lemmatizeCollocateQuery = FALSE,
- leftContextSize = 5,
- rightContextSize = 5,
- scoreFunctions = defaultAssociationScoreFunctions(),
- smoothingConstant = .5
- ) {
- # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
- O1 <- O2 <- O <- N <- E <- w <- 0
-
- if (leftContextSize <= 0 && rightContextSize <= 0) {
- stop("At least one of leftContextSize and rightContextSize must be > 0",
- call. = FALSE)
- }
-
- if (lemmatizeNodeQuery) {
- node <- lemmatizeWordQuery(node)
- }
-
- if (lemmatizeCollocateQuery) {
- collocate <- lemmatizeWordQuery(collocate)
- }
-
- query <- ""
-
- if (leftContextSize > 0) {
- query <-
- paste0(collocate,
- if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
- node,
- if (rightContextSize > 0) " | ")
- }
-
- if (rightContextSize > 0) {
- query <-
- paste0(query, node,
- if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
- }
-
-
- tibble(
- node = node,
- collocate = collocate,
- label = queryStringToLabel(vc),
- vc = vc,
- webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
- w = leftContextSize + rightContextSize,
- leftContextSize,
- rightContextSize,
- N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
- O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
- O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
- O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
- E = w * as.double(O1) * O2 / N
- ) %>%
- mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
-
- })
diff --git a/R/association-scores.R b/R/association-scores.R
index 31ab994..fccba98 100644
--- a/R/association-scores.R
+++ b/R/association-scores.R
@@ -9,6 +9,7 @@
#'
#' @return association score
#' @name association-score-functions
+#'
#' @description
#' Functions to calculate different collocation association scores between
#' a node (target word) and words in a window around the it.
@@ -18,6 +19,8 @@
#' @rdname association-score-functions
#'
+#' @family collocation analysis functions
+#'
#' @export
#'
#' @examples
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
new file mode 100644
index 0000000..0df5bea
--- /dev/null
+++ b/R/collocationAnalysis.R
@@ -0,0 +1,318 @@
+setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
+
+#' Collocation analysis
+#'
+#' @aliases collocationAnalysis
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Performs a collocation analysis for the given node (or query)
+#' in the given virtual corpus.
+#'
+#' @details
+#' The collocation analysis is currently implemented on the client side, as some of the
+#' functionality is not yet provided by the KorAP backend. Mainly for this reason
+#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
+#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
+#' and look for expression-internal collocates using the focus function (see examples and demo).
+#'
+#' To increase speed at the cost of accuracy and possible false negatives,
+#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
+#'
+#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
+#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
+#' user interface.
+#'
+#' @family collocation analysis functions
+#'
+#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]
+#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
+#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
+#' @param searchHitsSampleLimit limit the size of the search hits sample
+#' @param stopwords vector of stopwords not to be considered as collocates
+#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
+#' @param seed seed for random page collecting order
+#' @param expand if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
+#' @param ... more arguments will be passed to \code{\link{collocationScoreQuery}}
+#' @inheritParams collocationScoreQuery,KorAPConnection-method
+#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
+#'
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
+#' @importFrom purrr pmap
+#' @importFrom tidyr expand_grid
+#'
+#' @examples
+#' \donttest{
+#' # Find top collocates of "Packung" inside and outside the sports domain.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
+#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
+#' dplyr::filter(logDice >= 5)
+#' }
+#'
+#' \donttest{
+#' # Identify the most prominent light verb construction with "in ... setzen".
+#' # Note that, currently, the use of focus function disallows exactFrequencies.
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
+#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
+#' }
+#'
+#' @export
+setMethod("collocationAnalysis", "KorAPConnection",
+ function(kco,
+ node,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ topCollocatesLimit = 200,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
+ exactFrequencies = TRUE,
+ stopwords = RKorAPClient::synsemanticStopwords(),
+ seed = 7,
+ expand = length(vc) != length(node),
+ ...) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ word <- frequency <- NULL
+
+ if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
+ stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
+ }
+
+ if (lemmatizeNodeQuery) {
+ node <- lemmatizeWordQuery(node)
+ }
+
+ if (length(node) > 1 || length(vc) > 1) {
+ grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
+ purrr::pmap(grid, function(node, vc, ...)
+ collocationAnalysis(kco,
+ node =node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ topCollocatesLimit = topCollocatesLimit,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ exactFrequencies = exactFrequencies,
+ stopwords = stopwords,
+ seed = seed,
+ expand = expand,
+ ...) ) %>%
+ bind_rows()
+ } else {
+ set.seed(seed)
+ candidates <- collocatesQuery(
+ kco,
+ node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords,
+ ...
+ )
+
+ if (nrow(candidates) > 0) {
+ candidates <- candidates %>%
+ filter(frequency >= minOccur) %>%
+ head(topCollocatesLimit)
+ collocationScoreQuery(
+ kco,
+ node = node,
+ collocate = candidates$word,
+ vc = vc,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ observed = if (exactFrequencies) NA else candidates$frequency,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ ...
+ ) %>%
+ filter(.$O >= minOccur) %>%
+ dplyr::arrange(dplyr::desc(logDice))
+ } else {
+ tibble()
+ }
+ }
+ }
+)
+
+#' @importFrom magrittr debug_pipe
+#' @importFrom stringr str_match str_split str_detect
+#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
+#'
+snippet2FreqTable <- function(snippet,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+ tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|")",
+ oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
+ verbose = TRUE) {
+ word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ frequency <- NULL
+
+ if (length(snippet) < 1) {
+ dplyr::tibble(word=c(), frequency=c())
+ } else if (length(snippet) > 1) {
+ log.info(verbose, paste("Joinging", length(snippet), "kwics"))
+ for (s in snippet) {
+ oldTable <- snippet2FreqTable(
+ s,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ oldTable = oldTable,
+ stopwords = stopwords
+ )
+ }
+ log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens"))
+ oldTable %>%
+ group_by(word) %>%
+ mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
+ summarise(frequency=sum(frequency), .groups = "drop") %>%
+ arrange(desc(frequency))
+ } else {
+ stopwordsTable <- dplyr::tibble(word=stopwords)
+ match <-
+ str_match(
+ snippet,
+ '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
+ )
+
+ left <- if(leftContextSize > 0)
+ tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
+ else
+ ""
+# cat(paste("left:", left, "\n", collapse=" "))
+
+ right <- if(rightContextSize > 0)
+ head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
+ else
+ ""
+# cat(paste("right:", right, "\n", collapse=" "))
+
+ if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
+ oldTable
+ } else {
+ table(c(left, right)) %>%
+ dplyr::as_tibble(.name_repair = "minimal") %>%
+ dplyr::rename(word = 1, frequency = 2) %>%
+ dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
+ dplyr::anti_join(stopwordsTable, by="word") %>%
+ dplyr::bind_rows(oldTable)
+ }
+ }
+}
+
+#' Preliminary synsemantic stopwords function
+#'
+#' @description
+#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+#'
+#' Preliminary synsemantic stopwords function to be used in collocation analysis.
+#'
+#' @details
+#' Currently only suitable for German. See stopwords package for other languages.
+#'
+#' @param ... future arguments for language detection
+#'
+#' @family collocation analysis functions
+#' @return Vector of synsemantic stopwords.
+#' @export
+synsemanticStopwords <- function(...) {
+ res <- c(
+ "der",
+ "die",
+ "und",
+ "in",
+ "den",
+ "von",
+ "mit",
+ "das",
+ "zu",
+ "im",
+ "ist",
+ "auf",
+ "sich",
+ "Die",
+ "des",
+ "dem",
+ "nicht",
+ "ein",
+ "eine",
+ "es",
+ "auch",
+ "an",
+ "als",
+ "am",
+ "aus",
+ "Der",
+ "bei",
+ "er",
+ "dass",
+ "sie",
+ "nach",
+ "um",
+ "Das",
+ "zum",
+ "noch",
+ "war",
+ "einen",
+ "einer",
+ "wie",
+ "einem",
+ "vor",
+ "bis",
+ "\u00fcber",
+ "so",
+ "aber",
+ "Eine",
+ "diese",
+ "Diese",
+ "oder"
+ )
+ return(res)
+}
+
+collocatesQuery <-
+ function(kco,
+ query,
+ vc = "",
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+ ...) {
+ frequency <- NULL
+ q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
+ if(q@totalResults == 0) {
+ tibble(word=c(), frequency=c())
+ } else {
+ q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
+ snippet2FreqTable((q@collectedMatches)$snippet,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords,
+ verbose = kco@verbose) %>%
+ mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
+ filter(frequency >= minOccur)
+ }
+ }
+
+
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
new file mode 100644
index 0000000..8ec9bf2
--- /dev/null
+++ b/R/collocationScoreQuery.R
@@ -0,0 +1,175 @@
+setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
+
+## quiets concerns of R CMD check re: the .'s that appear in pipelines
+if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
+
+
+#' Query frequencies of a node and a collocate and calculate collocation association scores
+#'
+#' @aliases collocationScoreQuery
+#'
+#' @description
+#' Computes various collocation association scores
+#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
+#'
+#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
+#' @param node target word
+#' @param collocate collocate of target word
+#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
+#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
+#' @param leftContextSize size of the left context window
+#' @param rightContextSize size of the right context window
+#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
+#' @param smoothingConstant smoothing constant will be added to all observed values
+#' @param observed if collocation frequencies are already known (or estimated from a sample) they can be passed as a vector here, otherwise: NA
+#' @param ignoreCollocateCase logical, set to TRUE if collocate case should be ignored
+#' @param withinSpan KorAP span specification for collocations to be searched within
+#'
+#' @return tibble with query KorAP web request URL, all observed values and association scores
+#'
+#' @family collocation analysis functions
+#'
+#' @examples
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", "triftiger")
+#' }
+#'
+#' \donttest{
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Grund", c("guter", "triftiger"),
+#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+#' }
+#'
+#' \donttest{
+#' library(highcharter)
+#' library(tidyr)
+#' new("KorAPConnection", verbose = TRUE) %>%
+#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
+#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
+#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
+#' hc_add_onclick_korap_search()
+#' }
+#'
+#' @importFrom tidyr pivot_longer
+#' @export
+setMethod("collocationScoreQuery", "KorAPConnection",
+ function(kco,
+ node,
+ collocate,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ lemmatizeCollocateQuery = FALSE,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = .5,
+ observed = NA,
+ ignoreCollocateCase = FALSE,
+ withinSpan = "base/s=s"
+ ) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ O1 <- O2 <- O <- N <- E <- w <- 0
+
+ query <- buildCollocationQuery(node,
+ collocate,
+ lemmatizeNodeQuery,
+ lemmatizeCollocateQuery,
+ leftContextSize,
+ rightContextSize,
+ ignoreCollocateCase,
+ withinSpan)
+
+ tibble(
+ node = node,
+ collocate = collocate,
+ label = queryStringToLabel(vc),
+ vc = vc,
+ webUIRequestUrl = if (is.na(observed[1]))
+ frequencyQuery(kco, query, vc)$webUIRequestUrl
+ else
+ buildWebUIRequestUrl(
+ kco,
+ buildCollocationQuery(
+ node,
+ collocate,
+ lemmatizeNodeQuery,
+ lemmatizeCollocateQuery,
+ leftContextSize,
+ rightContextSize,
+ ignoreCollocateCase,
+ withinSpan
+ ),
+ vc
+ ),
+ w = leftContextSize + rightContextSize,
+ leftContextSize,
+ rightContextSize,
+ N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+ O = as.double( if(is.na(observed[1])) frequencyQuery(kco, query, vc)$totalResults else observed) + smoothingConstant,
+ O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
+ O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+ E = w * as.double(O1) * O2 / N
+ ) %>%
+ mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
+
+ })
+
+buildCollocationQuery <- function( node,
+ collocate,
+ lemmatizeNodeQuery = FALSE,
+ lemmatizeCollocateQuery = FALSE,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ ignoreCollocateCase = FALSE,
+ withinSpan = "base/s=s"
+) {
+ if (leftContextSize <= 0 && rightContextSize <= 0) {
+ stop(sprintf("At least one of leftContextSize (=%d) and rightContextSize (=%d) must be > 0", leftContextSize, rightContextSize),
+ call. = FALSE)
+ }
+
+ if (lemmatizeNodeQuery) {
+ node <- lemmatizeWordQuery(node)
+ }
+
+ if (ignoreCollocateCase) {
+ collocate <- ignoreCollocateCaseWordQuery(collocate)
+ }
+
+ if (lemmatizeCollocateQuery) {
+ collocate <- lemmatizeWordQuery(collocate)
+ }
+
+ query <- ""
+
+ if (leftContextSize > 0) {
+ query <-
+ paste0(collocate,
+ if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
+ node,
+ if (rightContextSize > 0) " | ")
+ }
+
+ if (rightContextSize > 0) {
+ query <-
+ paste0(query, node,
+ if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
+ }
+
+ if(!is.null(withinSpan) && !is.na(withinSpan) && nchar(withinSpan) > 0) {
+ query <- sprintf("contains(<%s>, (%s))", withinSpan, query)
+ }
+
+ query
+}
+
+ignoreCollocateCaseWordQuery <- function(w) {
+ paste0(w, '/i')
+}
+
+lemmatizeWordQuery <- function(w) {
+ paste0('[tt/l=', w, ']')
+}
diff --git a/Readme.md b/Readme.md
index 18c9dfb..1e5918c 100644
--- a/Readme.md
+++ b/Readme.md
@@ -83,6 +83,42 @@
```
[![Proportion of "ergibt … Sinn" versus "macht … Sinn" between 1980 and 2010 in newspapers and magazines](man/figures/Readme-Example-2.png)<!-- -->](https://korap.github.io/RKorAPClient/man/figures/Readme-Example-2.html)
+
+### Identify *in … setzen* light verb constructions by using the new `collocationAnalysis` function
+[![Lifecycle:experimental](https://lifecycle.r-lib.org/articles/figures/lifecycle-experimental.svg)](https://www.tidyverse.org/lifecycle/#experimental)
+
+```r
+library(RKorAPClient)
+library(knitr)
+new("KorAPConnection", verbose = TRUE) %>%
+ collocationAnalysis(
+ "focus(in [tt/p=NN] {[tt/l=setzen]})",
+ leftContextSize = 1,
+ rightContextSize = 0,
+ exactFrequencies = FALSE,
+ searchHitsSampleLimit = 1000,
+ topCollocatesLimit = 20
+ ) %>%
+ mutate(LVC = sprintf("[in %s setzen](%s)", collocate, webUIRequestUrl)) %>%
+ select(LVC, logDice, pmi, ll) %>%
+ head(10) %>%
+ kable(format="pipe", digits=2)
+```
+
+|LVC | logDice| pmi| ll|
+|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------:|-----:|---------:|
+|[in Szene setzen](https://korap.ids-mannheim.de/?q=Szene%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 9.66| 10.86| 465467.52|
+|[in Gang setzen](https://korap.ids-mannheim.de/?q=Gang%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 9.21| 10.57| 256146.92|
+|[in Verbindung setzen](https://korap.ids-mannheim.de/?q=Verbindung%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 8.46| 9.62| 189682.19|
+|[in Kenntnis setzen](https://korap.ids-mannheim.de/?q=Kenntnis%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 8.28| 9.81| 101112.02|
+|[in Bewegung setzen](https://korap.ids-mannheim.de/?q=Bewegung%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 8.11| 9.24| 149397.91|
+|[in Brand setzen](https://korap.ids-mannheim.de/?q=Brand%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 8.10| 9.33| 122427.05|
+|[in Anführungszeichen setzen](https://korap.ids-mannheim.de/?q=Anf%c3%bchrungszeichen%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 7.50| 11.96| 33959.99|
+|[in Kraft setzen](https://korap.ids-mannheim.de/?q=Kraft%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 6.88| 7.88| 77796.85|
+|[in Marsch setzen](https://korap.ids-mannheim.de/?q=Marsch%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 6.87| 9.27| 22041.63|
+|[in Klammern setzen](https://korap.ids-mannheim.de/?q=Klammern%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 6.55| 10.08| 15643.27|
+
+
## Demos
More elaborate R scripts demonstrating the use of the package can be found in the [demo](demo) folder.
diff --git a/demo/00Index b/demo/00Index
index 7ff8cf1..c392b70 100644
--- a/demo/00Index
+++ b/demo/00Index
@@ -1,11 +1,11 @@
-frequenciesOverTime Plot frequency of query expressions over time
-frequenciesOverDomains Box plot frequency of query expressions per topic domain
-conditionsOverTime Plot frequency of query expressions over time under different conditions
-alternativesOverTime Plot proportion of alternative spellings/variants over time
-regional Map plot regional frequencies of query expression
-mosaicplot Visualize frequencies of alternative query terms in relation to other variables
-highcharter-example Visualize frqequencies of optionally alternative terms over time with interactive HTML and JavaScript elements using the package highcharter as wrapper for Highcharts
-shiny-frequency-curves Web application that plots frequency curves with highcharts and shiny
-writtenVsSpoken Compare frequencies in written vs. spoken corpora
-displayKwics Display query results as KWICs via html
-
+frequenciesOverTime Plot frequency of query expressions over time
+frequenciesOverDomains Box plot frequency of query expressions per topic domain
+conditionsOverTime Plot frequency of query expressions over time under different conditions
+alternativesOverTime Plot proportion of alternative spellings/variants over time
+regional Map plot regional frequencies of query expression
+mosaicplot Visualize frequencies of alternative query terms in relation to other variables
+shiny-frequency-curves Web application that plots frequency curves with highcharts and shiny
+writtenVsSpoken Compare frequencies in written vs. spoken corpora
+displayKwics Display query results as KWICs via html
+light-verb-construction-ca Collocation analysis to identify light verb constructions matching the pattern "in NN setzen", with result rendered as HTML DataTable
+highcharter-example Visualize frequencies of optionally alternative terms over time with interactive HTML and JavaScript elements using the package highcharter as wrapper for Highcharts
diff --git a/demo/light-verb-construction-ca.R b/demo/light-verb-construction-ca.R
new file mode 100644
index 0000000..c8a323e
--- /dev/null
+++ b/demo/light-verb-construction-ca.R
@@ -0,0 +1,20 @@
+library(RKorAPClient)
+library(knitr)
+new("KorAPConnection", verbose = TRUE) %>%
+ collocationAnalysis(
+ "focus(in [tt/p=NN] {[tt/l=setzen]})",
+ leftContextSize = 1,
+ rightContextSize = 0,
+ exactFrequencies = FALSE,
+ searchHitsSampleLimit = 1000,
+ topCollocatesLimit = 20
+ ) %>%
+ mutate(LVC = sprintf("[in %s setzen](%s)", collocate, webUIRequestUrl)) %>%
+ select(LVC, logDice, pmi, ll) %>%
+ head(10) %>%
+ kable(format="pipe", digits=2) %>%
+ cat(file="/tmp/in_setzen.md", sep="\n")
+
+#rmarkdown::render("/tmp/in_setzen.md")
+#browseURL("/tmp/in_setzen.html")
+
diff --git a/man/KorAPQuery-class.Rd b/man/KorAPQuery-class.Rd
index b3a0edf..e5cf68a 100644
--- a/man/KorAPQuery-class.Rd
+++ b/man/KorAPQuery-class.Rd
@@ -5,6 +5,8 @@
\alias{KorAPQuery-class}
\alias{KorAPQuery}
\alias{initialize,KorAPQuery-method}
+\alias{corpusQuery,KorAPConnection-method}
+\alias{corpusQuery}
\alias{fetchNext,KorAPQuery-method}
\alias{fetchNext}
\alias{fetchAll,KorAPQuery-method}
@@ -13,10 +15,9 @@
\alias{fetchRest}
\alias{frequencyQuery,KorAPConnection-method}
\alias{frequencyQuery}
+\alias{buildWebUIRequestUrl}
\alias{format.KorAPQuery}
\alias{show,KorAPQuery-method}
-\alias{collocationScoreQuery,KorAPConnection-method}
-\alias{collocationScoreQuery}
\title{Class KorAPQuery}
\usage{
\S4method{initialize}{KorAPQuery}(
@@ -35,16 +36,34 @@
collectedMatches = NULL
)
+\S4method{corpusQuery}{KorAPConnection}(
+ kco,
+ query = if (missing(KorAPUrl))
+ stop("At least one of the parameters query and KorAPUrl must be specified.", call. =
+ FALSE) else httr::parse_url(KorAPUrl)$query$q,
+ vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
+ KorAPUrl,
+ metadataOnly = TRUE,
+ ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
+ fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace", "availability",
+ "textClass", "snippet"),
+ accessRewriteFatal = TRUE,
+ verbose = kco@verbose,
+ expand = length(vc) != length(query),
+ as.df = FALSE
+)
+
\S4method{fetchNext}{KorAPQuery}(
kqo,
offset = kqo@nextStartIndex,
maxFetch = maxResultsPerPage,
- verbose = kqo@korapConnection@verbose
+ verbose = kqo@korapConnection@verbose,
+ randomizePageOrder = FALSE
)
-\S4method{fetchAll}{KorAPQuery}(kqo, verbose = kqo@korapConnection@verbose)
+\S4method{fetchAll}{KorAPQuery}(kqo, verbose = kqo@korapConnection@verbose, ...)
-\S4method{fetchRest}{KorAPQuery}(kqo, verbose = kqo@korapConnection@verbose)
+\S4method{fetchRest}{KorAPQuery}(kqo, verbose = kqo@korapConnection@verbose, ...)
\S4method{frequencyQuery}{KorAPConnection}(
kco,
@@ -55,22 +74,23 @@
...
)
+buildWebUIRequestUrl(
+ kco,
+ query = if (missing(KorAPUrl))
+ stop("At least one of the parameters query and KorAPUrl must be specified.", call. =
+ FALSE) else httr::parse_url(KorAPUrl)$query$q,
+ vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
+ KorAPUrl,
+ metadataOnly = TRUE,
+ ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
+ fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace", "availability",
+ "textClass", "snippet"),
+ accessRewriteFatal = TRUE
+)
+
\method{format}{KorAPQuery}(x, ...)
\S4method{show}{KorAPQuery}(object)
-
-\S4method{collocationScoreQuery}{KorAPConnection}(
- kco,
- node,
- collocate,
- vc = "",
- lemmatizeNodeQuery = FALSE,
- lemmatizeCollocateQuery = FALSE,
- leftContextSize = 5,
- rightContextSize = 5,
- scoreFunctions = defaultAssociationScoreFunctions(),
- smoothingConstant = 0.5
-)
}
\arguments{
\item{.Object}{…}
@@ -85,7 +105,7 @@
\item{nextStartIndex}{at what index to start the next fetch of query results}
-\item{fields}{what data / metadata fields should be collected}
+\item{fields}{(meta)data fields that will be fetched for every match.}
\item{requestUrl}{complete URL of the API request}
@@ -97,67 +117,95 @@
\item{collectedMatches}{matches already fetched from the KorAP-API-server}
+\item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}}
+
+\item{query}{string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.}
+
+\item{KorAPUrl}{instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.}
+
+\item{metadataOnly}{logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).}
+
+\item{ql}{string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.}
+
+\item{accessRewriteFatal}{abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).}
+
+\item{verbose}{print progress information if true}
+
+\item{expand}{logical that decides if \code{query} and \code{vc} parameters are expanded to all of their combinations}
+
+\item{as.df}{return result as data frame instead of as S4 object?}
+
\item{kqo}{object obtained from \code{\link{corpusQuery}}}
\item{offset}{start offset for query results to fetch}
\item{maxFetch}{maximum number of query results to fetch}
-\item{verbose}{print progress information if true}
+\item{randomizePageOrder}{fetch result pages in pseudo random order if true. Use \code{\link{set.seed}} to set seed for reproducible results.}
-\item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}}
-
-\item{query}{string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.}
+\item{...}{further arguments passed to or from other methods}
\item{conf.level}{confidence level of the returned confidence interval (passed through \code{\link{ci}} to \code{\link{prop.test}}).}
\item{as.alternatives}{LOGICAL that specifies if the query terms should be treated as alternatives. If \code{as.alternatives} is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.}
-\item{...}{further arguments passed to or from other methods}
-
\item{x}{KorAPQuery object}
\item{object}{KorAPQuery object}
-
-\item{node}{target word}
-
-\item{collocate}{collocate of target word}
-
-\item{lemmatizeNodeQuery}{logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]}
-
-\item{lemmatizeCollocateQuery}{logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]}
-
-\item{leftContextSize}{size of the left context window}
-
-\item{rightContextSize}{size of the right context window}
-
-\item{scoreFunctions}{named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}}
-
-\item{smoothingConstant}{smoothing constant will be added to all observed values}
}
\value{
-The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
+Depending on the \code{as.df} parameter, a table or a \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
+A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
+Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
-tibble with query KorAP web request URL, all observed values and association scores
+The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
}
\description{
This class provides methods to perform different kinds of queries on the KorAP API server.
\code{KorAPQuery} objects, which are typically created by the \code{\link{corpusQuery}} method,
represent the current state of a query to a KorAP server.
+\bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server
+
\bold{\code{fetchNext}} fetches the next bunch of results of a KorAP query.
-\bold{\code{fetchAll}} fetches allf results of a KorAP query.
+\bold{\code{fetchAll}} fetches all results of a KorAP query.
\bold{\code{frequencyQuery}} combines \code{\link{corpusQuery}}, \code{\link{corpusStats}} and
\code{\link{ci}} to compute a table with the relative frequencies and
confidence intervals of one ore multiple search terms across one or multiple
virtual corpora.
-
-\bold{\code{collocationScoreQuery}} computes various collocation association scores
-based on \code{\link{frequencyQuery}}s for a target word and a collocate.
}
\examples{
+# Fetch metadata of every query hit for "Ameisenplage" and show a summary
+\donttest{
+new("KorAPConnection") \%>\% corpusQuery("Ameisenplage") \%>\% fetchAll()
+}
+
+# Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
+# and show the number of query hits (but don't fetch them).
+
+new("KorAPConnection", verbose = TRUE) \%>\%
+ corpusQuery(KorAPUrl =
+ "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
+
+# Plot the time/frequency curve of "Ameisenplage"
+\donttest{
+new("KorAPConnection", verbose=TRUE) \%>\%
+ { . ->> kco } \%>\%
+ corpusQuery("Ameisenplage") \%>\%
+ fetchAll() \%>\%
+ slot("collectedMatches") \%>\%
+ mutate(year = lubridate::year(pubDate)) \%>\%
+ dplyr::select(year) \%>\%
+ group_by(year) \%>\%
+ summarise(Count = dplyr::n()) \%>\%
+ mutate(Freq = mapply(function(f, y)
+ f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) \%>\%
+ dplyr::select(-Count) \%>\%
+ complete(year = min(year):max(year), fill = list(Freq = 0)) \%>\%
+ plot(type = "l")
+}
\donttest{q <- new("KorAPConnection") \%>\% corpusQuery("Ameisenplage") \%>\% fetchNext()
q@collectedMatches
}
@@ -177,29 +225,12 @@
frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
}
-\donttest{
-new("KorAPConnection", verbose = TRUE) \%>\%
- collocationScoreQuery("Grund", "triftiger")
-}
-
-\donttest{
-new("KorAPConnection", verbose = TRUE) \%>\%
-collocationScoreQuery("Grund", c("guter", "triftiger"),
- scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
-}
-
-\donttest{
-library(highcharter)
-library(tidyr)
-new("KorAPConnection", verbose = TRUE) \%>\%
- collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
- lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
- pivot_longer(14:last_col(), names_to = "measure", values_to = "score") \%>\%
- hchart(type="spline", hcaes(label, score, group=measure)) \%>\%
- hc_add_onclick_korap_search()
-}
-
}
\references{
\url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
+
+\url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
+}
+\seealso{
+\code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
}
diff --git a/man/association-score-functions.Rd b/man/association-score-functions.Rd
index 7517561..fda531a 100644
--- a/man/association-score-functions.Rd
+++ b/man/association-score-functions.Rd
@@ -78,4 +78,11 @@
Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
Free PDF available from \url{http://purl.org/stefan.evert/PUB/Evert2004phd.pdf}
}
+\seealso{
+Other collocation analysis functions:
+\code{\link{collocationAnalysis,KorAPConnection-method}},
+\code{\link{collocationScoreQuery,KorAPConnection-method}},
+\code{\link{synsemanticStopwords}()}
+}
\concept{association-score-functions}
+\concept{collocation analysis functions}
diff --git a/man/collocationAnalysis-KorAPConnection-method.Rd b/man/collocationAnalysis-KorAPConnection-method.Rd
new file mode 100644
index 0000000..bcfe99e
--- /dev/null
+++ b/man/collocationAnalysis-KorAPConnection-method.Rd
@@ -0,0 +1,107 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/collocationAnalysis.R
+\name{collocationAnalysis,KorAPConnection-method}
+\alias{collocationAnalysis,KorAPConnection-method}
+\alias{collocationAnalysis}
+\title{Collocation analysis}
+\usage{
+\S4method{collocationAnalysis}{KorAPConnection}(
+ kco,
+ node,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ topCollocatesLimit = 200,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
+ exactFrequencies = TRUE,
+ stopwords = RKorAPClient::synsemanticStopwords(),
+ seed = 7,
+ expand = length(vc) != length(node),
+ ...
+)
+}
+\arguments{
+\item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}}
+
+\item{node}{target word}
+
+\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
+
+\item{lemmatizeNodeQuery}{if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{minOccur}{minimum absolute number of observed co-occurrences to consider a collocate candidate}
+
+\item{leftContextSize}{size of the left context window}
+
+\item{rightContextSize}{size of the right context window}
+
+\item{topCollocatesLimit}{limit analysis to the n most frequent collocates in the search hits sample}
+
+\item{searchHitsSampleLimit}{limit the size of the search hits sample}
+
+\item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored}
+
+\item{withinSpan}{KorAP span specification for collocations to be searched within}
+
+\item{exactFrequencies}{if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies}
+
+\item{stopwords}{vector of stopwords not to be considered as collocates}
+
+\item{seed}{seed for random page collecting order}
+
+\item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations}
+
+\item{...}{more arguments will be passed to \code{\link{collocationScoreQuery}}}
+}
+\value{
+Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
+}
+\description{
+\Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+
+Performs a collocation analysis for the given node (or query)
+in the given virtual corpus.
+}
+\details{
+The collocation analysis is currently implemented on the client side, as some of the
+functionality is not yet provided by the KorAP backend. Mainly for this reason
+it is very slow (several minutes, up to hours), but on the other hand very flexible.
+You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
+and look for expression-internal collocates using the focus function (see examples and demo).
+
+To increase speed at the cost of accuracy and possible false negatives,
+you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
+
+Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
+This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
+user interface.
+}
+\examples{
+\donttest{
+ # Find top collocates of "Packung" inside and outside the sports domain.
+ new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
+ leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) \%>\%
+ dplyr::filter(logDice >= 5)
+}
+
+\donttest{
+# Identify the most prominent light verb construction with "in ... setzen".
+# Note that, currently, the use of focus function disallows exactFrequencies.
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
+ leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
+}
+
+}
+\seealso{
+Other collocation analysis functions:
+\code{\link{association-score-functions}},
+\code{\link{collocationScoreQuery,KorAPConnection-method}},
+\code{\link{synsemanticStopwords}()}
+}
+\concept{collocation analysis functions}
diff --git a/man/collocationScoreQuery-KorAPConnection-method.Rd b/man/collocationScoreQuery-KorAPConnection-method.Rd
new file mode 100644
index 0000000..605caf4
--- /dev/null
+++ b/man/collocationScoreQuery-KorAPConnection-method.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/collocationScoreQuery.R
+\name{collocationScoreQuery,KorAPConnection-method}
+\alias{collocationScoreQuery,KorAPConnection-method}
+\alias{collocationScoreQuery}
+\title{Query frequencies of a node and a collocate and calculate collocation association scores}
+\usage{
+\S4method{collocationScoreQuery}{KorAPConnection}(
+ kco,
+ node,
+ collocate,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ lemmatizeCollocateQuery = FALSE,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = 0.5,
+ observed = NA,
+ ignoreCollocateCase = FALSE,
+ withinSpan = "base/s=s"
+)
+}
+\arguments{
+\item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}}
+
+\item{node}{target word}
+
+\item{collocate}{collocate of target word}
+
+\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
+
+\item{lemmatizeNodeQuery}{logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{lemmatizeCollocateQuery}{logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{leftContextSize}{size of the left context window}
+
+\item{rightContextSize}{size of the right context window}
+
+\item{scoreFunctions}{named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}}
+
+\item{smoothingConstant}{smoothing constant will be added to all observed values}
+
+\item{observed}{if collocation frequencies are already known (or estimated from a sample) they can be passed as a vector here, otherwise: NA}
+
+\item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored}
+
+\item{withinSpan}{KorAP span specification for collocations to be searched within}
+}
+\value{
+tibble with query KorAP web request URL, all observed values and association scores
+}
+\description{
+Computes various collocation association scores
+based on \code{\link{frequencyQuery}}s for a target word and a collocate.
+}
+\examples{
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Grund", "triftiger")
+}
+
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+collocationScoreQuery("Grund", c("guter", "triftiger"),
+ scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+}
+
+\donttest{
+library(highcharter)
+library(tidyr)
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+ lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
+ pivot_longer(14:last_col(), names_to = "measure", values_to = "score") \%>\%
+ hchart(type="spline", hcaes(label, score, group=measure)) \%>\%
+ hc_add_onclick_korap_search()
+}
+
+}
+\seealso{
+Other collocation analysis functions:
+\code{\link{association-score-functions}},
+\code{\link{collocationAnalysis,KorAPConnection-method}},
+\code{\link{synsemanticStopwords}()}
+}
+\concept{collocation analysis functions}
diff --git a/man/corpusQuery-KorAPConnection-method.Rd b/man/corpusQuery-KorAPConnection-method.Rd
deleted file mode 100644
index 0170ba3..0000000
--- a/man/corpusQuery-KorAPConnection-method.Rd
+++ /dev/null
@@ -1,92 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/KorAPQuery.R
-\name{corpusQuery,KorAPConnection-method}
-\alias{corpusQuery,KorAPConnection-method}
-\alias{corpusQuery}
-\title{\bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server}
-\usage{
-\S4method{corpusQuery}{KorAPConnection}(
- kco,
- query = if (missing(KorAPUrl))
- stop("At least one of the parameters query and KorAPUrl must be specified.", call. =
- FALSE) else httr::parse_url(KorAPUrl)$query$q,
- vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
- KorAPUrl,
- metadataOnly = TRUE,
- ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
- fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace", "availability",
- "textClass", "snippet"),
- accessRewriteFatal = TRUE,
- verbose = kco@verbose,
- expand = length(vc) != length(query),
- as.df = FALSE
-)
-}
-\arguments{
-\item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}}
-
-\item{query}{string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.}
-
-\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
-
-\item{KorAPUrl}{instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in \code{KorAPConnection}) to provide all necessary information for the query.}
-
-\item{metadataOnly}{logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE, unless the connection is authorized (currently not possible).}
-
-\item{ql}{string to choose the query language (see \href{https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters}{section on Query Parameters} in the Kustvakt-Wiki for possible values.}
-
-\item{fields}{(meta)data fields that will be fetched for every match.}
-
-\item{accessRewriteFatal}{abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).}
-
-\item{verbose}{print some info}
-
-\item{expand}{logical that decides if \code{query} and \code{vc} parameters are expanded to all of their combinations}
-
-\item{as.df}{return result as data frame instead of as S4 object?}
-}
-\value{
-Depending on the \code{as.df} parameter, a table or a \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
-A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
-Please make sure to check \code{$collection$rewrites} to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
-}
-\description{
-\bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server
-}
-\examples{
-# Fetch metadata of every query hit for "Ameisenplage" and show a summary
-\donttest{
-new("KorAPConnection") \%>\% corpusQuery("Ameisenplage") \%>\% fetchAll()
-}
-
-# Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
-# and show the number of query hits (but don't fetch them).
-
-new("KorAPConnection", verbose = TRUE) \%>\%
- corpusQuery(KorAPUrl =
- "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp")
-
-# Plot the time/frequency curve of "Ameisenplage"
-\donttest{
-new("KorAPConnection", verbose=TRUE) \%>\%
- { . ->> kco } \%>\%
- corpusQuery("Ameisenplage") \%>\%
- fetchAll() \%>\%
- slot("collectedMatches") \%>\%
- mutate(year = lubridate::year(pubDate)) \%>\%
- dplyr::select(year) \%>\%
- group_by(year) \%>\%
- summarise(Count = dplyr::n()) \%>\%
- mutate(Freq = mapply(function(f, y)
- f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) \%>\%
- dplyr::select(-Count) \%>\%
- complete(year = min(year):max(year), fill = list(Freq = 0)) \%>\%
- plot(type = "l")
-}
-}
-\references{
-\url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
-}
-\seealso{
-\code{\link{KorAPConnection}}, \code{\link{fetchNext}}, \code{\link{fetchRest}}, \code{\link{fetchAll}}, \code{\link{corpusStats}}
-}
diff --git a/man/figures/lifecycle-experimental.svg b/man/figures/lifecycle-experimental.svg
new file mode 100644
index 0000000..d1d060e
--- /dev/null
+++ b/man/figures/lifecycle-experimental.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="136" height="20"><linearGradient id="b" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="a"><rect width="136" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#a)"><path fill="#555" d="M0 0h53v20H0z"/><path fill="#fe7d37" d="M53 0h83v20H53z"/><path fill="url(#b)" d="M0 0h136v20H0z"/></g><g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="110"><text x="275" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="430">lifecycle</text><text x="275" y="140" transform="scale(.1)" textLength="430">lifecycle</text><text x="935" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="730">experimental</text><text x="935" y="140" transform="scale(.1)" textLength="730">experimental</text></g> </svg>
\ No newline at end of file
diff --git a/man/synsemanticStopwords.Rd b/man/synsemanticStopwords.Rd
new file mode 100644
index 0000000..54b82ad
--- /dev/null
+++ b/man/synsemanticStopwords.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/collocationAnalysis.R
+\name{synsemanticStopwords}
+\alias{synsemanticStopwords}
+\title{Preliminary synsemantic stopwords function}
+\usage{
+synsemanticStopwords(...)
+}
+\arguments{
+\item{...}{future arguments for language detection}
+}
+\value{
+Vector of synsemantic stopwords.
+}
+\description{
+\Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
+
+Preliminary synsemantic stopwords function to be used in collocation analysis.
+}
+\details{
+Currently only suitable for German. See stopwords package for other languages.
+}
+\seealso{
+Other collocation analysis functions:
+\code{\link{association-score-functions}},
+\code{\link{collocationAnalysis,KorAPConnection-method}},
+\code{\link{collocationScoreQuery,KorAPConnection-method}}
+}
+\concept{collocation analysis functions}
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
new file mode 100644
index 0000000..0c17144
--- /dev/null
+++ b/tests/testthat/test-collocations.R
@@ -0,0 +1,23 @@
+test_that("collocationScoreQuery works", {
+ kco <- new("KorAPConnection", cache = TRUE, verbose = TRUE)
+ df <- collocationScoreQuery(kco, "Ameisenplage", "heimgesucht", leftContextSize=0, rightContextSize=1)
+ expect_gt(df$logDice, 1)
+ expect_equal(df$ll, ll(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$pmi, pmi(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$mi2, mi2(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$mi3, mi3(df$O1, df$O2, df$O, df$N, df$E, df$w))
+ expect_equal(df$logDice, logDice(df$O1, df$O2, df$O, df$N, df$E, df$w))
+})
+
+test_that("collocationAnalysis works", {
+ kco <- new("KorAPConnection", cache = TRUE, verbose = TRUE)
+ df <- collocationAnalysis(kco, "Ameisenplage", leftContextSize=0, rightContextSize=1, topCollocatesLimit=1, exactFrequencies=FALSE)
+ expect_gt(df$O, df$E)
+ expect_gt(df$logDice, 1)
+})
+
+test_that("collocationAnalysis on unaccounted strings does not error out", {
+ kco <- new("KorAPConnection", cache = TRUE, verbose = TRUE)
+ df <- collocationAnalysis(kco, "XXXXXXXXAmeisenplage")
+ testthat::expect_equal(nrow(df), 0)
+})