| % Generated by roxygen2: do not edit by hand |
| % Please edit documentation in R/collocationAnalysis.R |
| \name{collocationAnalysis,KorAPConnection-method} |
| \alias{collocationAnalysis,KorAPConnection-method} |
| \alias{collocationAnalysis} |
| \title{Collocation analysis} |
| \usage{ |
| \S4method{collocationAnalysis}{KorAPConnection}( |
| kco, |
| node, |
| vc = "", |
| lemmatizeNodeQuery = FALSE, |
| minOccur = 5, |
| leftContextSize = 5, |
| rightContextSize = 5, |
| topCollocatesLimit = 200, |
| searchHitsSampleLimit = 20000, |
| ignoreCollocateCase = FALSE, |
| withinSpan = ifelse(exactFrequencies, "base/s=s", ""), |
| exactFrequencies = TRUE, |
| stopwords = append(RKorAPClient::synsemanticStopwords(), node), |
| seed = 7, |
| expand = length(vc) != length(node), |
| maxRecurse = 0, |
| addExamples = FALSE, |
| thresholdScore = "logDice", |
| threshold = 2, |
| localStopwords = c(), |
| collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$", |
| ... |
| ) |
| } |
| \arguments{ |
\item{kco}{\code{\link[=KorAPConnection]{KorAPConnection()}} object (obtained e.g. from \code{new("KorAPConnection")})}
| |
| \item{node}{target word} |
| |
| \item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.} |
| |
\item{lemmatizeNodeQuery}{if TRUE, the node query will be lemmatized, i.e. \verb{x -> [tt/l=x]}}
| |
| \item{minOccur}{minimum absolute number of observed co-occurrences to consider a collocate candidate} |
| |
| \item{leftContextSize}{size of the left context window} |
| |
| \item{rightContextSize}{size of the right context window} |
| |
| \item{topCollocatesLimit}{limit analysis to the n most frequent collocates in the search hits sample} |
| |
| \item{searchHitsSampleLimit}{limit the size of the search hits sample} |
| |
| \item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored} |
| |
\item{withinSpan}{KorAP span specification (see \url{https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans}) for collocations to be searched within. Defaults to \code{base/s=s} if \code{exactFrequencies} is \code{TRUE}, and to the empty string otherwise.}
| |
\item{exactFrequencies}{if FALSE, extrapolate observed co-occurrence frequencies from the frequencies in the search hits sample; otherwise retrieve exact co-occurrence frequencies}
| |
| \item{stopwords}{vector of stopwords not to be considered as collocates} |
| |
\item{seed}{random seed for the order in which search result pages are collected}
| |
| \item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations} |
| |
\item{maxRecurse}{apply collocation analysis recursively \code{maxRecurse} times (see the last example)}
| |
\item{addExamples}{if TRUE, examples of collocation instances will be added in a column \code{example}. This makes a difference in particular if \code{node} is given as a lemma query.}
| |
| \item{thresholdScore}{association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls} |
| |
\item{threshold}{minimum value of the \code{thresholdScore} function for a collocate to be analysed recursively}
| |
\item{localStopwords}{vector of stopwords that will not be considered as collocates in the current function call, but that, unlike \code{stopwords}, will not be passed on to recursive calls}
| |
| \item{collocateFilterRegex}{allow only collocates matching the regular expression} |
| |
| \item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}} |
| } |
| \value{ |
A tibble with the top collocates, their association scores, and corresponding URLs for web user interface queries, among other columns.
| } |
| \description{ |
| \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} |
| |
| Performs a collocation analysis for the given node (or query) |
| in the given virtual corpus. |
| } |
| \details{ |
The collocation analysis is currently implemented on the client side, as some of the
required functionality is not yet provided by the KorAP backend. Mainly for this reason
it is very slow (several minutes, up to hours), but in return very flexible.
You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
and look for expression-internal collocates using the focus function (see examples and demo).
| |
To increase speed at the cost of accuracy and possible false negatives,
you can decrease \code{searchHitsSampleLimit} or \code{topCollocatesLimit}, or set \code{exactFrequencies} to \code{FALSE}.
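
For example, a minimal sketch of such a faster, less exhaustive run (with illustrative,
untested parameter values) might look like this:

\preformatted{
kco <- new("KorAPConnection", verbose = TRUE)
collocationAnalysis(kco, "Packung",
  leftContextSize = 1, rightContextSize = 1,
  searchHitsSampleLimit = 1000, topCollocatesLimit = 10,
  exactFrequencies = FALSE)
}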
| |
Note that the tokenization currently used is not the one provided by the backend, i.e. by the
corpus itself, but an improvised one. This can also lead to false negatives and to frequencies
that differ from the corresponding ones obtained via the web user interface.
| } |
| \examples{ |
| \dontrun{ |
| |
| # Find top collocates of "Packung" inside and outside the sports domain. |
| new("KorAPConnection", verbose = TRUE) \%>\% |
| collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"), |
| leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) \%>\% |
| dplyr::filter(logDice >= 5) |
| } |
| |
| \dontrun{ |
| |
| # Identify the most prominent light verb construction with "in ... setzen". |
# Note that, currently, the use of the focus function requires exactFrequencies = FALSE.
| new("KorAPConnection", verbose = TRUE) \%>\% |
| collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})", |
| leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20) |
| } |
| |
| } |
| \seealso{ |
| Other collocation analysis functions: |
| \code{\link{association-score-functions}}, |
| \code{\link{collocationScoreQuery,KorAPConnection-method}}, |
| \code{\link{synsemanticStopwords}()} |
| } |
| \concept{collocation analysis functions} |