Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 1 | % Generated by roxygen2: do not edit by hand |
| 2 | % Please edit documentation in R/collocationAnalysis.R |
| 3 | \name{collocationAnalysis,KorAPConnection-method} |
| 4 | \alias{collocationAnalysis,KorAPConnection-method} |
| 5 | \alias{collocationAnalysis} |
| 6 | \title{Collocation analysis} |
| 7 | \usage{ |
| 8 | \S4method{collocationAnalysis}{KorAPConnection}( |
| 9 | kco, |
| 10 | node, |
| 11 | vc = "", |
| 12 | lemmatizeNodeQuery = FALSE, |
| 13 | minOccur = 5, |
| 14 | leftContextSize = 5, |
| 15 | rightContextSize = 5, |
| 16 | topCollocatesLimit = 200, |
| 17 | searchHitsSampleLimit = 20000, |
| 18 | ignoreCollocateCase = FALSE, |
| 19 | withinSpan = ifelse(exactFrequencies, "base/s=s", ""), |
| 20 | exactFrequencies = TRUE, |
| 21 | stopwords = RKorAPClient::synsemanticStopwords(), |
| 22 | seed = 7, |
| 23 | expand = length(vc) != length(node), |
| 24 | ... |
| 25 | ) |
| 26 | } |
| 27 | \arguments{ |
| 28 | \item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")})} |
| 29 | |
| 30 | \item{node}{target word} |
| 31 | |
| 32 | \item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.} |
| 33 | |
| 34 | \item{lemmatizeNodeQuery}{if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]} |
| 35 | |
| 36 | \item{minOccur}{minimum absolute number of observed co-occurrences to consider a collocate candidate} |
| 37 | |
| 38 | \item{leftContextSize}{size of the left context window} |
| 39 | |
| 40 | \item{rightContextSize}{size of the right context window} |
| 41 | |
| 42 | \item{topCollocatesLimit}{limit analysis to the n most frequent collocates in the search hits sample} |
| 43 | |
| 44 | \item{searchHitsSampleLimit}{limit the size of the search hits sample} |
| 45 | |
| 46 | \item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored} |
| 47 | |
| 48 | \item{withinSpan}{KorAP span specification for collocations to be searched within} |
| 49 | |
| 50 | \item{exactFrequencies}{if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies} |
| 51 | |
| 52 | \item{stopwords}{vector of stopwords not to be considered as collocates} |
| 53 | |
| 54 | \item{seed}{seed for random page collecting order} |
| 55 | |
| 56 | \item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations} |
| 57 | |
| 58 | \item{...}{more arguments will be passed to \code{\link{collocationScoreQuery}}} |
| 59 | } |
| 60 | \value{ |
| 61 | Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc. |
| 62 | } |
| 63 | \description{ |
| 64 | \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")} |
| 65 | |
| 66 | Performs a collocation analysis for the given node (or query) |
| 67 | in the given virtual corpus. |
| 68 | } |
| 69 | \details{ |
| 70 | The collocation analysis is currently implemented on the client side, as some of the |
| 71 | functionality is not yet provided by the KorAP backend. Mainly for this reason |
| 72 | it is very slow (several minutes, up to hours), but on the other hand very flexible. |
| 73 | You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, |
| 74 | and look for expression-internal collocates using the focus function (see examples and demo). |
| 75 | |
| 76 | To increase speed at the cost of accuracy and possible false negatives, |
| 77 | you can decrease \code{searchHitsSampleLimit} and/or \code{topCollocatesLimit} and/or set \code{exactFrequencies} to \code{FALSE}. |
| 78 | |
| 79 | Note that the analysis currently does not use the tokenization provided by the backend (i.e. that of the corpus itself), but a tinkered one. |
| 80 | This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web |
| 81 | user interface. |
| 82 | } |
| 83 | \examples{ |
| 84 | \donttest{ |
| 85 | # Find top collocates of "Packung" inside and outside the sports domain. |
| 86 | new("KorAPConnection", verbose = TRUE) \%>\% |
| 87 | collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"), |
| 88 | leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) \%>\% |
| 89 | dplyr::filter(logDice >= 5) |
| 90 | } |
| 91 | |
| 92 | \donttest{ |
| 93 | # Identify the most prominent light verb construction with "in ... setzen". |
| 94 | # Note that, currently, the use of the focus function disallows exactFrequencies. |
| 95 | new("KorAPConnection", verbose = TRUE) \%>\% |
| 96 | collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})", |
| 97 | leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20) |
| 98 | } |
| 99 | |
| 100 | } |
| 101 | \seealso{ |
| 102 | Other collocation analysis functions: |
| 103 | \code{\link{association-score-functions}}, |
| 104 | \code{\link{collocationScoreQuery,KorAPConnection-method}}, |
| 105 | \code{\link{synsemanticStopwords}()} |
| 106 | } |
| 107 | \concept{collocation analysis functions} |