Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 1 | % Generated by roxygen2: do not edit by hand |
| 2 | % Please edit documentation in R/collocationAnalysis.R |
| 3 | \name{collocationAnalysis,KorAPConnection-method} |
| 4 | \alias{collocationAnalysis,KorAPConnection-method} |
| 5 | \alias{collocationAnalysis} |
| 6 | \title{Collocation analysis} |
| 7 | \usage{ |
| 8 | \S4method{collocationAnalysis}{KorAPConnection}( |
| 9 | kco, |
| 10 | node, |
| 11 | vc = "", |
| 12 | lemmatizeNodeQuery = FALSE, |
| 13 | minOccur = 5, |
| 14 | leftContextSize = 5, |
| 15 | rightContextSize = 5, |
| 16 | topCollocatesLimit = 200, |
| 17 | searchHitsSampleLimit = 20000, |
| 18 | ignoreCollocateCase = FALSE, |
| 19 | withinSpan = ifelse(exactFrequencies, "base/s=s", ""), |
| 20 | exactFrequencies = TRUE, |
| 21 | stopwords = RKorAPClient::synsemanticStopwords(), |
| 22 | seed = 7, |
| 23 | expand = length(vc) != length(node), |
Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame^] | 24 | maxRecurse = 0, |
| 25 | addExamples = TRUE, |
| 26 | localStopwords = c(), |
Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 27 | ... |
| 28 | ) |
| 29 | } |
| 30 | \arguments{ |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 31 | \item{kco}{\code{\link[=KorAPConnection]{KorAPConnection()}} object (obtained e.g. from \code{new("KorAPConnection")})} |
Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 32 | |
| 33 | \item{node}{target word} |
| 34 | |
| 35 | \item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.} |
| 36 | |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 37 | \item{lemmatizeNodeQuery}{if TRUE, node query will be lemmatized, i.e. \verb{x -> [tt/l=x]}} |
Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 38 | |
| 39 | \item{minOccur}{minimum absolute number of observed co-occurrences to consider a collocate candidate} |
| 40 | |
| 41 | \item{leftContextSize}{size of the left context window} |
| 42 | |
| 43 | \item{rightContextSize}{size of the right context window} |
| 44 | |
| 45 | \item{topCollocatesLimit}{limit analysis to the n most frequent collocates in the search hits sample} |
| 46 | |
| 47 | \item{searchHitsSampleLimit}{limit the size of the search hits sample} |
| 48 | |
| 49 | \item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored} |
| 50 | |
| 51 | \item{withinSpan}{KorAP span specification for collocations to be searched within} |
| 52 | |
| 53 | \item{exactFrequencies}{if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies} |
| 54 | |
| 55 | \item{stopwords}{vector of stopwords not to be considered as collocates} |
| 56 | |
| 57 | \item{seed}{seed for random page collecting order} |
| 58 | |
| 59 | \item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations} |
| 60 | |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 61 | \item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}} |
Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 62 | } |
| 63 | \value{ |
| 64 | Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc. |
| 65 | } |
| 66 | \description{ |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 67 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} |
Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 68 | |
| 69 | Performs a collocation analysis for the given node (or query) |
| 70 | in the given virtual corpus. |
| 71 | } |
| 72 | \details{ |
| 73 | The collocation analysis is currently implemented on the client side, as some of the |
| 74 | functionality is not yet provided by the KorAP backend. Mainly for this reason |
| 75 | it is very slow (several minutes, up to hours), but on the other hand very flexible. |
| 76 | You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, |
| 77 | and look for expression-internal collocates using the focus function (see examples and demo). |
| 78 | |
| 79 | To increase speed at the cost of accuracy and possible false negatives, |
| 80 | you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE. |
| 81 | |
| 82 | Note that the tokenization currently used is not the one provided by the backend, i.e. by the corpus itself, but a makeshift one. |
| 83 | This can also lead to false negatives and to frequencies that differ from the corresponding ones obtained via the web |
| 84 | user interface. |
| 85 | } |
| 86 | \examples{ |
Marc Kupietz | 6ae7605 | 2021-09-21 10:34:00 +0200 | [diff] [blame] | 87 | \dontrun{ |
| 88 | |
Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 89 | # Find top collocates of "Packung" inside and outside the sports domain. |
| 90 | new("KorAPConnection", verbose = TRUE) \%>\% |
| 91 | collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"), |
| 92 | leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) \%>\% |
| 93 | dplyr::filter(logDice >= 5) |
| 94 | } |
| 95 | |
Marc Kupietz | 6ae7605 | 2021-09-21 10:34:00 +0200 | [diff] [blame] | 96 | \dontrun{ |
| 97 | |
Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 98 | # Identify the most prominent light verb construction with "in ... setzen". |
| 99 | # Note that, currently, the use of the focus function disallows exactFrequencies. |
| 100 | new("KorAPConnection", verbose = TRUE) \%>\% |
| 101 | collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})", |
| 102 | leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20) |
| 103 | } |
| 104 | |
| 105 | } |
| 106 | \seealso{ |
| 107 | Other collocation analysis functions: |
| 108 | \code{\link{association-score-functions}}, |
| 109 | \code{\link{collocationScoreQuery,KorAPConnection-method}}, |
| 110 | \code{\link{synsemanticStopwords}()} |
| 111 | } |
| 112 | \concept{collocation analysis functions} |