| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 1 | % Generated by roxygen2: do not edit by hand | 
|  | 2 | % Please edit documentation in R/collocationAnalysis.R | 
|  | 3 | \name{collocationAnalysis,KorAPConnection-method} | 
|  | 4 | \alias{collocationAnalysis,KorAPConnection-method} | 
|  | 5 | \alias{collocationAnalysis} | 
|  | 6 | \title{Collocation analysis} | 
|  | 7 | \usage{ | 
|  | 8 | \S4method{collocationAnalysis}{KorAPConnection}( | 
|  | 9 | kco, | 
|  | 10 | node, | 
|  | 11 | vc = "", | 
|  | 12 | lemmatizeNodeQuery = FALSE, | 
|  | 13 | minOccur = 5, | 
|  | 14 | leftContextSize = 5, | 
|  | 15 | rightContextSize = 5, | 
|  | 16 | topCollocatesLimit = 200, | 
|  | 17 | searchHitsSampleLimit = 20000, | 
|  | 18 | ignoreCollocateCase = FALSE, | 
|  | 19 | withinSpan = ifelse(exactFrequencies, "base/s=s", ""), | 
|  | 20 | exactFrequencies = TRUE, | 
|  | 21 | stopwords = RKorAPClient::synsemanticStopwords(), | 
|  | 22 | seed = 7, | 
|  | 23 | expand = length(vc) != length(node), | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 24 | maxRecurse = 0, | 
|  | 25 | addExamples = TRUE, | 
| Marc Kupietz | 419f21f | 2021-12-07 10:27:30 +0100 | [diff] [blame] | 26 | thresholdScore = "logDice", | 
|  | 27 | threshold = 2, | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 28 | localStopwords = c(), | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 29 | ... | 
|  | 30 | ) | 
|  | 31 | } | 
|  | 32 | \arguments{ | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 33 | \item{kco}{\code{\link[=KorAPConnection]{KorAPConnection()}} object (obtained e.g. from \code{new("KorAPConnection")})} | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 34 |  | 
|  | 35 | \item{node}{target word} | 
|  | 36 |  | 
|  | 37 | \item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.} | 
|  | 38 |  | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 39 | \item{lemmatizeNodeQuery}{if TRUE, node query will be lemmatized, i.e. \verb{x -> [tt/l=x]}} | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 40 |  | 
|  | 41 | \item{minOccur}{minimum absolute number of observed co-occurrences to consider a collocate candidate} | 
|  | 42 |  | 
|  | 43 | \item{leftContextSize}{size of the left context window} | 
|  | 44 |  | 
|  | 45 | \item{rightContextSize}{size of the right context window} | 
|  | 46 |  | 
|  | 47 | \item{topCollocatesLimit}{limit analysis to the n most frequent collocates in the search hits sample} | 
|  | 48 |  | 
|  | 49 | \item{searchHitsSampleLimit}{limit the size of the search hits sample} | 
|  | 50 |  | 
|  | 51 | \item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored} | 
|  | 52 |  | 
|  | 53 | \item{withinSpan}{KorAP span specification for collocations to be searched within} | 
|  | 54 |  | 
|  | 55 | \item{exactFrequencies}{if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies} | 
|  | 56 |  | 
|  | 57 | \item{stopwords}{vector of stopwords not to be considered as collocates} | 
|  | 58 |  | 
|  | 59 | \item{seed}{seed for the random order in which pages are collected} | 
|  | 60 |  | 
|  | 61 | \item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations} | 
|  | 62 |  | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 63 | \item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}} | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 64 | } | 
|  | 65 | \value{ | 
|  | 66 | Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc. | 
|  | 67 | } | 
|  | 68 | \description{ | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 69 | \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 70 |  | 
|  | 71 | Performs a collocation analysis for the given node (or query) | 
|  | 72 | in the given virtual corpus. | 
|  | 73 | } | 
|  | 74 | \details{ | 
|  | 75 | The collocation analysis is currently implemented on the client side, as some of the | 
|  | 76 | functionality is not yet provided by the KorAP backend. Mainly for this reason | 
|  | 77 | it is very slow (several minutes, up to hours), but on the other hand very flexible. | 
|  | 78 | You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, | 
|  | 79 | and look for expression-internal collocates using the focus function (see examples and demo). | 
|  | 80 |  | 
|  | 81 | To increase speed at the cost of accuracy and possible false negatives, | 
|  | 82 | you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE. | 
|  | 83 |  | 
|  | 84 | Note that the analysis currently does not use the tokenization provided by the backend (i.e. that of the corpus itself), but a tinkered, client-side one. | 
|  | 85 | This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web | 
|  | 86 | user interface. | 
|  | 87 | } | 
|  | 88 | \examples{ | 
| Marc Kupietz | 6ae7605 | 2021-09-21 10:34:00 +0200 | [diff] [blame] | 89 | \dontrun{ | 
|  | 90 |  | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 91 | # Find top collocates of "Packung" inside and outside the sports domain. | 
|  | 92 | new("KorAPConnection", verbose = TRUE) \%>\% | 
|  | 93 | collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"), | 
|  | 94 | leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) \%>\% | 
|  | 95 | dplyr::filter(logDice >= 5) | 
|  | 96 | } | 
|  | 97 |  | 
| Marc Kupietz | 6ae7605 | 2021-09-21 10:34:00 +0200 | [diff] [blame] | 98 | \dontrun{ | 
|  | 99 |  | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 100 | # Identify the most prominent light verb construction with "in ... setzen". | 
|  | 101 | # Note that, currently, the use of the focus function disallows exactFrequencies. | 
|  | 102 | new("KorAPConnection", verbose = TRUE) \%>\% | 
|  | 103 | collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})", | 
|  | 104 | leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20) | 
|  | 105 | } | 
|  | 106 |  | 
|  | 107 | } | 
|  | 108 | \seealso{ | 
|  | 109 | Other collocation analysis functions: | 
|  | 110 | \code{\link{association-score-functions}}, | 
|  | 111 | \code{\link{collocationScoreQuery,KorAPConnection-method}}, | 
|  | 112 | \code{\link{synsemanticStopwords}()} | 
|  | 113 | } | 
|  | 114 | \concept{collocation analysis functions} |