blob: bc23235bd6accd81f180841e401e49c5ed4978e5 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001% Generated by roxygen2: do not edit by hand
2% Please edit documentation in R/collocationAnalysis.R
3\name{collocationAnalysis,KorAPConnection-method}
4\alias{collocationAnalysis,KorAPConnection-method}
5\alias{collocationAnalysis}
6\title{Collocation analysis}
7\usage{
8\S4method{collocationAnalysis}{KorAPConnection}(
9 kco,
10 node,
11 vc = "",
12 lemmatizeNodeQuery = FALSE,
13 minOccur = 5,
14 leftContextSize = 5,
15 rightContextSize = 5,
16 topCollocatesLimit = 200,
17 searchHitsSampleLimit = 20000,
18 ignoreCollocateCase = FALSE,
19 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
20 exactFrequencies = TRUE,
Marc Kupietz6505ccf2021-11-27 17:46:25 +010021 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
Marc Kupietzdbd431a2021-08-29 12:17:45 +020022 seed = 7,
23 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010024 maxRecurse = 0,
Marc Kupietzdadfd912021-12-22 12:48:20 +010025 addExamples = FALSE,
Marc Kupietz419f21f2021-12-07 10:27:30 +010026 thresholdScore = "logDice",
27 threshold = 2,
Marc Kupietz5a336b62021-11-27 17:51:35 +010028 localStopwords = c(),
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010029 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzde679ea2025-10-19 13:14:51 +020030 queryMissingScores = FALSE,
Marc Kupietz130a2a22025-10-18 16:09:23 +020031 missingScoreQuantile = 0.05,
32 vcLabel = NA_character_,
Marc Kupietzdbd431a2021-08-29 12:17:45 +020033 ...
34)
35}
36\arguments{
Marc Kupietz617266d2025-02-27 10:43:07 +010037\item{kco}{\code{\link[=KorAPConnection]{KorAPConnection()}} object (obtained e.g. from \code{KorAPConnection()})}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020038
39\item{node}{target word}
40
41\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
42
Marc Kupietz67edcb52021-09-20 21:54:24 +020043\item{lemmatizeNodeQuery}{if TRUE, node query will be lemmatized, i.e. \verb{x -> [tt/l=x]}}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020044
45\item{minOccur}{minimum absolute number of observed co-occurrences to consider a collocate candidate}
46
47\item{leftContextSize}{size of the left context window}
48
49\item{rightContextSize}{size of the right context window}
50
51\item{topCollocatesLimit}{limit analysis to the n most frequent collocates in the search hits sample}
52
53\item{searchHitsSampleLimit}{limit the size of the search hits sample}
54
55\item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored}
56
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010057\item{withinSpan}{KorAP span specification (see \url{https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans}) for collocations to be searched within. Defaults to \code{base/s=s} if \code{exactFrequencies} is TRUE, otherwise to an empty string.}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020058
59\item{exactFrequencies}{if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies}
60
61\item{stopwords}{vector of stopwords not to be considered as collocates}
62
63\item{seed}{seed for random page collecting order}
64
65\item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations}
66
Marc Kupietz7d400e02021-12-19 16:39:36 +010067\item{maxRecurse}{apply collocation analysis recursively \code{maxRecurse} times}
68
69\item{addExamples}{If TRUE, examples for instances of collocations will be added in a column \code{example}. This makes a difference in particular if \code{node} is given as a lemma query.}
70
71\item{thresholdScore}{association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls}
72
73\item{threshold}{minimum value of \code{thresholdScore} function call to apply collocation analysis recursively}
74
75\item{localStopwords}{vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls}
76
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010077\item{collocateFilterRegex}{allow only collocates matching the regular expression}
78
Marc Kupietzde679ea2025-10-19 13:14:51 +020079\item{queryMissingScores}{if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold}
80
Marc Kupietz4cbb5472025-10-19 12:15:25 +020081\item{missingScoreQuantile}{lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores}
Marc Kupietz130a2a22025-10-18 16:09:23 +020082
83\item{vcLabel}{optional label override for the current virtual corpus (used internally when named VC collections are expanded)}
84
Marc Kupietz67edcb52021-09-20 21:54:24 +020085\item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086}
87\value{
Marc Kupietz130a2a22025-10-18 16:09:23 +020088A tibble where each row represents a candidate collocate for the requested node.
89Columns include (depending on the selected association measures):
90
91\itemize{
92\item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
93\item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
94\item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
95\item Per-label association scores produced by multi-VC comparisons, named using the pattern \code{<measure>_<label>}.
96\item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
97\item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
98\item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
99\item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
100}
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200101}
102\description{
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200103Performs a collocation analysis for the given node (or query)
104in the given virtual corpus.
105}
106\details{
107The collocation analysis is currently implemented on the client side, as some of the
108functionality is not yet provided by the KorAP backend. Mainly for this reason
109it is very slow (several minutes, up to hours), but on the other hand very flexible.
110You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
111and look for expression-internal collocates using the focus function (see examples and demo).
112
113To increase speed at the cost of accuracy and possible false negatives,
114you can decrease \code{searchHitsSampleLimit} and/or \code{topCollocatesLimit} and/or set \code{exactFrequencies} to FALSE.
115
Marc Kupietze7f0d682025-02-19 10:50:59 +0100116Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
117In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
118This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200119user interface.
120}
121\examples{
Marc Kupietz6ae76052021-09-21 10:34:00 +0200122\dontrun{
123
Marc Kupietzb1dec012025-06-04 17:16:57 +0200124# Find top collocates of "Packung" inside and outside the sports domain.
125KorAPConnection(verbose = TRUE) |>
126 collocationAnalysis("Packung",
127 vc = c("textClass=sport", "textClass!=sport"),
128 leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
129 ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200130 dplyr::filter(logDice >= 5)
131}
132
Marc Kupietz6ae76052021-09-21 10:34:00 +0200133\dontrun{
134
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200135# Identify the most prominent light verb construction with "in ... setzen".
136# Note that, currently, the use of the focus function disallows exactFrequencies = TRUE.
Marc Kupietz463bb162025-03-26 10:23:33 +0100137KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200138 collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietzb1dec012025-06-04 17:16:57 +0200139 leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
140 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200141}
142
143}
144\seealso{
145Other collocation analysis functions:
146\code{\link{association-score-functions}},
147\code{\link{collocationScoreQuery,KorAPConnection-method}},
148\code{\link{synsemanticStopwords}()}
149}
150\concept{collocation analysis functions}