blob: 754dce407db36fd079bc6094c5ad500ec9680768 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001% Generated by roxygen2: do not edit by hand
2% Please edit documentation in R/collocationAnalysis.R
3\name{collocationAnalysis,KorAPConnection-method}
4\alias{collocationAnalysis,KorAPConnection-method}
5\alias{collocationAnalysis}
6\title{Collocation analysis}
7\usage{
8\S4method{collocationAnalysis}{KorAPConnection}(
9 kco,
10 node,
11 vc = "",
12 lemmatizeNodeQuery = FALSE,
13 minOccur = 5,
14 leftContextSize = 5,
15 rightContextSize = 5,
16 topCollocatesLimit = 200,
17 searchHitsSampleLimit = 20000,
18 ignoreCollocateCase = FALSE,
19 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
20 exactFrequencies = TRUE,
21 stopwords = RKorAPClient::synsemanticStopwords(),
22 seed = 7,
23 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010024 maxRecurse = 0,
25 addExamples = TRUE,
Marc Kupietz419f21f2021-12-07 10:27:30 +010026 thresholdScore = "logDice",
27 threshold = 2,
Marc Kupietz5a336b62021-11-27 17:51:35 +010028 localStopwords = c(),
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010029 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030 ...
31)
32}
33\arguments{
Marc Kupietz67edcb52021-09-20 21:54:24 +020034\item{kco}{\code{\link[=KorAPConnection]{KorAPConnection()}} object (obtained e.g. from \code{new("KorAPConnection")})}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020035
36\item{node}{target word}
37
38\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
39
Marc Kupietz67edcb52021-09-20 21:54:24 +020040\item{lemmatizeNodeQuery}{if TRUE, node query will be lemmatized, i.e. \verb{x -> [tt/l=x]}}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020041
42\item{minOccur}{minimum absolute number of observed co-occurrences to consider a collocate candidate}
43
44\item{leftContextSize}{size of the left context window}
45
46\item{rightContextSize}{size of the right context window}
47
48\item{topCollocatesLimit}{limit analysis to the n most frequent collocates in the search hits sample}
49
50\item{searchHitsSampleLimit}{limit the size of the search hits sample}
51
52\item{ignoreCollocateCase}{logical, set to TRUE if collocate case should be ignored}
53
54\item{withinSpan}{KorAP span specification for collocations to be searched within}
55
56\item{exactFrequencies}{if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies}
57
58\item{stopwords}{vector of stopwords not to be considered as collocates}
59
60\item{seed}{seed for random page collecting order}
61
62\item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations}
63
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010064\item{collocateFilterRegex}{allow only collocates matching the regular expression}
65
Marc Kupietz67edcb52021-09-20 21:54:24 +020066\item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020067}
68\value{
69Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
70}
71\description{
Marc Kupietz67edcb52021-09-20 21:54:24 +020072\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073
74Performs a collocation analysis for the given node (or query)
75in the given virtual corpus.
76}
77\details{
78The collocation analysis is currently implemented on the client side, as some of the
79functionality is not yet provided by the KorAP backend. Mainly for this reason
80it is very slow (several minutes, up to hours), but on the other hand very flexible.
81You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
82and look for expression-internal collocates using the focus function (see examples and demo).
83
84To increase speed at the cost of accuracy and possible false negatives,
85you can decrease \code{searchHitsSampleLimit} and/or \code{topCollocatesLimit} and/or set \code{exactFrequencies} to \code{FALSE}.
86
87Note that the collocation analysis currently does not use the tokenization provided by the backend, i.e. by the corpus itself, but a tinkered one.
88This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
89user interface.
90}
91\examples{
Marc Kupietz6ae76052021-09-21 10:34:00 +020092\dontrun{
93
Marc Kupietzdbd431a2021-08-29 12:17:45 +020094 # Find top collocates of "Packung" inside and outside the sports domain.
95 new("KorAPConnection", verbose = TRUE) \%>\%
96 collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
97 leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) \%>\%
98 dplyr::filter(logDice >= 5)
99}
100
Marc Kupietz6ae76052021-09-21 10:34:00 +0200101\dontrun{
102
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200103# Identify the most prominent light verb construction with "in ... setzen".
104# Note that, currently, the use of the focus function disallows exactFrequencies.
105new("KorAPConnection", verbose = TRUE) \%>\%
106 collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
107 leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
108}
109
110}
111\seealso{
112Other collocation analysis functions:
113\code{\link{association-score-functions}},
114\code{\link{collocationScoreQuery,KorAPConnection-method}},
115\code{\link{synsemanticStopwords}()}
116}
117\concept{collocation analysis functions}