Association score functions as parameters to collocationScoreQuery
also factor out hc_add_onclick_korap_search
Change-Id: I48f93761b9bda4e21669a99517c17c55cf3436ee
diff --git a/man/KorAPQuery-class.Rd b/man/KorAPQuery-class.Rd
index f9acc9f..46eb322 100644
--- a/man/KorAPQuery-class.Rd
+++ b/man/KorAPQuery-class.Rd
@@ -15,6 +15,8 @@
\alias{frequencyQuery}
\alias{format.KorAPQuery}
\alias{show,KorAPQuery-method}
+\alias{collocationScoreQuery,KorAPConnection-method}
+\alias{collocationScoreQuery}
\title{Class KorAPQuery}
\usage{
\S4method{initialize}{KorAPQuery}(
@@ -56,6 +58,19 @@
\method{format}{KorAPQuery}(x, ...)
\S4method{show}{KorAPQuery}(object)
+
+\S4method{collocationScoreQuery}{KorAPConnection}(
+ kco,
+ node,
+ collocate,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ lemmatizeCollocateQuery = FALSE,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ scoreFunctions = defaultAssociationScoreFunctions(),
+ smoothingConstant = 0.5
+)
}
\arguments{
\item{.Object}{…}
@@ -64,7 +79,7 @@
\item{request}{query part of the request URL}
-\item{vc}{definition of a virtual corpus}
+\item{vc}{string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
\item{totalResults}{number of hits the query has yielded}
@@ -103,9 +118,27 @@
\item{x}{KorAPQuery object}
\item{object}{KorAPQuery object}
+
+\item{node}{target word}
+
+\item{collocate}{collocate of target word}
+
+\item{lemmatizeNodeQuery}{logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{lemmatizeCollocateQuery}{logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]}
+
+\item{leftContextSize}{size of the left context window}
+
+\item{rightContextSize}{size of the right context window}
+
+\item{scoreFunctions}{named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}}
+
+\item{smoothingConstant}{smoothing constant will be added to all observed values}
}
\value{
The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
+
+tibble with query KorAP web request URL, all observed values and association scores
}
\description{
\code{KorAPQuery} objects represent the current state of a query to a KorAP server.
@@ -117,6 +150,9 @@
\code{\link{ci}} to compute a table with the relative frequencies and
confidence intervals of one ore multiple search terms across one or multiple
virtual corpora.
+
+\bold{\code{collocationScoreQuery}} computes various collocation association scores
+based on \code{\link{frequencyQuery}}s for a target word and a collocate.
}
\examples{
\donttest{q <- new("KorAPConnection") \%>\% corpusQuery("Ameisenplage") \%>\% fetchNext()
@@ -138,6 +174,28 @@
frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
}
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Grund", "triftiger")
+}
+
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+collocationScoreQuery("Grund", c("guter", "triftiger"),
+ scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
+}
+
+\donttest{
+library(highcharter)
+library(tidyr)
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+ lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
+ pivot_longer(14:last_col(), names_to = "measure", values_to = "score") \%>\%
+ hchart(type="spline", hcaes(label, score, group=measure)) \%>\%
+ hc_add_onclick_korap_search()
+}
+
}
\references{
\url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
diff --git a/man/defaultAssociationScoreFunctions.Rd b/man/defaultAssociationScoreFunctions.Rd
new file mode 100644
index 0000000..09a87d8
--- /dev/null
+++ b/man/defaultAssociationScoreFunctions.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{defaultAssociationScoreFunctions}
+\alias{defaultAssociationScoreFunctions}
+\title{Default association score functions}
+\usage{
+defaultAssociationScoreFunctions()
+}
+\value{
+list of default association score functions
+}
+\description{
+Default association score functions
+}
+\examples{
+\donttest{
+new("KorAPConnection", verbose = TRUE) \%>\%
+collocationScoreQuery("Perlen", c("verziertes", "Säue"),
+ scoreFunctions = append(defaultAssociationScoreFunctions(),
+ list(localMI = function(O1, O2, O, N, E, window_size) {
+ O * log2(O/E)
+ })))
+}
+
+}
+\seealso{
+Other association-score-functions:
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/hc_add_onclick_korap_search.Rd b/man/hc_add_onclick_korap_search.Rd
new file mode 100644
index 0000000..4e3ac06
--- /dev/null
+++ b/man/hc_add_onclick_korap_search.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/highcharter-helper.R
+\name{hc_add_onclick_korap_search}
+\alias{hc_add_onclick_korap_search}
+\title{Add KorAP search click events to highchart}
+\usage{
+hc_add_onclick_korap_search(hc)
+}
+\arguments{
+\item{hc}{highchart}
+}
+\description{
+Adds on-click events to data points of highcharts that were constructed with
+\code{\link{frequencyQuery}} or \code{\link{collocationScoreQuery}}. Clicks on data points
+then launch KorAP web UI queries for the given query term and virtual corpus in
+a separate frame.
+}
+\examples{
+\donttest{
+library(highcharter)
+new("KorAPConnection", verbose = TRUE) \%>\%
+ collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
+ lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) \%>\%
+ tidyr::pivot_longer(c("O", "E")) \%>\%
+ hchart(type="spline", hcaes(label, value, group=name)) \%>\%
+ hc_add_onclick_korap_search()
+}
+
+}
diff --git a/man/ll.Rd b/man/ll.Rd
new file mode 100644
index 0000000..95f2107
--- /dev/null
+++ b/man/ll.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{ll}
+\alias{ll}
+\title{Log likelihood}
+\usage{
+ll(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Log likelihood
+}
+\references{
+Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+
+Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+Free PDF available from \url{http://purl.org/stefan.evert/PUB/Evert2004phd.pdf}
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/logDice.Rd b/man/logDice.Rd
new file mode 100644
index 0000000..9ecdc24
--- /dev/null
+++ b/man/logDice.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{logDice}
+\alias{logDice}
+\title{log-Dice coefficient}
+\usage{
+logDice(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+log-Dice coefficient
+}
+\examples{
+
+}
+\references{
+Rychlý, Pavel (2008): \href{http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf}{A lexicographer-friendly association score.} In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/mi2.Rd b/man/mi2.Rd
new file mode 100644
index 0000000..4cbbab4
--- /dev/null
+++ b/man/mi2.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{mi2}
+\alias{mi2}
+\title{Pointwise mutual information squared}
+\usage{
+mi2(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Pointwise mutual information squared
+}
+\details{
+Also referred to as mutual dependency (MD)
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi3}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/mi3.Rd b/man/mi3.Rd
new file mode 100644
index 0000000..7c8815e
--- /dev/null
+++ b/man/mi3.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{mi3}
+\alias{mi3}
+\title{Pointwise mutual information cubed}
+\usage{
+mi3(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\description{
+Pointwise mutual information cubed
+}
+\details{
+Also referred to as log-frequency biased mutual dependency (LFMD)
+}
+\references{
+Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+
+Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{pmi}()}
+}
+\concept{association-score-functions}
diff --git a/man/pmi.Rd b/man/pmi.Rd
new file mode 100644
index 0000000..015e00d
--- /dev/null
+++ b/man/pmi.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/association-scores.R
+\name{pmi}
+\alias{pmi}
+\title{Pointwise mutual information}
+\usage{
+pmi(O1, O2, O, N, E, window_size)
+}
+\arguments{
+\item{O1}{observed absolute frequency of node}
+
+\item{O2}{observed absolute frequency of collocate}
+
+\item{O}{observed absolute frequency of collocation}
+
+\item{N}{corpus size}
+
+\item{E}{expected absolute frequency of collocation (already adjusted to window size)}
+
+\item{window_size}{total window size around node (left neighbour count + right neighbour count)}
+}
+\value{
+association score
+}
+\description{
+Pointwise mutual information
+}
+\seealso{
+Other association-score-functions:
+\code{\link{defaultAssociationScoreFunctions}()},
+\code{\link{ll}()},
+\code{\link{logDice}()},
+\code{\link{mi2}()},
+\code{\link{mi3}()}
+}
+\concept{association-score-functions}