Add documentation on the data frames returned Change-Id: Ifdc9b27829c6ab01aa4d6d7b88c339884a470877

commit: c6a66ee05a10d651ce4c00efdee2b4f218edba89 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Oct 23 13:18:48 2023 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Mon Oct 23 15:32:39 2023 +0200
tree: bd172d0942b1ca97cd50d6a2fc6209100423aee4
parent: 4c384ef2685380e7443eefbee578f56506d9950b [diff]
diff --git a/R/derekovecs.R b/R/derekovecs.R
index 2b0d492..53efb67 100644
--- a/R/derekovecs.R
+++ b/R/derekovecs.R

@@ -1,29 +1,41 @@
-#' syntagmaticNeighbours
+#' Get syntagmatic neighbours
 #'
-#' Get the syntagmatic neighbours of a word from the predictive derekovecs model.
+#' Get the syntagmatic neighbour predictions of a word from the DeReKoVecs model (see Fankhauser/Kupietz 2022, 2017).
+#'
 #' @param word The word to get the syntagmatic neighbours for.
 #' @param ... Additional parameters to pass to the API.
 #'
-#' @return Data frame syntagmatic neighbours of a word from the predictive derekovecs model.
+#' @return Data frame with the syntagmatic neighbours of a node predicted by the DeReKoVecs model, with the following columns:
+#'
+#' \describe{
+#' \item{average}{⟨a⟩ - Average raw activation of the collocator in the columns selected by auto-focus.}
+#' \item{heat}{Vector of activation of the respective collocator in the slots around the target normalized by its maximum.}
+#' \item{max}{max(a) - Maximum activation of the collocator anywhere in the output layer.}
+#' \item{overall}{Σa/Σw – Sum of the activations over the whole window normalized by the total window sum (no auto-focus).}
+#' \item{pos}{Binary encoded position of where in the window around the node the collocate is predicted with a probability above 0, e.g. 64 = 2^6 ≙ 00010 node 00000}
+#' \item{rank}{Frequency rank of predicted collocate}
+#' \item{word}{Predicted collocate}
+#' }
 #' @export
 syntagmaticNeighbours <- function(word = "Test", ...) {
   derekovecsApiCall("", word = word, json = 1, ...)$collocators
 }
 
-#' countbasedCollocates
+#' Get count-based collocates
 #'
 #' Get the collocates of a word in the count-based dereko model.
 #'
 #' @param w The word to get the collocates for.
 #' @param ... Additional parameters to pass to the API.
 #'
-#' @return  A data fram with the most salient collcates and their association scores.
+#' @return  A data frame with the most salient collocates and their association scores.
+#' @seealso [collocationScores()] for details
 #' @export
 countbasedCollocates <- function(w = "Test", ...) {
   derekovecsApiCall(method = "/getClassicCollocators", w = w, ...)$collocates
 }
 
-#' paradigmaticNeighbours
+#' Get paradigmatic neighbours
 #'
 #' Get the paradigmatic neighbours of a word in the derekovecs model.
 #'
@@ -36,7 +48,7 @@
   derekovecsApiCall("", word = word, json = 1, ...)$list[[1]]
 }
 
-#' collocationScores
+#' Get collocation scores
 #'
 #' Calculate the association scores between a node (target word) and words in a window around the it.
 #'
@@ -45,6 +57,35 @@
 #' @param ... Additional parameters to pass to the API.
 #'
 #' @return  A one row data frame with collocate and its association scores.
+#' \describe{
+#' \item{word}{collocate}
+#' \item{f2}{abs. frequency of collocate}
+#' \item{f}{abs. frequency of collocation}
+#' \item{npmi}{normalized pmi (Bouma 2009)}
+#' \item{pmi}{pointwise mutual information}
+#' \item{dice}{dice score}
+#' \item{ld}{log-dice score (Rychlý 2008) for whole window}
+#' \item{lfmd}{log-frequency biased mutual dependency ≙ pmi³ (Daille 1994; Thanopoulos et al. 2002)}
+#' \item{llr}{log-likelihood (Dunning 1993; Evert 2004)}
+#' \item{ln_count}{frequency of collocate as left neighbour of node}
+#' \item{ln_pmi}{pmi as left neighbour}
+#' \item{md}{mutual dependency ≙ pmi² (Daille 1994; Thanopoulos et al. 2002)}
+#' \item{rn_count}{frequency of collocate as right neighbour of node}
+#' \item{rn_pmi}{pmi as right neighbour}
+#' \item{ldaf}{log-dice score for auto focus window}
+#' \item{win}{binary encoded positions at which the collocate appears at least once, e.g. 1023 = 2^10-1 ≙ 11111 node 11111}
+#' \item{afwin}{binary encoded auto-focus window (see Perkuhn et al. 2012: E8-15), e.g. 64 = 2^6 ≙ 00010 node 00000 (Aus gutem Grund)}
+#' }
+#' @references
+#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+#'
+#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+#'
+#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+#' Free PDF available from <https://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
+#'
+#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+#'
 #' @export
 #'
 collocationScores <- function(w, c, ...) {
@@ -52,7 +93,7 @@
                  w = w, c = c, ...)$collocates
 }
 
-#' cosineSimilarity
+#' Get cosine similarity
 #'
 #' @param w1 The first word.
 #' @param w2  The second word.
@@ -66,7 +107,7 @@
   derekovecsApiCall("/getSimilarity", w1 = w1, w2 = w2, ...)
 }
 
-#' DeReKoVecsServer
+#' Get the DeReKoVecs server
 #'
 #' @return The URL of the DeReKoVecs API server.
 #' @export

diff --git a/man/collocationScores.Rd b/man/collocationScores.Rd
index b206e18..4221d25 100644
--- a/man/collocationScores.Rd
+++ b/man/collocationScores.Rd

@@ -2,7 +2,7 @@
 % Please edit documentation in R/derekovecs.R
 \name{collocationScores}
 \alias{collocationScores}
-\title{collocationScores}
+\title{Get collocation scores}
 \usage{
 collocationScores(w, c, ...)
 }
@@ -15,7 +15,36 @@
 }
 \value{
 A one row data frame with collocate and its association scores.
+\describe{
+\item{word}{collocate}
+\item{f2}{abs. frequency of collocate}
+\item{f}{abs. frequency of collocation}
+\item{npmi}{normalized pmi (Bouma 2009)}
+\item{pmi}{pointwise mutual information}
+\item{dice}{dice score}
+\item{ld}{log-dice score (Rychlý 2008) for whole window}
+\item{lfmd}{log-frequency biased mutual dependency ≙ pmi³ (Daille 1994; Thanopoulos et al. 2002)}
+\item{llr}{log-likelihood (Dunning 1993; Evert 2004)}
+\item{ln_count}{frequency of collocate as left neighbour of node}
+\item{ln_pmi}{pmi as left neighbour}
+\item{md}{mutual dependency ≙ pmi² (Daille 1994; Thanopoulos et al. 2002)}
+\item{rn_count}{frequency of collocate as right neighbour of node}
+\item{rn_pmi}{pmi as right neighbour}
+\item{ldaf}{log-dice score for auto focus window}
+\item{win}{binary encoded positions at which the collocate appears at least once, e.g. 1023 = 2^10-1 ≙ 11111 node 11111}
+\item{afwin}{binary encoded auto-focus window (see Perkuhn et al. 2012: E8-15), e.g. 64 = 2^6 ≙ 00010 node 00000 (Aus gutem Grund)}
+}
 }
 \description{
 Calculate the association scores between a node (target word) and words in a window around the it.
 }
+\references{
+Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
+
+Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
+
+Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
+Free PDF available from \url{https://purl.org/stefan.evert/PUB/Evert2004phd.pdf}
+
+Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
+}

diff --git a/man/cosineSimilarity.Rd b/man/cosineSimilarity.Rd
index 5b30c89..f942028 100644
--- a/man/cosineSimilarity.Rd
+++ b/man/cosineSimilarity.Rd

@@ -2,7 +2,7 @@
 % Please edit documentation in R/derekovecs.R
 \name{cosineSimilarity}
 \alias{cosineSimilarity}
-\title{cosineSimilarity}
+\title{Get cosine similarity}
 \usage{
 cosineSimilarity(w1, w2, ...)
 }

diff --git a/man/countbasedCollocates.Rd b/man/countbasedCollocates.Rd
index c3f153f..33fd248 100644
--- a/man/countbasedCollocates.Rd
+++ b/man/countbasedCollocates.Rd

@@ -2,7 +2,7 @@
 % Please edit documentation in R/derekovecs.R
 \name{countbasedCollocates}
 \alias{countbasedCollocates}
-\title{countbasedCollocates}
+\title{Get count-based collocates}
 \usage{
 countbasedCollocates(w = "Test", ...)
 }
@@ -12,8 +12,11 @@
 \item{...}{Additional parameters to pass to the API.}
 }
 \value{
-A data fram with the most salient collcates and their association scores.
+A data frame with the most salient collocates and their association scores.
 }
 \description{
 Get the collocates of a word in the count-based dereko model.
 }
+\seealso{
+\code{\link[=collocationScores]{collocationScores()}} for details
+}

diff --git a/man/derekovecsServer.Rd b/man/derekovecsServer.Rd
index 4ef9089..cac1cb1 100644
--- a/man/derekovecsServer.Rd
+++ b/man/derekovecsServer.Rd

@@ -2,7 +2,7 @@
 % Please edit documentation in R/derekovecs.R
 \name{derekovecsServer}
 \alias{derekovecsServer}
-\title{DeReKoVecsServer}
+\title{Get the DeReKoVecs server}
 \usage{
 derekovecsServer()
 }
@@ -10,5 +10,5 @@
 The URL of the DeReKoVecs API server.
 }
 \description{
-DeReKoVecsServer
+Get the DeReKoVecs server
 }

diff --git a/man/paradigmaticNeighbours.Rd b/man/paradigmaticNeighbours.Rd
index 225dae7..cdbf0a2 100644
--- a/man/paradigmaticNeighbours.Rd
+++ b/man/paradigmaticNeighbours.Rd

@@ -2,7 +2,7 @@
 % Please edit documentation in R/derekovecs.R
 \name{paradigmaticNeighbours}
 \alias{paradigmaticNeighbours}
-\title{paradigmaticNeighbours}
+\title{Get paradigmatic neighbours}
 \usage{
 paradigmaticNeighbours(word = "Test", ...)
 }

diff --git a/man/syntagmaticNeighbours.Rd b/man/syntagmaticNeighbours.Rd
index 6278606..e84038f 100644
--- a/man/syntagmaticNeighbours.Rd
+++ b/man/syntagmaticNeighbours.Rd

@@ -2,7 +2,7 @@
 % Please edit documentation in R/derekovecs.R
 \name{syntagmaticNeighbours}
 \alias{syntagmaticNeighbours}
-\title{syntagmaticNeighbours}
+\title{Get syntagmatic neighbours}
 \usage{
 syntagmaticNeighbours(word = "Test", ...)
 }
@@ -12,8 +12,18 @@
 \item{...}{Additional parameters to pass to the API.}
 }
 \value{
-Data frame syntagmatic neighbours of a word from the predictive derekovecs model.
+Data frame with the syntagmatic neighbours of a node predicted by the DeReKoVecs model, with the following columns:
+
+\describe{
+\item{average}{⟨a⟩ - Average raw activation of the collocator in the columns selected by auto-focus.}
+\item{heat}{Vector of activation of the respective collocator in the slots around the target normalized by its maximum.}
+\item{max}{max(a) - Maximum activation of the collocator anywhere in the output layer.}
+\item{overall}{Σa/Σw – Sum of the activations over the whole window normalized by the total window sum (no auto-focus).}
+\item{pos}{Binary encoded position of where in the window around the node the collocate is predicted with a probability above 0, e.g. 64 = 2^6 ≙ 00010 node 00000}
+\item{rank}{Frequency rank of predicted collocate}
+\item{word}{Predicted collocate}
+}
 }
 \description{
-Get the syntagmatic neighbours of a word from the predictive derekovecs model.
+Get the syntagmatic neighbour predictions of a word from the DeReKoVecs model (see Fankhauser/Kupietz 2022, 2017).
 }
commit	c6a66ee05a10d651ce4c00efdee2b4f218edba89	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Oct 23 13:18:48 2023 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Oct 23 15:32:39 2023 +0200
tree	bd172d0942b1ca97cd50d6a2fc6209100423aee4
parent	4c384ef2685380e7443eefbee578f56506d9950b [diff]