blob: a55c36517ff13ef4bfc8e2797aec80d7b31a51d8 [file] [log] [blame]
#' Get syntagmatic neighbours
#'
#' Get the syntagmatic neighbour predictions of a word from the DeReKoVecs
#' model (see Fankhauser/Kupietz 2022, 2017).
#'
#' @param word The word to get the syntagmatic neighbours for.
#' @param ... Additional parameters to pass to the API.
#'
#' @return Data frame with the syntagmatic neighbours of a node predicted from
#'   derekovecs model, with the following columns:
#'
#' \describe{
#'  \item{average}{⟨a⟩ - Average raw activation of the collocator in the columns selected by auto-focus.}
#'  \item{heat}{Vector of activation of the respective collocator in the slots around the target normalized by its maximum.}
#'  \item{max}{max(a) - Maximum activation of the collocator anywhere in the output layer.}
#'  \item{overall}{Σa/Σw – Sum of the activations over the whole window normalized by the total window sum (no auto-focus).}
#'  \item{pos}{Binary encoded position of where in the window around the node the collocate is predicted with above 0 probability, e.g. 64 = 2^6 ≙ 00010 node 00000}
#'  \item{rank}{Frequency rank of predicted collocate}
#'  \item{word}{Predicted collocate}
#' }
#' @export
syntagmaticNeighbours <- function(word = "Test", ...) {
  # The empty method string means no path segment is appended to the server
  # URL; the neighbours are taken from the `collocators` field of the reply.
  derekovecsApiCall("", word = word, json = 1, ...)$collocators
}
23
#' Get count-based collocates
#'
#' Get the collocates of a word in the count-based dereko model.
#'
#' @param w The word to get the collocates for.
#' @param ... Additional parameters to pass to the API.
#'
#' @return A data frame with the most salient collocates and their association
#'   scores.
#' @seealso [collocationScores()] for details
#' @export
countbasedCollocates <- function(w = "Test", ...) {
  # Only the `collocates` field of the `/getClassicCollocators` reply is
  # returned; the other fields of the reply are dropped here.
  derekovecsApiCall(method = "/getClassicCollocators", w = w, ...)$collocates
}
37
#' Get word frequency
#'
#' Gets the absolute frequency of a word in the corpus.
#'
#' @param w The word to get the frequency of.
#' @param ... Additional parameters to pass to the API.
#'
#' @return The absolute frequency of the word.
#' @export
wordFrequency <- function(w = "Test", ...) {
  # The frequency is read from the `f1` field of the
  # `/getClassicCollocators` reply for `w`.
  derekovecsApiCall(method = "/getClassicCollocators", w = w, ...)$f1
}
50
#' Get corpus size
#'
#' Gets the token size of the corpus used to train the model.
#'
#' @param w Probe word (defaults to `Test`) required for old derekovecs
#'   servers.
#' @param ... Additional parameters to pass to the API.
#'
#' @return The number of tokens in the corpus.
#' @export
corpusSize <- function(w = "Test", ...) {
  # The corpus size is read from the `N` field of the
  # `/getClassicCollocators` reply for the probe word.
  derekovecsApiCall(method = "/getClassicCollocators", w = w, ...)$N
}
Marc Kupietzd417ba62024-12-10 17:54:07 +010063
#' Get paradigmatic neighbours
#'
#' Get the paradigmatic neighbours of a word in the derekovecs model.
#'
#' @param word The word to get the paradigmatic neighbours for.
#' @param ... Additional parameters to pass to the API.
#'
#' @return A list of words with their similarity scores.
#' @export
paradigmaticNeighbours <- function(word = "Test", ...) {
  # The neighbours are the first element of the `list` field of the reply.
  derekovecsApiCall("", word = word, json = 1, ...)$list[[1]]
}
76
#' Get word embedding
#'
#' Get the normalized embedding vector of a word from the derekovecs model.
#'
#' @param word The word to get the embedding vector for.
#' @param ... Additional parameters to pass to the API.
#'
#' @return Normalized embedding vector of the given word.
#' @export
wordEmbedding <- function(word = "Test", ...) {
  # The request is made with `n = 1`; the embedding is the first entry of the
  # `vector` field of the first element of the reply's `list` field.
  response <- derekovecsApiCall("", word = word, n = 1, json = 1, ...)
  response[["list"]][[1]][["vector"]][[1]]
}
89
#' Get frequency rank
#'
#' Gets the frequency rank of a word in the training data.
#'
#' @param word The word to get the frequency rank of.
#' @param ... Additional parameters to pass to the API.
#'
#' @return Frequency rank.
#' @export
frequencyRank <- function(word = "Test", ...) {
  # Note that `word` is sent to `/getWord` under the query parameter name `w`.
  derekovecsApiCall("/getWord", w = word, ...)$frequencyRank
}
101
#' Get derekovecs server version
#'
#' Queries the `/getVersion` method of the derekovecs server.
#'
#' @return The version of the derekovecs server.
#' @export
serverVersion <- function() {
  derekovecsApiCall("/getVersion")
}
108
#' Get vocabulary size
#'
#' Queries the `/getVocabSize` method of the derekovecs server.
#'
#' @return The vocabulary size of the model.
#' @seealso [frequencyRank()]
#' @export
vocabSize <- function() {
  derekovecsApiCall("/getVocabSize")
}
116
#' Get model name
#'
#' Queries the `/getModelName` method of the derekovecs server.
#'
#' @return The name of the model.
#' @export
modelName <- function() {
  derekovecsApiCall("/getModelName")
}
123
#' Get collocation scores
#'
#' Calculate the association scores between a node (target word) and words in
#' a window around it.
#'
#' @param w The target word/node.
#' @param c The collocate.
#' @param ... Additional parameters to pass to the API.
#'
#' @return A one row data frame with collocate and its association scores.
#' \describe{
#'  \item{word}{collocate}
#'  \item{f2}{abs. frequency of collocate}
#'  \item{f}{abs. frequency of collocation}
#'  \item{npmi}{normalized pmi (Bouma 2009)}
#'  \item{pmi}{pointwise mutual information}
#'  \item{dice}{dice score}
#'  \item{ld}{log-dice score (Rychlý 2008) for whole window}
#'  \item{lfmd}{log-frequency biased mutual dependency ≙ pmi³ (Daille 1994; Thanopoulos et al. 2002)}
#'  \item{llr}{log-likelihood (Dunning 1993; Evert 2004)}
#'  \item{ln_count}{frequency of collocate as left neighbour of node}
#'  \item{ln_pmi}{pmi as left neighbour}
#'  \item{md}{mutual dependency ≙ pmi² (Daille 1994; Thanopoulos et al. 2002)}
#'  \item{rn_count}{frequency of collocate as right neighbour of node}
#'  \item{rn_pmi}{pmi as right neighbour}
#'  \item{ldaf}{log-dice score for auto focus window}
#'  \item{win}{binary encoded positions at which the collocate appears at least once, e.g.: 1023 = 2^10-1 ≙ 11111 node 11111}
#'  \item{afwin}{binary encoded auto-focus window (see Perkuhn et al. 2012: E8-15), e.g. 64 = 2^6 ≙ 00010 node 00000 (Aus gutem Grund)}
#' }
#' @references
#' Daille, B. (1994): Approche mixte pour l’extraction automatique de terminologie: statistiques lexicales et filtres linguistiques. PhD thesis, Université Paris 7.
#'
#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
#'
#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
#' Free PDF available from <https://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
#'
#' Thanopoulos, A., Fakotakis, N., Kokkinakis, G. (2002): Comparative evaluation of collocation extraction metrics. In: Proc. of LREC 2002: 620–625.
#'
#' @export
collocationScores <- function(w, c, ...) {
  # Both `w` and `c` are required (no defaults); the scores are taken from the
  # `collocates` field of the `/getCollocationAssociation` reply.
  derekovecsApiCall(
    "/getCollocationAssociation",
    w = w, c = c, ...
  )$collocates
}
168
#' Get cosine similarity
#'
#' Calculate the cosine similarity between two words in the derekovecs model.
#'
#' @param w1 The first word.
#' @param w2 The second word.
#' @param ... Additional parameters to pass to the API.
#'
#' @return The cosine similarity between the two words.
#' @export
cosineSimilarity <- function(w1, w2, ...) {
  # The reply of `/getSimilarity` is returned as is, without field extraction.
  derekovecsApiCall("/getSimilarity", w1 = w1, w2 = w2, ...)
}
182
#' Get the DeReKoVecs server
#'
#' Returns the value of the environment variable `DEREKOVECS_SERVER` if it is
#' set to a non-empty string, and the public IDS Mannheim server URL otherwise.
#'
#' @return The URL of the DeReKoVecs API server.
#' @export
derekovecsServer <- function() {
  api_server <- Sys.getenv("DEREKOVECS_SERVER")
  # `Sys.getenv()` yields "" for an unset variable, so an unset and an empty
  # variable both fall through to the default server.
  if (nzchar(api_server)) {
    return(api_server)
  }
  "https://corpora.ids-mannheim.de/openlab/derekovecs/"
}
195
#' Call the DeReKoVecs API
#'
#' Builds a request against the server returned by [derekovecsServer()],
#' appends `method` to the URL path, passes `...` as query parameters,
#' performs the request and parses the JSON body of the response.
#'
#' @param method The method to call, given as a path that is appended to the
#'   server URL (e.g. `"/getVersion"`). The default `""` is passed to
#'   `httr2::req_url_path_append()` unchanged.
#' @param ... The parameters to pass to the method; they are handed to
#'   `httr2::req_url_query()` as name-value pairs.
#' @return The result of the call: the JSON response body, parsed with
#'   `simplifyVector = TRUE`.
#' @importFrom httr2 request req_url_path_append req_url_query req_perform resp_body_json
#'
#' @include utils-pipe.R
#' @export
derekovecsApiCall <- function(method = "", ...) {
  httr2::request(derekovecsServer()) %>%
    httr2::req_url_path_append(method) %>%
    httr2::req_url_query(...) %>%
    httr2::req_perform() %>%
    httr2::resp_body_json(simplifyVector = TRUE)
}