Fix collocation scores for lemmatized node or collocate queries
Use the lemmatized versions of the node and collocate queries also for calculating N, O1 and O2.
Resolves #8
Change-Id: I6644d80d84bba6b5c467618d6ab9a06e22d9915b
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
index 478f03c..22a9f9f 100644
--- a/R/collocationScoreQuery.R
+++ b/R/collocationScoreQuery.R
@@ -110,10 +110,10 @@
w = leftContextSize + rightContextSize,
leftContextSize,
rightContextSize,
- N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
+ N = frequencyQuery(kco, lemmatizeWordQuery(node, lemmatizeNodeQuery), vc)$total + smoothingConstant,
O = as.double( if(is.na(observed[1])) frequencyQuery(kco, query, vc)$totalResults else observed) + smoothingConstant,
- O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
- O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
+ O1 = frequencyQuery(kco, lemmatizeWordQuery(node, lemmatizeNodeQuery), vc)$totalResults + smoothingConstant,
+ O2 = frequencyQuery(kco, lemmatizeWordQuery(collocate, lemmatizeCollocateQuery), vc)$totalResults + smoothingConstant,
E = w * as.double(O1) * O2 / N
) %>%
mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
@@ -174,6 +174,9 @@
paste0(w, '/i')
}
-lemmatizeWordQuery <- function(w) {
- paste0('[tt/l=', w, ']')
+lemmatizeWordQuery <- function(w, apply = TRUE) {
+ if (apply)
+ paste0('[tt/l=', w, ']')
+ else
+ w
}