Subtract redundant smoothing constants in mergeDuplicateCollocates

Change-Id: I1b042974735c4b2e9a664e21d1eb87cf0794d5c4
diff --git a/NAMESPACE b/NAMESPACE
index 6526101..edf177e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -101,7 +101,7 @@
 importFrom(ggplot2,theme)
 importFrom(httr2,oauth_client)
 importFrom(httr2,oauth_flow_auth_code)
-importFrom(httr2,url_build)
+importFrom(httr2,url_modify)
 importFrom(httr2,url_parse)
 importFrom(jsonlite,fromJSON)
 importFrom(lubridate,year)
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
index 9a93eab..d488995 100644
--- a/R/collocationScoreQuery.R
+++ b/R/collocationScoreQuery.R
@@ -182,15 +182,18 @@
     w
 }
 
-#' Merge duplicate collocate rows and re-calculate association scores and urls
+#' Merge duplicate collocate rows and re-calculate association scores and URLs.
+#' Useful if collocation analyses were performed separately for collocates on the
+#' left and right side of a node.
 #'
 #' @param ... tibbles with collocate rows returned from [collocationAnalysis()]
+#' @param smoothingConstant  original smoothing constant (to be added only once to the observed values)
 #' @return tibble with unique collocate rows
 #'
 #' @importFrom dplyr bind_rows group_by summarise ungroup mutate across first everything
 #' @importFrom httr2 url_modify
 #' @export
-mergeDuplicateCollocates <- function(...) {
+mergeDuplicateCollocates <- function(..., smoothingConstant = .5) {
   # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
   O1 <- O2 <- O <- N <- E <- w <- leftContextSize <- rightContextSize <- collocate <- tmp_positions <- 0
 
@@ -202,8 +205,8 @@
   combined_df %>%
     group_by(collocate, O2, N) %>%
     summarise(
-      O = sum(O),
-      O1 = sum(O1),
+      O = sum(O) - smoothingConstant * (n()-1),
+      O1 = sum(O1) - smoothingConstant * (n()-1),
       leftContextSize = sum(leftContextSize),
       rightContextSize = sum(rightContextSize),
       w = sum(w),
diff --git a/man/mergeDuplicateCollocates.Rd b/man/mergeDuplicateCollocates.Rd
index ba69bf4..9e5e660 100644
--- a/man/mergeDuplicateCollocates.Rd
+++ b/man/mergeDuplicateCollocates.Rd
@@ -2,16 +2,22 @@
 % Please edit documentation in R/collocationScoreQuery.R
 \name{mergeDuplicateCollocates}
 \alias{mergeDuplicateCollocates}
-\title{Merge duplicate collocate rows and re-calculate association scores and urls}
+\title{Merge duplicate collocate rows and re-calculate association scores and URLs.
+Useful if collocation analyses were performed separately for collocates on the
+left and right side of a node.}
 \usage{
-mergeDuplicateCollocates(...)
+mergeDuplicateCollocates(..., smoothingConstant = 0.5)
 }
 \arguments{
 \item{...}{tibbles with collocate rows returned from \code{\link[=collocationAnalysis]{collocationAnalysis()}}}
+
+\item{smoothingConstant}{original smoothing constant (to be added only once to the observed values)}
 }
 \value{
 tibble with unique collocate rows
 }
 \description{
-Merge duplicate collocate rows and re-calculate association scores and urls
+Merge duplicate collocate rows and re-calculate association scores and URLs.
+Useful if collocation analyses were performed separately for collocates on the
+left and right side of a node.
 }
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index f847d8b..aa4a771 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -45,3 +45,58 @@
     removeWithinSpan("contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))", "base/s=s"),
     "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)")
 })
+
+
+test_that("mergeDuplicateCollocatesWorksAsExpected", {
+  ldf <- tibble(
+    node = c("focus(in [tt/p=NN] {[tt/l=nehmen]})"),
+    collocate = c("Anspruch"),
+    label = c(""),
+    vc = c(""),
+    query = c("Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]})"),
+    webUIRequestUrl = c(
+      "https://korap.ids-mannheim.de/?q=Anspruch%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dnehmen%5d%7d%29&ql=poliqarp"
+    ),
+    w = c(1),
+    leftContextSize = c(1),
+    rightContextSize = c(0),
+    N = c(23578528381.5),
+    O = c(0.5),
+    O1 = c(1168410.5),
+    O2 = c(1296870.5),
+    E = c(64.2651265093014),
+    pmi = c(11.9173498777957),
+    mi2 = c(29.8406639214616),
+    mi3 = c(47.7639779651274),
+    logDice = c(11.6899933757298),
+    ll = c(3717716.74208791)
+  )
+  rdf <- tibble(
+    node = c("focus({[tt/l=nehmen] in} [tt/p=NN])"),
+    collocate = c("Anspruch"),
+    label = c(""),
+    vc = c(""),
+    query = c("focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch"),
+    webUIRequestUrl = c(
+      "https://korap.ids-mannheim.de/?q=focus%28%7b%5btt%2fl%3dnehmen%5d%20in%7d%20%5btt%2fp%3dNN%5d%29%20Anspruch&ql=poliqarp"
+    ),
+    w = c(1),
+    leftContextSize = c(0),
+    rightContextSize = c(1),
+    N = c(23578528381.5),
+    O = c(0.5),
+    O1 = c(17077.5),
+    O2 = c(1296870.5),
+    E = c(0.939299756346416),
+    pmi = c(7.99469408391783),
+    mi2 = c(15.8990457079122),
+    mi3 = c(23.8033973319065),
+    logDice = c(2.57887487309409),
+    ll = c(2181.35986032019)
+  )
+  merged <- mergeDuplicateCollocates(ldf, rdf, smoothingConstant = 0.5)
+  expect_equal(merged$O, 0.5)
+  expect_equal(merged$O1, 1185487.5)
+  expect_equal(merged$O2, 1296870.5)
+  expect_equal(merged$query, "Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]}) | focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch")
+})