Add `md_nws` MI² score based on nominal window size (=10)

Change-Id: I5e431a8ba0f863191f378d447621f3a10039d122
diff --git a/README.md b/README.md
index 422aeb1..4f0f305 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,7 @@
   double llr;
   double lfmd;
   double md;
+  double md_nws;
   uint64_t left_raw;
   uint64_t right_raw;
   double left_pmi;
@@ -100,6 +101,8 @@
 
 ## Changes
 
+  * added `md_nws`, an MI² score based on the nominal window size (=10) instead of the actual window size (the actual window size counts only positions where the collocate actually occurs)
+
 * v1.3.2 (2024-11-15)
   * added `get_version()`, which returns version string
 
diff --git a/src/collocatordb.cc b/src/collocatordb.cc
index 41d0ad0..87d9dbd 100644
--- a/src/collocatordb.cc
+++ b/src/collocatordb.cc
@@ -7,6 +7,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include "rocksdb/slice.h"
 #include <algorithm>
 #include <cassert>
 #include <cmath>
@@ -52,6 +53,7 @@
   double llr;
   double lfmd;
   double md;
+  double md_nws;
   uint64_t left_raw;
   uint64_t right_raw;
   double left_pmi;
@@ -603,8 +605,9 @@
   uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
   double o = sum, r1 = f1 * true_window_size, c1 = f2, e = r1 * c1 / total,
          pmi = log2(o / e), md = log2(o * o / e), lfmd = log2(o * o * o / e),
-         llr = ca_ll(f1, f2, sum, total, true_window_size);
-  double ld = ca_logdice(f1, f2, sum, total, true_window_size);
+         llr = ca_ll(f1, f2, sum, total, true_window_size),
+         md_nws = ca_md(f1, f2, sum, total, 2 * WINDOW_SIZE),
+         ld = ca_logdice(f1, f2, sum, total, true_window_size);
 
   int bestWindow = usedPositions;
   double bestAF = ld;
@@ -638,6 +641,7 @@
              llr,
              lfmd,
              md,
+             md_nws,
              sumWindow[WINDOW_SIZE],
              sumWindow[WINDOW_SIZE - 1],
              ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
@@ -828,7 +832,7 @@
       << "\"," << "\"f2\":" << c.f2 << "," << "\"f\":" << c.raw << ","
       << "\"npmi\":" << c.npmi << "," << "\"pmi\":" << c.pmi << ","
       << "\"llr\":" << c.llr << "," << "\"lfmd\":" << c.lfmd << ","
-      << "\"md\":" << c.md << "," << "\"dice\":" << c.dice << ","
+      << "\"md\":" << c.md << "," << "\"md_nws\":" << c.md_nws << "," << "\"dice\":" << c.dice << ","
       << "\"ld\":" << c.logdice << "," << "\"ln_count\":" << c.left_raw << ","
       << "\"rn_count\":" << c.right_raw << "," << "\"ln_pmi\":" << c.left_pmi
       << "," << "\"rn_pmi\":" << c.right_pmi << "," << "\"ldaf\":" << c.ldaf
diff --git a/src/collocatordb.h b/src/collocatordb.h
index d601902..7803cd1 100644
--- a/src/collocatordb.h
+++ b/src/collocatordb.h
@@ -106,6 +106,7 @@
   double llr;
   double lfmd;
   double md;
+  double md_nws;
   uint64_t left_raw;
   uint64_t right_raw;
   double left_pmi;
diff --git a/tests/basic_test.c b/tests/basic_test.c
index 6ca6d2e..929f216 100644
--- a/tests/basic_test.c
+++ b/tests/basic_test.c
@@ -27,7 +27,7 @@
 void test_collocation_scores() {
   COLLOCATORDB* cdb = open_collocatordb(dbpath);
   TEST_ASSERT(cdb != NULL);
-  char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n";
+  char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"md_nws\":10.1373,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n";
   char *produced = get_collocation_scores_as_json(cdb, 62, 966);
   TEST_CHECK(strcmp(produced, expected) == 0);
   TEST_MSG("Expected: %s", expected);
@@ -39,7 +39,7 @@
   COLLOCATORDB* cdb = open_collocatordb(dbpath);
   TEST_ASSERT(cdb != NULL);
   char *json = get_collocators_as_json(cdb, testword);
-  char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668";
+  char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"md_nws\":0.363854,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668";
   TEST_CHECK(strstr(json, needle) > 0);
   TEST_MSG("Expected to contain: %s", needle);
 }