Add `md_nws` MI² score based on nominal window size (=10)
Change-Id: I5e431a8ba0f863191f378d447621f3a10039d122
diff --git a/README.md b/README.md
index 422aeb1..4f0f305 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,7 @@
double llr;
double lfmd;
double md;
+ double md_nws;
uint64_t left_raw;
uint64_t right_raw;
double left_pmi;
@@ -100,6 +101,8 @@
## Changes
+ * added `md_nws`, an MI² score based on the nominal window size (=10) instead of the actual window size (the actual window size counts only those positions where the collocate actually occurs)
+
* v1.3.2 (2024-11-15)
* added `get_version()`, which returns version string
diff --git a/src/collocatordb.cc b/src/collocatordb.cc
index 41d0ad0..87d9dbd 100644
--- a/src/collocatordb.cc
+++ b/src/collocatordb.cc
@@ -7,6 +7,7 @@
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
+#include "rocksdb/slice.h"
#include <algorithm>
#include <cassert>
#include <cmath>
@@ -52,6 +53,7 @@
double llr;
double lfmd;
double md;
+ double md_nws;
uint64_t left_raw;
uint64_t right_raw;
double left_pmi;
@@ -603,8 +605,9 @@
uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
double o = sum, r1 = f1 * true_window_size, c1 = f2, e = r1 * c1 / total,
pmi = log2(o / e), md = log2(o * o / e), lfmd = log2(o * o * o / e),
- llr = ca_ll(f1, f2, sum, total, true_window_size);
- double ld = ca_logdice(f1, f2, sum, total, true_window_size);
+ llr = ca_ll(f1, f2, sum, total, true_window_size),
+ md_nws = ca_md(f1, f2, sum, total, 2 * WINDOW_SIZE),
+ ld = ca_logdice(f1, f2, sum, total, true_window_size);
int bestWindow = usedPositions;
double bestAF = ld;
@@ -638,6 +641,7 @@
llr,
lfmd,
md,
+ md_nws,
sumWindow[WINDOW_SIZE],
sumWindow[WINDOW_SIZE - 1],
ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
@@ -828,7 +832,7 @@
<< "\"," << "\"f2\":" << c.f2 << "," << "\"f\":" << c.raw << ","
<< "\"npmi\":" << c.npmi << "," << "\"pmi\":" << c.pmi << ","
<< "\"llr\":" << c.llr << "," << "\"lfmd\":" << c.lfmd << ","
- << "\"md\":" << c.md << "," << "\"dice\":" << c.dice << ","
+ << "\"md\":" << c.md << "," << "\"md_nws\":" << c.md_nws << "," << "\"dice\":" << c.dice << ","
<< "\"ld\":" << c.logdice << "," << "\"ln_count\":" << c.left_raw << ","
<< "\"rn_count\":" << c.right_raw << "," << "\"ln_pmi\":" << c.left_pmi
<< "," << "\"rn_pmi\":" << c.right_pmi << "," << "\"ldaf\":" << c.ldaf
diff --git a/src/collocatordb.h b/src/collocatordb.h
index d601902..7803cd1 100644
--- a/src/collocatordb.h
+++ b/src/collocatordb.h
@@ -106,6 +106,7 @@
double llr;
double lfmd;
double md;
+ double md_nws;
uint64_t left_raw;
uint64_t right_raw;
double left_pmi;
diff --git a/tests/basic_test.c b/tests/basic_test.c
index 6ca6d2e..929f216 100644
--- a/tests/basic_test.c
+++ b/tests/basic_test.c
@@ -27,7 +27,7 @@
void test_collocation_scores() {
COLLOCATORDB* cdb = open_collocatordb(dbpath);
TEST_ASSERT(cdb != NULL);
- char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n";
+ char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"md_nws\":10.1373,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n";
char *produced = get_collocation_scores_as_json(cdb, 62, 966);
TEST_CHECK(strcmp(produced, expected) == 0);
TEST_MSG("Expected: %s", expected);
@@ -39,7 +39,7 @@
COLLOCATORDB* cdb = open_collocatordb(dbpath);
TEST_ASSERT(cdb != NULL);
char *json = get_collocators_as_json(cdb, testword);
- char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668";
+ char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"md_nws\":0.363854,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668";
TEST_CHECK(strstr(json, needle) > 0);
TEST_MSG("Expected to contain: %s", needle);
}