Add get_word_frequency function
Change-Id: Idef55f5a8f84c4bcba0a1bfdbe563eb74f26857a
diff --git a/README.md b/README.md
index 3db0db7..7205779 100644
--- a/README.md
+++ b/README.md
@@ -99,12 +99,14 @@
char *get_version();
uint64_t get_word_id(COLLOCATORDB *db, const char *word);
uint64_t get_corpus_size(COLLOCATORDB *db);
+uint64_t get_word_frequency(COLLOCATORDB *db, uint64_t w);
```
## Changes
* v1.4.0.9000 (unpublished)
* added `get_corpus_size()`, which returns the total token count
+ * added `get_word_frequency()`, which returns the absolute frequency of a word in the corpus
* v1.4.0 (2024-11-23)
* added `collocatordb_query` command line tool
diff --git a/src/collocatordb.cc b/src/collocatordb.cc
index 444a030..57a01d9 100644
--- a/src/collocatordb.cc
+++ b/src/collocatordb.cc
@@ -311,6 +311,8 @@
uint64_t getCorpusSize() const;
+ uint64_t getWordFrequency(uint64_t w1);  // freq of the vocabulary entry for word id w1. NOTE(review): not const, unlike getCorpusSize() - confirm intent
+
CollocatorDB(const char *db_name, bool read_only);
// public interface of CollocatorDB.
@@ -813,6 +815,10 @@
return total;
}
+uint64_t CollocatorDB::getWordFrequency(uint64_t w1) {  // Returns the freq field of the _vocab entry selected by word id w1.
+ return _vocab[w1].freq;  // NOTE(review): w1 is not range-checked here - confirm callers only pass ids that exist in _vocab
+}
+
string CollocatorDB::collocators2json(uint32_t w1,
const vector<Collocator>& collocators) {
ostringstream s;
@@ -923,6 +929,10 @@
DLL_EXPORT uint64_t get_corpus_size(COLLOCATORS *db) { return db->getCorpusSize(); };
+DLL_EXPORT uint64_t get_word_frequency(COLLOCATORS *db, uint64_t w1) {  // Exported wrapper: forwards to db->getWordFrequency(w1).
+ return db->getWordFrequency(w1);  // NOTE(review): db is dereferenced without a NULL check - callers must pass an open handle
+}
+
#ifdef __clang__
#pragma clang diagnostic push
#endif
diff --git a/src/collocatordb.h b/src/collocatordb.h
index 8bb3402..74f2f0e 100644
--- a/src/collocatordb.h
+++ b/src/collocatordb.h
@@ -131,5 +131,6 @@
extern char *get_version();
extern uint64_t get_word_id(COLLOCATORDB *db, const char *word);
extern uint64_t get_corpus_size(COLLOCATORDB *db);
+extern uint64_t get_word_frequency(COLLOCATORDB *db, uint64_t w1); /* frequency of the word with id w1 (id as returned by get_word_id) */
#endif
diff --git a/tests/basic_test.c b/tests/basic_test.c
index 276837a..d6d236e 100644
--- a/tests/basic_test.c
+++ b/tests/basic_test.c
@@ -141,6 +141,15 @@
TEST_MSG("Unexpected corpus size: %lu", size);
}
+void test_get_word_frequency() { /* Checks that the word "Test" has frequency 3 in the database at dbpath. */
+ COLLOCATORDB* cdb = open_collocatordb(dbpath);
+ TEST_ASSERT(cdb != NULL);
+ uint64_t w1 = get_word_id(cdb, "Test"); /* keep the full 64-bit id; storing it in an int would truncate it */
+ uint64_t freq = get_word_frequency(cdb, w1);
+ TEST_CHECK(freq == 3);
+ TEST_MSG("Unexpected word frequency: %llu", (unsigned long long) freq); /* %llu + cast is portable; %lu is wrong where long is 32-bit */
+}
+
TEST_LIST = {
{ "open database for reading", test_open_db },
{ "get word", test_get_word },
@@ -151,6 +160,7 @@
{ "version function", test_version_function },
{ "get word id", test_get_word_id },
{ "get corpus size", test_get_corpus_size},
+ { "get word frequency", test_get_word_frequency},
{ "collocatordb_query command line tool", test_collocatordb_query_command_line_tool},
{ NULL, NULL }
};