Add a method to compute the collocation scores for a specific node-collocate pair
Change-Id: Idbc59ad850dff5e33ba3c56215855667d9b7cf6e
diff --git a/collocatordb.cc b/collocatordb.cc
index 6970501..9a23173 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -390,6 +390,8 @@
void dump(uint32_t w1, uint32_t w2, int8_t dist);
vector<Collocator> get_collocators(uint32_t w1);
vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
+ vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
+ vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2);
void applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow, const uint64_t sum, const int usedPositions, int true_window_size, rocksdb::Collocator *result);
void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
@@ -416,7 +418,7 @@
rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
// merge_option_.sync = true;
if(read_only)
- db_ = OpenDbForRead(db_name);
+ db_ = OpenDbForRead(strdup(db_name));
else
db_ = OpenDb(db_name);
assert(db_);
@@ -538,7 +540,10 @@
EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
Iterator *it = db_->NewIterator(options);
CollocatorIterator *cit = new CollocatorIterator(it);
- cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
+ if (w2 > 0)
+ cit->Seek(std::string(prefixc, 6));
+ else
+ cit->Seek(std::string(prefixc, 3));
cit->setPrefix(prefixc);
return cit;
}
@@ -610,7 +615,7 @@
}
- std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
+ std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2) {
std::vector<Collocator> collocators;
uint64_t w2, last_w2 = 0xffffffffffffffff;
uint64_t maxv = 0, sum = 0;
@@ -628,7 +633,7 @@
#endif
// #pragma omp parallel num_threads(40)
// #pragma omp single
- for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
+ for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0)); it->isValid(); it->Next()) {
uint64_t value = it->intValue(),
key = it->intKey();
if((w2 = W2(key)) > max_w2)
@@ -653,6 +658,8 @@
maxv = value;
sum = value;
true_window_size = 1;
+ if (min_w2 == max_w2 && w2 != min_w2)
+ break;
} else {
sum += value;
if(value > maxv)
@@ -687,8 +694,13 @@
return collocators;
}
+
+ std::vector<Collocator> rocksdb::CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
+ return get_collocators(w1, w2, w2);
+ }
+
std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
- return get_collocators(w1, UINT32_MAX);
+ return get_collocators(w1, 0, UINT32_MAX);
}
void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
@@ -799,11 +811,19 @@
db->get_collocators(w1);
}
+ void get_collocation_scores(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
+ db->get_collocation_scores(w1, w2);
+ }
+
const char *get_word(COLLOCATORS *db, uint32_t w) {
- return db->getWord(w).c_str();
+ return strdup(db->getWord(w).c_str());
}
const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
}
+
+ const char *get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
+ return strdup(db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
+ }
}
diff --git a/collocatordb.h b/collocatordb.h
index ac76866..f249791 100644
--- a/collocatordb.h
+++ b/collocatordb.h
@@ -67,10 +67,12 @@
typedef struct COLLOCATORDB COLLOCATORDB;
#endif
-extern COLLOCATORDB *open_collocatordb(char *s);
-extern COLLOCATORDB *open_collocatordb_for_write(char *s);
+extern COLLOCATORDB *open_collocatordb(const char *s);
+extern COLLOCATORDB *open_collocatordb_for_write(const char *s);
extern void inc_collocator(COLLOCATORDB *db, uint64_t w1, uint64_t w2, int8_t dist);
extern void dump_collocators(COLLOCATORDB *db, uint32_t w1, uint32_t w2, int8_t dist);
extern void get_collocators(COLLOCATORDB *db, uint32_t w1);
+extern void get_collocation_scores(COLLOCATORDB *db, uint32_t w1, uint32_t w2);
extern char *get_collocators_as_json(COLLOCATORDB *db, uint32_t w1);
+extern char *get_collocation_scores_as_json(COLLOCATORDB *db, uint32_t w1, uint32_t w2);
extern char *get_word(COLLOCATORDB *db, uint32_t w1);
diff --git a/hello_world.c b/hello_world.c
index b96979f..74aed60 100644
--- a/hello_world.c
+++ b/hello_world.c
@@ -4,8 +4,8 @@
#include <math.h>
#include "collocatordb.h"
-char dbpath[] = "/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/word2vec/models/dereko-2017-ii";
-const int testword = 259; // Welt
+char dbpath[] = "../models/dereko-2021-i";
+const int testword = 431; // Grund
int main(int argc, char* argv[]) {
COLLOCATORDB *cdb;
@@ -17,6 +17,7 @@
}
fprintf(stderr, "Successfully opened %s.\n", dbpath);
+ printf("associations between two words:\n %s", get_collocation_scores_as_json(cdb, 431, 218717));
/*
printf("raw dump of all “%s”-neighbour positions and frequencies:\n", get_word(cdb, testword));
dump_collocators(cdb, testword, 0, 0);