collocatordb: read vocab and calc total in lib
diff --git a/c_testanalysis.c b/c_testanalysis.c
index 5a660cb..bb40f4a 100644
--- a/c_testanalysis.c
+++ b/c_testanalysis.c
@@ -4,34 +4,10 @@
#include <math.h>
#include "collocatordb.h"
-uint64_t total=0;
-
-vocab_entry vocab[100000];
-
-void read_vocab(char *fname) {
- char strbuf[2048];
- long long freq;
- FILE *fin = fopen(fname, "rb");
- if (fin == NULL) {
- printf("Vocabulary file not found\n");
- exit(1);
- }
- uint64_t i = 0;
- while(!feof(fin)) {
- fscanf(fin, "%s %lld", strbuf, &freq);
- vocab[i].word = strdup(strbuf);
- vocab[i].freq = freq;
- total += freq;
- i++;
- }
- fclose(fin);
-}
-
int main() {
COLLOCATORS *cdb = open_collocators_for_read("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/wang2vec/sample");
- read_vocab("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/wang2vec/sample.vocab");
for(int i=100; i < 1000; i++)
- get_collocators(cdb, i, vocab, total);
- printf("%s\n", get_collocators_as_json(cdb, 500, vocab, total));
+ get_collocators(cdb, i); // vocabulary and total are no longer passed in; the library holds them
+ printf("%s\n", get_collocators_as_json(cdb, 500)); // NOTE(review): the library returns a strdup'ed string that is never freed here
return 0;
}
diff --git a/collocatordb.cc b/collocatordb.cc
index 56fc53a..2f06c63 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -2,6 +2,7 @@
#define EXPORT __attribute__((visibility("visible")))
#define IMPORT
#include <assert.h>
+#include <inttypes.h>
#include <memory>
#include <iostream>
#include <algorithm>
@@ -190,13 +191,20 @@
return DecodeFixed64(base_iterator_->value().data());
}
+ // One vocabulary entry: a word together with the frequency stored for it.
+ struct VocabEntry {
+ string word;
+ uint64_t freq;
+ };
+
class Collocators {
private:
WriteOptions merge_option_; // for merge
char _one[sizeof(uint64_t)];
Slice _one_slice;
- vocab_entry *_vocab = NULL;
-
+ vector<VocabEntry> _vocab;
+ uint64_t total;
+
protected:
std::shared_ptr<DB> db_;
@@ -208,7 +216,8 @@
std::shared_ptr<DB> OpenDb(const char *dbname);
std::shared_ptr<DB> OpenDbForRead(const char *dbname);
-
+ void read_vocab(string fname);
+
public:
Collocators(const char *db_name, bool read_only);
~Collocators();
@@ -293,7 +302,7 @@
virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(uint32_t w1, uint32_t w2, int8_t dist);
- vector<Collocator> get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total);
+ vector<Collocator> get_collocators(uint32_t w1);
string collocators2json(vector<Collocator> collocators);
// mapped to a rocksdb Merge operation
@@ -338,16 +347,36 @@
inc(encodeCollocation(w1, w2, dist));
}
+ // Load "<word> <freq>" pairs from fname into _vocab and sum the frequencies into total; exits if fname cannot be opened.
+ void rocksdb::Collocators::read_vocab(string fname) {
+ char strbuf[2048];
+ uint64_t freq;
+ FILE *fin = fopen(fname.c_str(), "rb");
+ if (fin == NULL) {
+ cout << "Vocabulary file " << fname <<" not found\n";
+ exit(1);
+ }
+ total = 0; // the member is declared without an initializer, so start the sum from a defined value
+ // Loop on fscanf's result, not feof(): a trailing newline would otherwise re-push the last entry. %2047s bounds the write into strbuf.
+ while(fscanf(fin, "%2047s %" PRIu64, strbuf, &freq) == 2) {
+ _vocab.push_back({strbuf, freq});
+ total += freq;
+ }
+ fclose(fin);
+ }
+
std::shared_ptr<DB> rocksdb::Collocators::OpenDbForRead(const char *name) {
DB* db;
Options options;
- ostringstream dbname;
+ ostringstream dbname, vocabname; // both paths are built from name: "<name>.rocksdb" and "<name>.vocab"
dbname << name << ".rocksdb";
auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
if (!s.ok()) {
std::cerr << s.ToString() << std::endl;
assert(false);
}
+ vocabname << name << ".vocab"; // the vocabulary file shares the database's base name
+ read_vocab(vocabname.str()); // fills _vocab and total; exits the process if the file cannot be opened
return std::shared_ptr<DB>(db);
}
@@ -457,8 +486,7 @@
bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
- std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total) {
- _vocab = vocab;
+ std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1) {
std::vector<Collocator> collocators;
uint64_t w2, last_w2 = 0xffffffffffffffff;
uint64_t sum = 0, total_w1 = 0;
@@ -470,12 +498,12 @@
if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
if (w2 != last_w2) {
double pmi = log2( total * ((double) sum) /
- (AVG_WINDOW_SIZE * ((double)vocab[w1].freq) * ((double)vocab[last_w2].freq) ));
+ (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq) * ((double)_vocab[last_w2].freq) ));
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
- // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) / (AVG_WINDOW_SIZE * ((double)vocab[w1].freq/total) * ((double)vocab[last_w2].freq/total)));
- double md = log2((double)sum * sum / ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * vocab[w1].freq * vocab[last_w2].freq));
+ // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) / (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
+ double md = log2((double)sum * sum / ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * _vocab[w1].freq * _vocab[last_w2].freq));
collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(((double) sum / AVG_WINDOW_SIZE / total))), /* normalize to [-1,1] */
- calculateLLR(vocab[w1].freq, total, sum, vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
+ calculateLLR(_vocab[w1].freq, total, sum, _vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
last_w2 = w2;
sum = value;
} else {
@@ -488,9 +516,9 @@
int i=0;
for (Collocator c : collocators) {
if(i++>10) break;
- std::cout << "w1:" << vocab[w1].word << ", w2:" << vocab[c.w2].word
- << "\t f(w1):" << vocab[w1].freq
- << "\t f(w2):" << vocab[c.w2].freq
+ std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
+ << "\t f(w1):" << _vocab[w1].freq
+ << "\t f(w2):" << _vocab[c.w2].freq
<< "\t f(w1, x):" << total_w1
<< "\t f(w1, w2):" << c.sum
<< "\t pmi:" << c.pmi
@@ -552,11 +580,11 @@
db->dump(w1, w2, dist);
}
- void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
- db->get_collocators(w1, vocab, total);
+ void get_collocators(COLLOCATORS *db, uint32_t w1) { // wrapper called from C code (see c_testanalysis.c)
+ db->get_collocators(w1); // the returned vector is not passed on to the caller
}
- const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
- return strdup(db->collocators2json(db->get_collocators(w1, vocab, total)).c_str());
+ const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) { // JSON text for the collocators of w1
+ return strdup(db->collocators2json(db->get_collocators(w1)).c_str()); // strdup copy outlives the temporary string; the caller should free() it
}
}
diff --git a/collocatordb.h b/collocatordb.h
index b370aae..406ea5d 100644
--- a/collocatordb.h
+++ b/collocatordb.h
@@ -10,12 +10,6 @@
#define W2(key) (uint64_t)((key >> 24) & 0xffffff)
#define DIST(key) (int8_t)((uint64_t)((key >> 56) & 0xff))
-
-typedef struct {
- uint64_t freq;
- char *word;
-} vocab_entry;
-
#ifdef __cplusplus
namespace rocksdb {
class CollocatorIterator : public Iterator {
@@ -53,6 +47,6 @@
extern COLLOCATORS *open_collocators_for_read(char *s);
extern void inc_collocators(COLLOCATORS *db, uint64_t w1, uint64_t w2, int8_t dist);
extern void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist);
-extern void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total);
-extern char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total);
+extern void get_collocators(COLLOCATORS *db, uint32_t w1);
+extern char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1);