collocatordb: read actual corpus size from <dbname>.size file
Format:
<sentences>
<words>
diff --git a/collocatordb.cc b/collocatordb.cc
index 47fbac7..9afe027 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -232,7 +232,8 @@
char _one[sizeof(uint64_t)];
Slice _one_slice;
vector<VocabEntry> _vocab;
- uint64_t total;
+ uint64_t total = 0;
+ uint64_t sentences = 0;
protected:
std::shared_ptr<DB> db_;
@@ -386,6 +387,28 @@
i++;
}
fclose(fin);
+
+ char size_fname[256];
+ strcpy(size_fname, fname.c_str());
+ char *pos = strstr(size_fname, ".vocab");
+ if(pos) {
+ *pos=0;
+ strcat(size_fname, ".size");
+ FILE *fp = fopen(size_fname, "r");
+ if (fp != NULL) {
+ fscanf(fp, "%lu", &sentences);
+ fscanf(fp, "%lu", &total);
+ float sl = (float)total/(float)sentences;
+ float w = WINDOW_SIZE;
+ avg_window_size = ((sl > 2*w? (sl-2*w)*2*w: 0) + (double) w * (3*w -1)) / sl;
+ fprintf(stdout, "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n", total, sentences, sl, avg_window_size);
+ fclose(fp);
+ } else {
+ std::cout << "size file " << size_fname << " not found\n";
+ }
+ } else {
+ std::cout << "cannot determine size file " << size_fname << "\n";
+ }
}
std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {