// blob: 352efd327f88ab9dceeaf6b096d4e6c70250c9f6 [file] [log] [blame]
// Symbol-visibility helpers. GCC and Clang only accept "default", "hidden",
// "protected" and "internal" as visibility types; "default" is the one that
// makes a symbol visible outside the shared object.
#define EXPORT __attribute__((visibility("default")))
// Intentionally empty: importing needs no decoration with this toolchain.
#define IMPORT
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream> // for ostringstream
#include <string>
#include <vector>
#include <rocksdb/cache.h>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include <rocksdb/merge_operator.h>
#include <rocksdb/slice_transform.h>
#include "merge_operators.h"
#include "export.h"
#include "config.h"
// Number of positions tracked on each side of a word; the per-position
// arrays in this file have 2 * WINDOW_SIZE slots.
#define WINDOW_SIZE 5
// Co-occurrence counts below this value are not scored.
#define FREQUENCY_THRESHOLD 5
// Run-time endianness probe: reads the bytes 0x00 0xff as a 16-bit integer.
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
// Packs a collocation into a 64-bit key: w1 in the low bits, w2 shifted left
// by 24 bits and dist in the top byte. Arguments are parenthesised so that
// arbitrary expressions can be passed safely. The values are NOT masked:
// a w1 wider than 24 bits overlaps the w2 field.
#define encodeCollocation(w1, w2, dist) ((((uint64_t)(dist)) << 56) | (((uint64_t)(w2)) << 24) | (w1))
// Field extractors for keys built by encodeCollocation.
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
// The distance is stored as a signed byte, so negative distances survive.
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
// Plain vocabulary record: a frequency count and a C-string word.
struct vocab_entry {
    uint64_t freq;
    char *word;
};
// typedef struct Collocator {
// uint64_t w2;
// uint64_t sum;
// };
using namespace rocksdb;
using namespace std;
namespace rocksdb {
// One scored collocate of a word. This is a plain aggregate: the member order
// matters because instances are filled by positional brace initialisation.
struct Collocator {
    uint32_t w2;        // id of the collocate
    uint64_t f2;        // frequency of the collocate
    uint64_t raw;       // co-occurrence count
    double pmi;         // pointwise mutual information
    double npmi;        // normalised pmi
    double llr;         // log-likelihood ratio
    double lfmd;
    double md;
    uint64_t left_raw;  // count at the immediate left neighbour position
    uint64_t right_raw; // count at the immediate right neighbour position
    double left_pmi;
    double right_pmi;
    double dice;
    double logdice;
    double ldaf;        // best logDice found over sub-windows
    int window;         // bitmask of positions that carried counts
    int af_window;      // bitmask of the sub-window that produced ldaf
};
// Diagnostic counter of full-merge invocations, with its reset helper.
size_t num_merge_operator_calls = 0;

void resetNumMergeOperatorCalls() {
    num_merge_operator_calls = 0;
}

// Diagnostic counter for partial merges, with its reset helper.
size_t num_partial_merge_calls = 0;

void resetNumPartialMergeCalls() {
    num_partial_merge_calls = 0;
}
// Writes `value` into buf[0..7] in little-endian byte order (least
// significant byte first). `buf` must have room for 8 bytes.
//
// The bytes are produced with shifts only, so the result is the same on every
// host and no run-time endianness probe or memcpy is needed.
inline void EncodeFixed64(char *buf, uint64_t value) {
    for (int i = 0; i < 8; ++i) {
        const unsigned char byte = static_cast<unsigned char>((value >> (8 * i)) & 0xff);
        buf[i] = static_cast<char>(byte);
    }
}
// Reads a 32-bit unsigned integer stored in little-endian byte order at
// ptr[0..3].
//
// Implemented with shifts only, so it is host-independent and needs neither a
// run-time endianness probe nor memcpy.
inline uint32_t DecodeFixed32(const char *ptr) {
    uint32_t result = 0;
    for (int i = 0; i < 4; ++i) {
        result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
    }
    return result;
}
// Reads a 64-bit unsigned integer stored in little-endian byte order at
// ptr[0..7].
//
// Implemented with shifts only, so it is host-independent and needs neither a
// run-time endianness probe nor memcpy.
inline uint64_t DecodeFixed64(const char *ptr) {
    uint64_t result = 0;
    for (int i = 0; i < 8; ++i) {
        result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
    }
    return result;
}
// Pointwise mutual information: log2(observed / expected), where the expected
// count is f1 * window_size * f2 / total. Returns -1.0 when the co-occurrence
// count f12 is below FREQUENCY_THRESHOLD.
static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
    if (f12 < FREQUENCY_THRESHOLD) {
        return -1.0;
    }
    const double row_total = f1 * window_size;
    const double col_total = f2;
    const double expected = row_total * col_total / total;
    const double observed = f12;
    return log2(observed / expected);
}
// Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
// Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
// Normalised pointwise mutual information: the pmi divided by
// -log2(f12 / total / window_size). Returns -1.0 when the co-occurrence count
// f12 is below FREQUENCY_THRESHOLD.
static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
    if (f12 < FREQUENCY_THRESHOLD) {
        return -1.0;
    }
    const double row_total = f1 * window_size;
    const double col_total = f2;
    const double expected = row_total * col_total / total;
    const double observed = f12;
    const double pmi = log2(observed / expected);
    const double normaliser = -log2(observed / total / window_size);
    return pmi / normaliser;
}
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
// In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
// double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
// Mutual dependency: log2(observed^2 / expected), where the expected count is
// f1 * window_size * f2 / total. No frequency threshold is applied.
static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
    const double row_total = f1 * window_size;
    const double col_total = f2;
    const double expected = row_total * col_total / total;
    const double observed = f12;
    const double squared = observed * observed;
    return log2(squared / expected);
}
// Log-frequency biased mutual dependency: log2(observed^3 / expected).
// Returns 0 when the pair never co-occurs.
static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
    if (f12 == 0) {
        return 0;
    }
    const double row_total = f1 * window_size;
    const double col_total = f2;
    const double expected = row_total * col_total / total;
    const double observed = f12;
    return log2(observed * observed * observed / expected);
}
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
// Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
// Log-likelihood ratio over the 2x2 contingency table built from the row
// total w1 * window_size, the column total w2, the joint count w12 and the
// sample size n. Cells with a non-positive observed count contribute 0.
static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
    // Marginals.
    const double r1 = (double) w1 * window_size;
    const double r2 = (double) n - r1;
    const double c1 = w2;
    const double c2 = n - c1;
    // Observed cell counts.
    const double o11 = w12;
    const double o12 = r1 - o11;
    const double o21 = c1 - w12;
    const double o22 = r2 - o21;
    // Expected cell counts under independence.
    const double e11 = r1 * c1 / n;
    const double e12 = r1 * c2 / n;
    const double e21 = r2 * c1 / n;
    const double e22 = r2 * c2 / n;

    const double t11 = o11 > 0 ? o11 * log(o11 / e11) : 0;
    const double t12 = o12 > 0 ? o12 * log(o12 / e12) : 0;
    const double t21 = o21 > 0 ? o21 * log(o21 / e21) : 0;
    const double t22 = o22 > 0 ? o22 * log(o22 / e22) : 0;
    return (2 * (t11 + t12 + t21 + t22));
}
// Dice coefficient: 2 * w12 / (w2 + w1 * window_size). `n` is not used.
static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
    const double row_total = (double) w1 * window_size;
    const double col_total = w2;
    const double denominator = col_total + row_total;
    return 2 * w12 / denominator;
}
// Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
// logDice score: 14 + log2(2 * w12 / (w2 + w1 * window_size)). `n` is not
// used.
static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
    const double row_total = (double) w1 * window_size;
    const double col_total = w2;
    const double dice = 2 * w12 / (col_total + row_total);
    return 14 + log2(dice);
}
class CountMergeOperator : public AssociativeMergeOperator {
public:
CountMergeOperator() {
mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
}
virtual bool Merge(const Slice &key,
const Slice *existing_value,
const Slice &value,
std::string *new_value,
Logger *logger) const override {
assert(new_value->empty());
++num_merge_operator_calls;
if (existing_value == nullptr) {
new_value->assign(value.data(), value.size());
return true;
}
return mergeOperator_->PartialMerge(
key,
*existing_value,
value,
new_value,
logger);
}
virtual const char *Name() const override {
return "UInt64AddOperator";
}
private:
std::shared_ptr<MergeOperator> mergeOperator_;
};
class CollocatorIterator : public Iterator {
private:
char prefixc[sizeof(uint64_t)];
Iterator *base_iterator_;
public:
CollocatorIterator(Iterator *base_iterator)
: base_iterator_(base_iterator) {}
void setPrefix(char *prefix) {
memcpy(prefixc, prefix, sizeof(uint64_t));
}
virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
virtual void SeekToLast() { base_iterator_->SeekToLast(); }
virtual void Seek(const rocksdb::Slice &s) { base_iterator_->Seek(s); }
virtual void
SeekForPrev(const rocksdb::Slice &s) { base_iterator_->SeekForPrev(s); }
virtual void Prev() { base_iterator_->Prev(); }
virtual void Next() { base_iterator_->Next(); }
virtual Slice key() const;
virtual Slice value() const;
virtual Status status() const;
virtual bool Valid() const;
bool isValid();
uint64_t intValue();
uint64_t intKey();
};
// rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
bool rocksdb::CollocatorIterator::Valid() const {
return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
}
// Non-const twin of Valid(): the underlying iterator must be positioned and
// the current key must start with the first three bytes of the stored prefix.
bool rocksdb::CollocatorIterator::isValid() {
    const bool positioned = base_iterator_->Valid();
    return positioned && key().starts_with(std::string(prefixc, 3));
}
uint64_t rocksdb::CollocatorIterator::intKey() {
return DecodeFixed64(base_iterator_->key().data());
}
uint64_t rocksdb::CollocatorIterator::intValue() {
return DecodeFixed64(base_iterator_->value().data());
}
// Vocabulary record: a word and its corpus frequency. Plain aggregate, filled
// positionally as {word, freq}.
struct VocabEntry {
    std::string word;
    uint64_t freq;
};
class CollocatorDB {
private:
WriteOptions merge_option_; // for merge
char _one[sizeof(uint64_t)];
Slice _one_slice;
vector<VocabEntry> _vocab;
uint64_t total = 0;
uint64_t sentences = 0;
float avg_window_size = 8.0;
protected:
std::shared_ptr<DB> db_;
WriteOptions put_option_;
ReadOptions get_option_;
WriteOptions delete_option_;
uint64_t default_;
std::shared_ptr<DB> OpenDb(const char *dbname);
std::shared_ptr<DB> OpenDbForRead(const char *dbname);
public:
void readVocab(string fname);
string getWord(uint32_t w1);
uint64_t getWordId(const char *word) const;
CollocatorDB(const char *db_name, bool read_only);
// public interface of CollocatorDB.
// All four functions return false
// if the underlying level db operation failed.
// mapped to a levedb Put
bool set(const std::string &key, uint64_t value) {
// just treat the internal rep of int64 as the string
char buf[sizeof(value)];
EncodeFixed64(buf, value);
Slice slice(buf, sizeof(value));
auto s = db_->Put(put_option_, key, slice);
if (s.ok()) {
return true;
} else {
std::cerr << s.ToString() << std::endl;
return false;
}
}
DB *getDb() {
return db_.get();
}
// mapped to a rocksdb Delete
bool remove(const std::string &key) {
auto s = db_->Delete(delete_option_, key);
if (s.ok()) {
return true;
} else {
std::cerr << s.ToString() << std::endl;
return false;
}
}
// mapped to a rocksdb Get
bool get(const std::string &key, uint64_t *value) {
std::string str;
auto s = db_->Get(get_option_, key, &str);
if (s.IsNotFound()) {
// return default value if not found;
*value = default_;
return true;
} else if (s.ok()) {
// deserialization
if (str.size() != sizeof(uint64_t)) {
std::cerr << "value corruption\n";
return false;
}
*value = DecodeFixed64(&str[0]);
return true;
} else {
std::cerr << s.ToString() << std::endl;
return false;
}
}
uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
char encoded_key[sizeof(uint64_t)];
EncodeFixed64(encoded_key, encodeCollocation(w1, w2, dist));
uint64_t value = default_;
get(std::string(encoded_key, 8), &value);
return value;
}
virtual void inc(const std::string &key) {
db_->Merge(merge_option_, key, _one_slice);
}
void inc(const uint64_t key) {
char encoded_key[sizeof(uint64_t)];
EncodeFixed64(encoded_key, key);
db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
}
virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(uint32_t w1, uint32_t w2, int8_t dist);
vector<Collocator> get_collocators(uint32_t w1);
vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
vector<Collocator>
get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2);
void
applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
const uint64_t sum, const int usedPositions,
int true_window_size, rocksdb::Collocator *result);
void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
string collocators2json(uint32_t w1, vector<Collocator> collocators);
// mapped to a rocksdb Merge operation
virtual bool add(const std::string &key, uint64_t value) {
char encoded[sizeof(uint64_t)];
EncodeFixed64(encoded, value);
Slice slice(encoded, sizeof(uint64_t));
auto s = db_->Merge(merge_option_, key, slice);
if (s.ok()) {
return true;
} else {
std::cerr << s.ToString() << std::endl;
return false;
}
}
CollocatorIterator *SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
};
// Opens the database named by `db_name`, read-only when `read_only` is set,
// and prepares the encoded constant 1 used as the operand of inc().
//
// The name is passed straight through to the open helpers; no copy of it is
// made, so nothing is left behind to free.
rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
    // merge_option_.sync = true;
    if (read_only)
        db_ = OpenDbForRead(db_name);
    else
        db_ = OpenDb(db_name);
    assert(db_);
    const uint64_t one = 1;
    EncodeFixed64(_one, one);
    _one_slice = Slice(_one, sizeof(uint64_t));
}
void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
inc(encodeCollocation(w1, w2, dist));
}
// Loads the vocabulary from `fname`: whitespace-separated "word frequency"
// pairs, one entry per pair, in file order. Terminates the process with
// exit code 1 when the file cannot be opened.
//
// Any previously loaded vocabulary is replaced and `total` is recomputed from
// scratch, so a placeholder or an earlier load does not leak into the sum.
//
// If `fname` contains ".vocab", a companion file with that suffix replaced by
// ".size" is consulted. When it holds a sentence count followed by a token
// count, those override `sentences` and `total` and the average window size
// is derived from them. An incomplete size file, or a sentence count of 0, is
// ignored with a warning.
void rocksdb::CollocatorDB::readVocab(string fname) {
    FILE *fin = fopen(fname.c_str(), "rb");
    if (fin == NULL) {
        cout << "Vocabulary file " << fname << " not found\n";
        exit(1);
    }

    _vocab.clear();
    total = 0;

    char strbuf[2048];
    uint64_t freq;
    // The field width keeps over-long tokens from overrunning strbuf.
    while (fscanf(fin, "%2047s %" SCNu64, strbuf, &freq) == 2) {
        _vocab.push_back({strbuf, freq});
        total += freq;
    }
    fclose(fin);

    const string::size_type ext = fname.find(".vocab");
    if (ext == string::npos) {
        std::cout << "cannot determine size file " << fname << "\n";
        return;
    }

    const string size_fname = fname.substr(0, ext) + ".size";
    FILE *fp = fopen(size_fname.c_str(), "r");
    if (fp == NULL) {
        // No size file: keep the totals derived from the vocabulary.
        return;
    }

    uint64_t file_sentences = 0;
    uint64_t file_total = 0;
    const bool complete = fscanf(fp, "%" SCNu64, &file_sentences) == 1 &&
                          fscanf(fp, "%" SCNu64, &file_total) == 1;
    fclose(fp);
    if (!complete || file_sentences == 0) {
        std::cerr << "ignoring unusable size file " << size_fname << "\n";
        return;
    }

    sentences = file_sentences;
    total = file_total;
    float sl = (float) total / (float) sentences;
    float w = WINDOW_SIZE;
    avg_window_size = ((sl > 2 * w ? (sl - 2 * w) * 2 * w : 0) + (double) w * (3 * w - 1)) / sl;
    fprintf(stdout,
            "Size corrections found: corpus size: %" PRIu64 " tokens in %" PRIu64
            " sentences, avg. sentence size: %f, avg. window size: %f\n",
            total, sentences, sl, avg_window_size);
}
// Opens "<name>.rocksdb" read-only and loads the vocabulary from
// "<name>.vocab".
//
// On failure the status is printed to stderr and the assertion fires. In
// builds without assertions an empty pointer is returned, never a pointer
// built from an uninitialised handle.
std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
    DB *db = nullptr;
    Options options;
    options.env->SetBackgroundThreads(4);
    options.create_if_missing = true;
    options.merge_operator = std::make_shared<CountMergeOperator>();
    options.max_successive_merges = 0;
    // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
    options.IncreaseParallelism();
    options.OptimizeLevelStyleCompaction();
    // Three-byte prefixes: the same length that the iterator validity check
    // compares.
    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
    ostringstream dbname, vocabname;
    dbname << name << ".rocksdb";
    auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
    if (!s.ok()) {
        std::cerr << s.ToString() << std::endl;
        assert(false);
        return std::shared_ptr<DB>();
    }
    vocabname << name << ".vocab";
    readVocab(vocabname.str());
    return std::shared_ptr<DB>(db);
}
std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
DB *db;
Options options;
options.env->SetBackgroundThreads(4);
options.create_if_missing = true;
options.merge_operator = std::make_shared<CountMergeOperator>();
options.max_successive_merges = 0;
// options.prefix_extractor.reset(NewFixedPrefixTransform(8));
options.IncreaseParallelism();
options.OptimizeLevelStyleCompaction();
// options.max_write_buffer_number = 48;
// options.max_background_jobs = 48;
// options.allow_concurrent_memtable_write=true;
// options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
// options.enable_write_thread_adaptive_yield = 1;
// options.allow_concurrent_memtable_write = 1;
// options.memtable_factory.reset(new rocksdb::SkipListFactory);
// options.write_buffer_size = 1 << 22;
// options.allow_mmap_reads = true;
// options.allow_mmap_writes = true;
// options.max_background_compactions = 40;
// BlockBasedTableOptions table_options;
// table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
// options.bloom_locality = 1;
// std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
// table_options.block_cache = cache;
// options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Status s;
// DestroyDB(dbname, Options());
s = DB::Open(options, dbname, &db);
if (!s.ok()) {
std::cerr << s.ToString() << std::endl;
assert(false);
}
total = 1000;
return std::shared_ptr<DB>(db);
}
// Creates a prefix-restricted iterator and positions it at the encoded
// collocation (w1, w2, dist). The seek target is the first six bytes of the
// encoded key when w2 is non-zero, otherwise the first three. The full
// encoded key is stored in the iterator as its prefix.
//
// The returned object is allocated with new; the caller owns it.
CollocatorIterator *rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
    char encoded[sizeof(uint64_t)];
    EncodeFixed64(encoded, encodeCollocation(w1, w2, dist));

    ReadOptions read_opts;
    read_opts.prefix_same_as_start = true;
    Iterator *raw = db_->NewIterator(read_opts);
    CollocatorIterator *wrapped = new CollocatorIterator(raw);

    const size_t seek_length = (w2 > 0) ? 6 : 3;
    wrapped->Seek(std::string(encoded, seek_length));
    wrapped->setPrefix(encoded);
    return wrapped;
}
void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
for (; it->isValid(); it->Next()) {
uint64_t value = it->intValue();
uint64_t key = it->intKey();
std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value
<< std::endl;
}
std::cout << "ready dumping\n";
}
// Ordering predicates for std::sort: each one puts the collocator with the
// larger score first (descending order).
bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) {
    return rhs.npmi < lhs.npmi;
}

bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) {
    return rhs.lfmd < lhs.lfmd;
}

bool sortByLlr(const Collocator &lhs, const Collocator &rhs) {
    return rhs.llr < lhs.llr;
}

bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) {
    return rhs.logdice < lhs.logdice;
}

bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) {
    return rhs.ldaf < lhs.ldaf;
}
void rocksdb::CollocatorDB::applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
const uint64_t sum, const int usedPositions, int true_window_size,
rocksdb::Collocator *result) {
uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
double o = sum,
r1 = f1 * true_window_size,
c1 = f2,
e = r1 * c1 / total,
pmi = log2(o / e),
md = log2(o * o / e),
lfmd = log2(o * o * o / e),
llr = ca_ll(f1, f2, sum, total, true_window_size);
double ld = ca_logdice(f1, f2, sum, total, true_window_size);
int bestWindow = usedPositions;
double bestAF = ld;
double currentAF;
// if(f1<75000000)
//#pragma omp parallel for reduction(max:bestAF)
// #pragma omp target teams distribute parallel for reduction(max:bestAF) map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
for (int bitmask = 1; bitmask < (1 << (2 * WINDOW_SIZE)); bitmask++) {
if ((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0) continue;
uint64_t currentWindowSum = 0;
// #pragma omp target teams distribute parallel for reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
for (int pos = 0; pos < 2 * WINDOW_SIZE; pos++) {
if (((1 << pos) & bitmask & usedPositions) != 0)
currentWindowSum += sumWindow[pos];
}
currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask));
if (currentAF > bestAF) {
bestAF = currentAF;
bestWindow = bitmask;
}
}
*result = {w2,
f2,
sum,
pmi,
pmi / (-log2(o / total / true_window_size)),
llr,
lfmd,
md,
sumWindow[WINDOW_SIZE],
sumWindow[WINDOW_SIZE - 1],
ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
ca_pmi(f1, f2, sumWindow[WINDOW_SIZE - 1], total, 1),
ca_dice(f1, f2, sum, total, true_window_size),
ld,
bestAF,
usedPositions,
bestWindow
};
}
std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2) {
std::vector<Collocator> collocators;
uint64_t w2, last_w2 = 0xffffffffffffffff;
uint64_t maxv = 0, sum = 0;
uint64_t *sumWindow = (uint64_t *) malloc(sizeof(uint64_t) * 2 * WINDOW_SIZE);
memset(sumWindow, 0, sizeof(uint64_t) * 2 * WINDOW_SIZE);
int true_window_size = 1;
int usedPositions = 0;
if (w1 > _vocab.size()) {
std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
w1 -= _vocab.size();
}
#ifdef DEBUG
std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
#endif
// #pragma omp parallel num_threads(40)
// #pragma omp single
for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0)); it->isValid(); it->Next()) {
uint64_t value = it->intValue(),
key = it->intKey();
if ((w2 = W2(key)) > max_w2)
continue;
if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
if (w2 != last_w2) {
if (sum >= FREQUENCY_THRESHOLD) {
collocators.push_back({});
rocksdb::Collocator *result = &(collocators[collocators.size() - 1]);
// #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions, true_window_size) shared(w1, result) if(sum > 1000000)
{
// uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2 *WINDOW_SIZE);
// memcpy(nsw, sumWindow, sizeof(uint64_t) * 2 *WINDOW_SIZE);
applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions, true_window_size, result);
// free(nsw);
}
}
memset(sumWindow, 0, 2 * WINDOW_SIZE * sizeof(uint64_t));
usedPositions = 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
last_w2 = w2;
maxv = value;
sum = value;
true_window_size = 1;
if (min_w2 == max_w2 && w2 != min_w2)
break;
} else {
sum += value;
if (value > maxv)
maxv = value;
usedPositions |= 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
true_window_size++;
}
}
// #pragma omp taskwait
sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
#ifdef DEBUG
int i=0;
for (Collocator c : collocators) {
if(i++>10) break;
std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word << "*"
<< "\t f(w1):" << _vocab[w1].freq
<< "\t f(w2):" << _vocab[c.w2].freq
<< "\t f(w1, w2):" << c.raw
<< "\t pmi:" << c.pmi
<< "\t npmi:" << c.npmi
<< "\t llr:" << c.llr
<< "\t md:" << c.md
<< "\t lfmd:" << c.lfmd
<< "\t total:" << total
<< std::endl;
}
#endif
return collocators;
}
// Scores for the single pair (w1, w2): a range query whose lower and upper
// collocate bounds are both w2.
std::vector<Collocator> rocksdb::CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
    const uint32_t only_w2 = w2;
    return get_collocators(w1, only_w2, only_w2);
}
// All collocates of w1: a range query over the full id range 0..UINT32_MAX.
std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
    const uint32_t lowest = 0;
    const uint32_t highest = UINT32_MAX;
    return get_collocators(w1, lowest, highest);
}
void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
std::vector<Collocator> collocators;
std::stringstream stream;
uint64_t w2, last_w2 = 0xffffffffffffffff;
uint64_t maxv = 0, total_w1 = 0;
bool first = true;
for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
uint64_t value = it->intValue(),
key = it->intKey();
w2 = W2(key);
total_w1 += value;
if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
if (w2 != last_w2) {
if (maxv >= min_cooccur) {
double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
if (first)
first = false;
else
stream << " ";
stream << w2 << " " << llr;
}
last_w2 = w2;
maxv = value;
} else {
if (value > maxv)
maxv = value;
}
}
if (first)
stream << "1 0.0";
stream << "\n";
std::cout << stream.str();
}
// Pass-through accessors: key, value and status come straight from the
// underlying iterator.
rocksdb::Slice rocksdb::CollocatorIterator::key() const {
    return base_iterator_->key();
}

rocksdb::Slice rocksdb::CollocatorIterator::value() const {
    return base_iterator_->value();
}

rocksdb::Status rocksdb::CollocatorIterator::status() const {
    return base_iterator_->status();
}
};
// Returns the word stored at vocabulary index w1, or an empty string when w1
// is not a valid index (instead of reading past the end of the vocabulary).
string rocksdb::CollocatorDB::getWord(uint32_t w1) {
    if (w1 >= _vocab.size())
        return string();
    return _vocab[w1].word;
}
uint64_t rocksdb::CollocatorDB::getWordId(const char *word) const {
for (uint64_t i = 0; i < _vocab.size(); i++) {
if (strcmp(_vocab[i].word.c_str(), word) == 0)
return i;
}
return 0;
}
// Escapes `raw` for use inside a JSON string literal: double quotes and
// backslashes get a backslash prefix, control characters below 0x20 become
// \u00XX. All other bytes are copied unchanged.
static std::string collocatorJsonEscape(const std::string &raw) {
    static const char hex[] = "0123456789abcdef";
    std::string out;
    out.reserve(raw.size());
    for (const char ch : raw) {
        const unsigned char u = static_cast<unsigned char>(ch);
        if (ch == '"' || ch == '\\') {
            out += '\\';
            out += ch;
        } else if (u < 0x20) {
            out += "\\u00";
            out += hex[u >> 4];
            out += hex[u & 0x0f];
        } else {
            out += ch;
        }
    }
    return out;
}

// Serialises the collocators of w1 as a JSON object with the keys "f1", "w1",
// "N" and "collocates".
//
// Collocates whose word starts with "quot" are skipped, the word "<num>" is
// written as "###", and output stops once the emission counter passes 200.
// Words are JSON-escaped so that quotes or backslashes in the vocabulary
// cannot break the document. Collocates whose id is not in the vocabulary are
// skipped; an out-of-range w1 is written with frequency 0 and an empty word.
string rocksdb::CollocatorDB::collocators2json(uint32_t w1, vector<Collocator> collocators) {
    ostringstream s;
    int i = 0;
    const bool w1_known = w1 < _vocab.size();
    const uint64_t f1 = w1_known ? _vocab[w1].freq : 0;
    const string w1_word = w1_known ? _vocab[w1].word : string();
    s << " { \"f1\": " << f1 << "," <<
      "\"w1\":\"" << collocatorJsonEscape(w1_word) << "\", " <<
      "\"N\": " << total << ", " <<
      "\"collocates\": [";
    bool first = true;
    for (const Collocator &c : collocators) {
        if (c.w2 >= _vocab.size()) continue;
        const string &word = _vocab[c.w2].word;
        if (strncmp(word.c_str(), "quot", 4) == 0) continue;
        if (i++ > 200)
            break;
        if (!first)
            s << ",\n";
        else
            first = false;
        s << "{"
             "\"word\":\"" << (word == "<num>" ? string("###") : collocatorJsonEscape(word))
          << "\"," <<
          "\"f2\":" << c.f2 << "," <<
          "\"f\":" << c.raw << "," <<
          "\"npmi\":" << c.npmi << "," <<
          "\"pmi\":" << c.pmi << "," <<
          "\"llr\":" << c.llr << "," <<
          "\"lfmd\":" << c.lfmd << "," <<
          "\"md\":" << c.md << "," <<
          "\"dice\":" << c.dice << "," <<
          "\"ld\":" << c.logdice << "," <<
          "\"ln_count\":" << c.left_raw << "," <<
          "\"rn_count\":" << c.right_raw << "," <<
          "\"ln_pmi\":" << c.left_pmi << "," <<
          "\"rn_pmi\":" << c.right_pmi << "," <<
          "\"ldaf\":" << c.ldaf << "," <<
          "\"win\":" << c.window << "," <<
          "\"afwin\":" << c.af_window <<
          "}";
    }
    s << "]}\n";
    // std::cout << s.str();
    return s.str();
}
typedef rocksdb::CollocatorDB COLLOCATORS;
extern "C" {
#ifdef __clang__
#pragma clang diagnostic push
#pragma ide diagnostic ignored "OCUnusedGlobalDeclarationInspection"
#endif
// C entry point: opens the database `dbname` in read-write mode. The returned
// object is allocated with new.
DLL_EXPORT COLLOCATORS *open_collocatordb_for_write(char *dbname) {
    const bool read_only = false;
    return new rocksdb::CollocatorDB(dbname, read_only);
}
// C entry point: opens the database `dbname` in read-only mode. The returned
// object is allocated with new.
DLL_EXPORT COLLOCATORS *open_collocatordb(char *dbname) {
    const bool read_only = true;
    return new rocksdb::CollocatorDB(dbname, read_only);
}
// C entry point: adds one to the count of the collocation (w1, w2, dist).
DLL_EXPORT void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
    COLLOCATORS &store = *db;
    store.inc(w1, w2, dist);
}
// C entry point: prints the entries reachable from (w1, w2, dist).
DLL_EXPORT void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
    COLLOCATORS &store = *db;
    store.dump(w1, w2, dist);
}
// C entry point: returns a malloc'ed copy of the scored collocates of w1, or
// NULL when there are none or the allocation fails.
//
// The buffer holds c.size() consecutive Collocator records; its size is the
// element count multiplied by the record size. The declared return type is
// kept for compatibility with existing callers. The buffer is allocated with
// malloc, so it must be released with free().
DLL_EXPORT COLLOCATORS *get_collocators(COLLOCATORS *db, uint32_t w1) {
    std::vector<Collocator> c = db->get_collocators(w1);
    if (c.empty())
        return NULL;
    const size_t size = c.size() * sizeof c[0];
    void *p = malloc(size);
    if (p == NULL)
        return NULL;
    memcpy(p, c.data(), size);
    return (COLLOCATORS *) p;
}
// C entry point: returns a malloc'ed copy of the scores for the pair
// (w1, w2), or NULL when there are none or the allocation fails.
//
// The buffer holds c.size() consecutive Collocator records; its size is the
// element count multiplied by the record size. The declared return type is
// kept for compatibility with existing callers. The buffer is allocated with
// malloc, so it must be released with free().
DLL_EXPORT COLLOCATORS *get_collocation_scores(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
    std::vector<Collocator> c = db->get_collocation_scores(w1, w2);
    if (c.empty())
        return NULL;
    const size_t size = c.size() * sizeof c[0];
    void *p = malloc(size);
    if (p == NULL)
        return NULL;
    memcpy(p, c.data(), size);
    return (COLLOCATORS *) p;
}
// C entry point: returns a strdup'ed copy of the word with id `w`. The copy
// is made with strdup, so it must be released with free().
DLL_EXPORT char *get_word(COLLOCATORS *db, uint32_t w) {
    const std::string word = db->getWord(w);
    return strdup(word.c_str());
}
// C entry point: looks up the vocabulary index of `word`.
DLL_EXPORT uint64_t get_word_id(COLLOCATORS *db, char *word) {
    const uint64_t id = db->getWordId(word);
    return id;
}
// C entry point: loads the vocabulary file `fname` into `db`.
DLL_EXPORT void read_vocab(COLLOCATORS *db, char *fname) {
    db->readVocab(std::string(fname));
}
// C entry point: the collocates of w1 as a JSON document in a strdup'ed
// buffer. The buffer comes from strdup, so it must be released with free().
DLL_EXPORT const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
    const std::string json = db->collocators2json(w1, db->get_collocators(w1));
    return strdup(json.c_str());
}
// C entry point: the scores of the pair (w1, w2) as a JSON document in a
// strdup'ed buffer. The buffer comes from strdup, so it must be released
// with free().
DLL_EXPORT const char *get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
    const std::string json = db->collocators2json(w1, db->get_collocation_scores(w1, w2));
    return strdup(json.c_str());
}
// C entry point: returns the PROJECT_VERSION string. The pointer is not
// allocated here and must not be freed.
DLL_EXPORT const char *get_version() {
    return PROJECT_VERSION;
}
// Restore the diagnostic state saved by the matching push at the start of the
// extern "C" section.
#ifdef __clang__
#pragma clang diagnostic pop
#endif
}