on CentOS, Fedora, RHEL
sudo yum install cmake3 snappy snappy-devel zlib zlib-devel bzip2 bzip2-devel lz4-devel libzstd-devel libomp-devel
git clone https://github.com/gflags/gflags.git
cd gflags
git checkout v2.0
./configure && make && sudo make install
cd ..
on Ubuntu, Debian
sudo apt-get install cmake libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev libomp-dev
on MacOS
brew install cmake snappy zlib bzip2 lz4 zstd libomp gflags
install our fork of RocksDB v5.11.3.fb
on Linux
git clone https://github.com/kupietz/rocksdb.git -b 5.11.fb --single-branch
cd rocksdb
make -j $(nproc) static_lib DISABLE_WARNING_AS_ERROR=1 && sudo make install-static DISABLE_WARNING_AS_ERROR=1
make -j $(nproc) shared_lib DISABLE_WARNING_AS_ERROR=1 && sudo make install-shared DISABLE_WARNING_AS_ERROR=1
cd build
ldconfig
on MacOS
git clone https://github.com/kupietz/rocksdb.git -b 5.11.fb --single-branch
cd rocksdb
mkdir -p build
cd build
cmake .. -DWITH_SNAPPY=1 -DWITH_LZ4=1 -DWITH_ZLIB=1 -DWITH_GFLAGS=1 -DCMAKE_INSTALL_LIBDIR=/usr/local/lib
make -j $(sysctl -n hw.ncpu) && sudo make install
git clone "https://korap.ids-mannheim.de/gerrit/ids-kl/collocatordb"
cd collocatordb
mkdir -p build
cd build
cmake -DCMAKE_INSTALL_PREFIX=/usr/local ..
make && ctest --extra-verbose && sudo make install
typedef struct {
  uint32_t w2;
  uint64_t f2;
  uint64_t raw;
  double pmi;
  double npmi;
  double llr;
  double lfmd;
  double md;
  double md_nws;
  uint64_t left_raw;
  uint64_t right_raw;
  double left_pmi;
  double right_pmi;
  double dice;
  double logdice;
  double ldaf;
  int window;
  int af_window;
} COLLOCATOR;

COLLOCATORDB *open_collocatordb(const char *s);
COLLOCATORDB *open_collocatordb_for_write(const char *s);
void inc_collocator(COLLOCATORDB *db, uint64_t w1, uint64_t w2, int8_t dist);
void dump_collocators(COLLOCATORDB *db, uint32_t w1, uint32_t w2, int8_t dist);
COLLOCATOR *get_collocators(COLLOCATORDB *db, uint32_t w1);
COLLOCATOR *get_collocation_scores(COLLOCATORDB *db, uint32_t w1, uint32_t w2);
char *get_collocators_as_json(COLLOCATORDB *db, uint32_t w1);
char *get_collocation_scores_as_json(COLLOCATORDB *db, uint32_t w1, uint32_t w2);
char *get_word(COLLOCATORDB *db, uint32_t w1);
void read_vocab(COLLOCATORDB *db, char *fname);
char *get_version();
uint64_t get_word_id(COLLOCATORDB *db, const char *word);
uint64_t get_corpus_size(COLLOCATORDB *db);
uint64_t get_word_frequency(COLLOCATORDB *db, uint64_t w);
v1.4.0.9000 (unpublished)
- get_corpus_size(), which returns the total token count
- get_word_frequency(), which returns the absolute frequency of a word in the corpus

v1.4.0 (2024-11-23)
- collocatordb_query command line tool
- get_word_id(), which returns the ID of a word
- md_nws: MI² score based on nominal window size (=10) instead of actual window size, for which only positions are counted where the collocate actually occurs

v1.3.2 (2024-11-15)
- get_version(), which returns the version string

v1.3.1 (2024-11-14)
Based on RocksDB, CollocatorDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory). You may select, at your option, one of the above-listed licenses.