blob: d6d236e65c513df09586403c9e8e04b2900af1fa [file] [log] [blame]
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define __USE_XOPEN_EXTENDED
#include <ftw.h>
#include "../src/collocatordb.h"
#include "acutest.h"
char dbpath[] = "../tests/data/wpd19_10000";
const int testword = 10; // ist
/*
 * Smoke test: opening the fixture database at dbpath for reading must
 * yield a non-NULL handle.
 */
void test_open_db() {
    COLLOCATORDB *cdb;

    cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);
}
/*
 * Looks up the word stored under id `testword` and checks that it is "ist".
 */
void test_get_word() {
    COLLOCATORDB *cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);

    const char *expected = "ist";
    char *word = get_word(cdb, testword);
    /* Stop here on a failed lookup: strcmp() and "%s" on NULL are undefined. */
    TEST_ASSERT(word != NULL);

    TEST_CHECK(strcmp(word, expected) == 0);
    TEST_MSG("Expected: %s", expected);
    TEST_MSG("Produced: %s", word);
}
/*
 * Requests the association scores for the word-id pair (62, 966) as JSON and
 * compares the result byte-for-byte with a reference string.
 *
 * NOTE(review): judging by the reference JSON, id 62 is "Aluminium" and the
 * collocate is "Anwendungstechnologie" -- confirm against the fixture vocab.
 */
void test_collocation_scores() {
    COLLOCATORDB *cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);

    const char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"md_nws\":10.1373,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n";
    char *produced = get_collocation_scores_as_json(cdb, 62, 966);
    /* Stop here if no JSON was produced: strcmp() and "%s" on NULL are undefined. */
    TEST_ASSERT(produced != NULL);

    TEST_CHECK(strcmp(produced, expected) == 0);
    TEST_MSG("Expected: %s", expected);
    TEST_MSG("Produced: %s", produced);
}
/*
 * Runs the collocation analysis for word id `testword` and checks that the
 * JSON output contains the expected entry for the collocate "um".
 */
void test_collocation_analysis_as_json() {
    COLLOCATORDB *cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);

    char *json = get_collocators_as_json(cdb, testword);
    /* Stop here if no JSON was produced: strstr() on NULL is undefined. */
    TEST_ASSERT(json != NULL);

    const char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"md_nws\":0.363854,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668";
    /* strstr() returns a pointer; test it against NULL rather than ordering it
     * against the integer 0, which is not a valid pointer comparison in ISO C. */
    TEST_CHECK(strstr(json, needle) != NULL);
    TEST_MSG("Expected to contain: %s", needle);
}
/*
 * Runs the collocation analysis for word id 62 and checks that the first
 * collocate returned resolves to the word "Anwendungstechnologie".
 */
void test_collocation_analysis() {
    COLLOCATORDB *cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);

    const char *expected = "Anwendungstechnologie";
    const COLLOCATOR *c = get_collocators(cdb, 62);
    /* Same guard as test_writing: c[0] must not be read through a NULL result. */
    TEST_ASSERT(c != NULL);

    char *produced = get_word(cdb, c[0].w2);
    /* Stop here on a failed lookup: strcmp() and "%s" on NULL are undefined. */
    TEST_ASSERT(produced != NULL);

    TEST_CHECK(strcmp(produced, expected) == 0);
    TEST_MSG("Expected: %s", expected);
    TEST_MSG("Produced: %s", produced);
}
/*
 * nftw() visitor that deletes the entry it is handed.
 *
 * fpath is passed to remove(); sb, typeflag and ftwbuf are part of the
 * callback signature but are not used. On failure the error is reported via
 * perror(). Returns the result of remove(): 0 on success, non-zero otherwise.
 */
int unlink_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
    (void)sb;
    (void)typeflag;
    (void)ftwbuf;

    const int status = remove(fpath);
    if (status != 0) {
        perror(fpath);
    }
    return status;
}
/*
 * Recursively deletes `path` by walking it with nftw() and calling
 * unlink_cb() on every entry. Returns whatever nftw() returns.
 */
int rmrf(char *path) {
    /* FTW_DEPTH reports a directory's contents before the directory itself, so
     * it is empty by the time it is removed; FTW_PHYS keeps the walk from
     * following symbolic links. */
    const int walk_flags = FTW_DEPTH | FTW_PHYS;
    const int max_open_fds = 64;

    return nftw(path, unlink_cb, max_open_fds, walk_flags);
}
/*
 * Write-path round trip: creates a fresh database next to a unique temporary
 * file name, loads a three-word vocabulary, records co-occurrences with
 * inc_collocator() and checks the counts reported for word 0's first
 * collocate.
 *
 * All scratch files (the mkstemp() placeholder, the vocabulary file and the
 * database directory) are removed again at the end of a successful run.
 */
void test_writing() {
    char tmp_template[] = "/tmp/tmpfileXXXXXX";
    /* sizeof counts the terminating NUL of both operands, so each buffer is
     * large enough for the template followed by its suffix. */
    char rocksdbfn[sizeof tmp_template + sizeof ".rocksdb"];
    char vocabfn[sizeof tmp_template + sizeof ".vocab"];
    int i;

    /* mkstemp() is used only to reserve a unique base name. */
    int fd = mkstemp(tmp_template);
    if (fd == -1) {
        perror("mkstemp");
    }
    /* Fail this test instead of calling exit(), which would end the runner. */
    TEST_ASSERT(fd != -1);
    close(fd);

    snprintf(rocksdbfn, sizeof rocksdbfn, "%s.rocksdb", tmp_template);
    snprintf(vocabfn, sizeof vocabfn, "%s.vocab", tmp_template);

    COLLOCATORDB *cdb = open_collocatordb_for_write(rocksdbfn);
    TEST_ASSERT(cdb != NULL);

    FILE *h = fopen(vocabfn, "w");
    TEST_ASSERT(h != NULL);
    fprintf(h, "word0 2000\n");
    fprintf(h, "word1 2000\n");
    fprintf(h, "word2 2000\n");
    TEST_CHECK(fclose(h) == 0);
    read_vocab(cdb, vocabfn);

    /* The pair (0, 1) is incremented once here and twice per loop iteration,
     * i.e. 1 + 2 * 1000 = 2001 times, matching the c[0].raw check below. */
    inc_collocator(cdb, 0, 1, 4);
    for (i = 0; i < 1000; i++) {
        /* Unary minus binds tighter than %, so -i % 5 is (-i) % 5 and yields
         * values in -4..0, mirroring the 0..4 produced by i % 5. */
        inc_collocator(cdb, 0, 1, i % 5);
        inc_collocator(cdb, 0, 1, -i % 5);
        inc_collocator(cdb, 1, 0, i % 5);
        inc_collocator(cdb, 1, 0, -i % 5);
        inc_collocator(cdb, 0, 2, i % 5);
        inc_collocator(cdb, 0, 2, -i % 5);
    }
    inc_collocator(cdb, 1, 2, 4);

    COLLOCATOR *c = get_collocators(cdb, 0);
    TEST_ASSERT(c != NULL);
    TEST_CHECK(c[0].w2 == 1);
    TEST_CHECK(c[0].raw == 2001);
    TEST_CHECK(c[0].left_raw == 200);
    TEST_CHECK(c[0].right_raw == 200);

    /* Remove everything this test created. */
    rmrf(rocksdbfn);
    remove(vocabfn);
    remove(tmp_template);
}
/*
 * Checks that get_version() reports the version string this suite was
 * written against ("1.4.0").
 */
void test_version_function() {
    char *version;

    version = get_version();
    TEST_CHECK(strcmp(version, "1.4.0") == 0);
    TEST_MSG("Unexpected version: %s", version);
}
/*
 * Reverse lookup: the word "ist" must resolve to id 10 in the fixture
 * database.
 */
void test_get_word_id() {
    COLLOCATORDB *cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);

    uint64_t id = get_word_id(cdb, "ist");
    TEST_CHECK(id == 10);
    /* PRIu64 is the matching conversion for uint64_t; "%lu" is only correct
     * where uint64_t happens to be unsigned long. */
    TEST_MSG("Unexpected word id: %" PRIu64, id);
}
/*
 * Runs the collocatordb_query command-line tool on the fixture database with
 * the query word "ist" (output discarded) and expects system() to return 0.
 */
void test_collocatordb_query_command_line_tool() {
    int result = system("../build/collocatordb_query ../tests/data/wpd19_10000 ist > /dev/null 2>&1");
    TEST_CHECK(result == 0);
    /* The diagnostic now names the tool correctly (it read "collectordb_query"). */
    TEST_MSG("collocatordb_query command failed with result: %d", result);
}
/*
 * The fixture database must report a corpus size of 152743.
 */
void test_get_corpus_size() {
    COLLOCATORDB *cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);

    uint64_t size = get_corpus_size(cdb);
    TEST_CHECK(size == 152743);
    /* PRIu64 is the matching conversion for uint64_t; "%lu" is only correct
     * where uint64_t happens to be unsigned long. */
    TEST_MSG("Unexpected corpus size: %" PRIu64, size);
}
/*
 * Resolves the word "Test" to its id and checks that its frequency in the
 * fixture database is 3.
 */
void test_get_word_frequency() {
    COLLOCATORDB *cdb = open_collocatordb(dbpath);
    TEST_ASSERT(cdb != NULL);

    /* Kept as uint64_t, like in test_get_word_id, so the id is not narrowed to
     * int on its way into get_word_frequency(). */
    uint64_t w1 = get_word_id(cdb, "Test");
    uint64_t freq = get_word_frequency(cdb, w1);
    TEST_CHECK(freq == 3);
    /* PRIu64 is the matching conversion for uint64_t; "%lu" is only correct
     * where uint64_t happens to be unsigned long. */
    TEST_MSG("Unexpected word frequency: %" PRIu64, freq);
}
/*
 * Test registry: each entry pairs a display name with the test function that
 * implements it. The { NULL, NULL } entry terminates the list.
 */
TEST_LIST = {
{ "open database for reading", test_open_db },
{ "get word", test_get_word },
{ "collocation scores", test_collocation_scores },
{ "collocation analysis", test_collocation_analysis },
{ "collocation analysis as json", test_collocation_analysis_as_json },
{ "writing", test_writing },
{ "version function", test_version_function },
{ "get word id", test_get_word_id },
{ "get corpus size", test_get_corpus_size},
{ "get word frequency", test_get_word_frequency},
{ "collocatordb_query command line tool", test_collocatordb_query_command_line_tool},
{ NULL, NULL } /* end-of-list sentinel */
};