blob: d6d236e65c513df09586403c9e8e04b2900af1fa [file] [log] [blame]
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define __USE_XOPEN_EXTENDED
#include <ftw.h>
#include "../src/collocatordb.h"
#include "acutest.h"
8
/* Location of the sample database that the lookup tests open. */
char dbpath[] = "../tests/data/wpd19_10000";

/* Word id used by the lookup tests; test_get_word expects it to map to "ist". */
const int testword = 10;
11
12void test_open_db() {
Marc Kupietz5ffc4742024-11-15 15:45:12 +010013 COLLOCATORDB* cdb = open_collocatordb(dbpath);
Marc Kupietz6663f112021-03-14 09:20:59 +010014 TEST_ASSERT(cdb != NULL);
15}
16
17void test_get_word() {
Marc Kupietz5ffc4742024-11-15 15:45:12 +010018 COLLOCATORDB* cdb = open_collocatordb(dbpath);
Marc Kupietz6663f112021-03-14 09:20:59 +010019 TEST_ASSERT(cdb != NULL);
20 char *word = get_word(cdb, testword);
21 char *expected = "ist";
22 TEST_CHECK(strcmp(word, expected) == 0);
23 TEST_MSG("Expected: %s", expected);
24 TEST_MSG("Produced: %s", word);
25}
26
27void test_collocation_scores() {
Marc Kupietz5ffc4742024-11-15 15:45:12 +010028 COLLOCATORDB* cdb = open_collocatordb(dbpath);
Marc Kupietz6663f112021-03-14 09:20:59 +010029 TEST_ASSERT(cdb != NULL);
Marc Kupietze889cec2024-11-23 12:08:42 +010030 char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"md_nws\":10.1373,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n";
Marc Kupietz6663f112021-03-14 09:20:59 +010031 char *produced = get_collocation_scores_as_json(cdb, 62, 966);
32 TEST_CHECK(strcmp(produced, expected) == 0);
33 TEST_MSG("Expected: %s", expected);
34 TEST_MSG("Produced: %s", produced);
35}
36
37
38void test_collocation_analysis_as_json() {
Marc Kupietz5ffc4742024-11-15 15:45:12 +010039 COLLOCATORDB* cdb = open_collocatordb(dbpath);
Marc Kupietz6663f112021-03-14 09:20:59 +010040 TEST_ASSERT(cdb != NULL);
41 char *json = get_collocators_as_json(cdb, testword);
Marc Kupietze889cec2024-11-23 12:08:42 +010042 char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"md_nws\":0.363854,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668";
Marc Kupietz6663f112021-03-14 09:20:59 +010043 TEST_CHECK(strstr(json, needle) > 0);
44 TEST_MSG("Expected to contain: %s", needle);
Marc Kupietz6663f112021-03-14 09:20:59 +010045}
46
47void test_collocation_analysis() {
Marc Kupietz5ffc4742024-11-15 15:45:12 +010048 COLLOCATORDB* cdb = open_collocatordb(dbpath);
Marc Kupietz6663f112021-03-14 09:20:59 +010049 TEST_ASSERT(cdb != NULL);
50 char *expected = "Anwendungstechnologie";
51 const COLLOCATOR *c = get_collocators(cdb, 62);
52 char *produced = get_word(cdb,c[0].w2);
53 TEST_CHECK(strcmp(produced, expected) == 0);
54 TEST_MSG("Expected: %s", expected);
55 TEST_MSG("Produced: %s", produced);
Marc Kupietz6663f112021-03-14 09:20:59 +010056}
57
/*
 * Tree-walk callback (passed to nftw() by rmrf): removes the entry at
 * `fpath`. On failure the error is reported with perror() and the non-zero
 * result of remove() is returned. `sb`, `typeflag` and `ftwbuf` are part of
 * the callback signature but are not used.
 */
int unlink_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
  (void)sb;
  (void)typeflag;
  (void)ftwbuf;

  const int status = remove(fpath);
  if (status != 0) {
    perror(fpath);
  }
  return status;
}
64
65int rmrf(char *path) {
66 return nftw(path, unlink_cb, 64, FTW_DEPTH | FTW_PHYS);
67}
68
69void test_writing() {
Marc Kupietzb4ade0f2024-11-23 10:12:22 +010070 char tmp_template[] = "/tmp/tmpfileXXXXXX";
71 int fd = mkstemp(tmp_template);
72 if (fd == -1) {
73 perror("mkstemp");
74 exit(EXIT_FAILURE);
75 }
76 close(fd);
77 char *tmp = strdup(tmp_template);
78
Marc Kupietz6663f112021-03-14 09:20:59 +010079 long size = 0;
Marc Kupietz673bd812021-03-14 17:27:44 +010080 int i;
Marc Kupietz6663f112021-03-14 09:20:59 +010081
Marc Kupietzb4ade0f2024-11-23 10:12:22 +010082 char *rocksdbfn = malloc(strlen(tmp) + strlen(".rocksdb") + 1);
83 strcpy(rocksdbfn, tmp);
Marc Kupietz6663f112021-03-14 09:20:59 +010084 strcat(rocksdbfn, ".rocksdb");
85 COLLOCATORDB *cdb = open_collocatordb_for_write(rocksdbfn);
86
Marc Kupietzb4ade0f2024-11-23 10:12:22 +010087 char *vocabfn = malloc(strlen(tmp) + strlen(".vocab") + 1);
Marc Kupietz6663f112021-03-14 09:20:59 +010088 strcpy(vocabfn, tmp);
89 strcat(vocabfn, ".vocab");
90 FILE *h = fopen(vocabfn, "w");
91 fprintf(h, "word0 2000\n");
92 fprintf(h, "word1 2000\n");
93 fprintf(h, "word2 2000\n");
94 fclose(h);
95 read_vocab(cdb, vocabfn);
96 inc_collocator(cdb, 0, 1, 4); size++;
Marc Kupietzb4ade0f2024-11-23 10:12:22 +010097 for (i = 0; i < 1000; i++) {
Marc Kupietz6663f112021-03-14 09:20:59 +010098 inc_collocator(cdb, 0, 1, i % 5); size++;
99 inc_collocator(cdb, 0, 1, -i % 5); size++;
100 inc_collocator(cdb, 1, 0, i % 5); size++;
101 inc_collocator(cdb, 1, 0, -i % 5); size++;
102 inc_collocator(cdb, 0, 2, i % 5); size++;
103 inc_collocator(cdb, 0, 2, -i % 5); size++;
104 }
105 inc_collocator(cdb, 1, 2, 4); size++;
106 COLLOCATOR *c = get_collocators(cdb, 0);
107 TEST_ASSERT(c != NULL);
Marc Kupietz1b09e4d2021-03-14 15:20:19 +0100108 TEST_CHECK(c[0].w2 == 1);
Marc Kupietz6663f112021-03-14 09:20:59 +0100109 TEST_CHECK(c[0].raw == 2001);
110 TEST_CHECK(c[0].left_raw == 200);
111 TEST_CHECK(c[0].right_raw == 200);
112
113 rmrf(rocksdbfn);
114}
Marc Kupietzb4ade0f2024-11-23 10:12:22 +0100115
/* Checks that get_version() reports exactly "1.4.0". */
void test_version_function() {
  char *version;

  version = get_version();
  TEST_CHECK(strcmp(version, "1.4.0") == 0);
  TEST_MSG("Unexpected version: %s", version);
}
Marc Kupietz6663f112021-03-14 09:20:59 +0100121
Marc Kupietz6e2dbd12024-11-23 10:11:32 +0100122void test_get_word_id() {
123 COLLOCATORDB* cdb = open_collocatordb(dbpath);
124 TEST_ASSERT(cdb != NULL);
125 uint64_t id = get_word_id(cdb, "ist");
126 TEST_CHECK(id == 10);
127 TEST_MSG("Unexpected word id: %lu", id);
128}
129
/*
 * Runs the collocatordb_query binary from ../build against the sample
 * database with the query word "ist" and expects system() to report 0.
 * The tool's output is discarded.
 */
void test_collocatordb_query_command_line_tool() {
  int result = system("../build/collocatordb_query ../tests/data/wpd19_10000 ist > /dev/null 2>&1");
  TEST_CHECK(result == 0);
  /* The message previously misspelled the tool as "collectordb_query". */
  TEST_MSG("collocatordb_query command failed with result: %d", result);
}
135
Marc Kupietzd26b1052024-12-10 16:56:39 +0100136void test_get_corpus_size() {
137 COLLOCATORDB* cdb = open_collocatordb(dbpath);
138 TEST_ASSERT(cdb != NULL);
139 uint64_t size = get_corpus_size(cdb);
140 TEST_CHECK(size == 152743);
141 TEST_MSG("Unexpected corpus size: %lu", size);
142}
143
Marc Kupietz21b964c2024-12-10 17:10:50 +0100144void test_get_word_frequency() {
145 COLLOCATORDB* cdb = open_collocatordb(dbpath);
146 TEST_ASSERT(cdb != NULL);
147 int w1 = get_word_id(cdb, "Test");
148 uint64_t freq = get_word_frequency(cdb, w1);
149 TEST_CHECK(freq == 3);
150 TEST_MSG("Unexpected word frequency: %lu", freq);
151}
152
Marc Kupietz6663f112021-03-14 09:20:59 +0100153TEST_LIST = {
154 { "open database for reading", test_open_db },
155 { "get word", test_get_word },
156 { "collocation scores", test_collocation_scores },
157 { "collocation analysis", test_collocation_analysis },
158 { "collocation analysis as json", test_collocation_analysis_as_json },
159 { "writing", test_writing },
Marc Kupietz6208fd72024-11-15 15:46:19 +0100160 { "version function", test_version_function },
Marc Kupietz6e2dbd12024-11-23 10:11:32 +0100161 { "get word id", test_get_word_id },
Marc Kupietzd26b1052024-12-10 16:56:39 +0100162 { "get corpus size", test_get_corpus_size},
Marc Kupietz21b964c2024-12-10 17:10:50 +0100163 { "get word frequency", test_get_word_frequency},
Marc Kupietz94ea77b2024-11-23 12:32:19 +0100164 { "collocatordb_query command line tool", test_collocatordb_query_command_line_tool},
Marc Kupietz6663f112021-03-14 09:20:59 +0100165 { NULL, NULL }
166};