#include <stddef.h>
#include <stdio.h>
#include <string.h>
#define __USE_XOPEN_EXTENDED
#include <ftw.h>
/* Included after the feature define above so these headers see the same
 * macros as when they were only pulled in indirectly. */
#include <inttypes.h>
#include <stdlib.h>
#include <unistd.h>
#include "../src/collocatordb.h"
#include "acutest.h"
| 8 | |
/* Path (relative to the directory the tests are run from) of the fixture
 * database that the read-only tests open. */
char dbpath[] = "../tests/data/wpd19_10000";

/* Word id used by the lookup tests; the tests expect it to map to "ist". */
const int testword = 10;
| 11 | |
| 12 | void test_open_db() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 13 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 14 | TEST_ASSERT(cdb != NULL); |
| 15 | } |
| 16 | |
| 17 | void test_get_word() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 18 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 19 | TEST_ASSERT(cdb != NULL); |
| 20 | char *word = get_word(cdb, testword); |
| 21 | char *expected = "ist"; |
| 22 | TEST_CHECK(strcmp(word, expected) == 0); |
| 23 | TEST_MSG("Expected: %s", expected); |
| 24 | TEST_MSG("Produced: %s", word); |
| 25 | } |
| 26 | |
| 27 | void test_collocation_scores() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 28 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 29 | TEST_ASSERT(cdb != NULL); |
Marc Kupietz | e889cec | 2024-11-23 12:08:42 +0100 | [diff] [blame] | 30 | char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"md_nws\":10.1373,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n"; |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 31 | char *produced = get_collocation_scores_as_json(cdb, 62, 966); |
| 32 | TEST_CHECK(strcmp(produced, expected) == 0); |
| 33 | TEST_MSG("Expected: %s", expected); |
| 34 | TEST_MSG("Produced: %s", produced); |
| 35 | } |
| 36 | |
| 37 | |
| 38 | void test_collocation_analysis_as_json() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 39 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 40 | TEST_ASSERT(cdb != NULL); |
| 41 | char *json = get_collocators_as_json(cdb, testword); |
Marc Kupietz | e889cec | 2024-11-23 12:08:42 +0100 | [diff] [blame] | 42 | char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"md_nws\":0.363854,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668"; |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 43 | TEST_CHECK(strstr(json, needle) > 0); |
| 44 | TEST_MSG("Expected to contain: %s", needle); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 45 | } |
| 46 | |
| 47 | void test_collocation_analysis() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 48 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 49 | TEST_ASSERT(cdb != NULL); |
| 50 | char *expected = "Anwendungstechnologie"; |
| 51 | const COLLOCATOR *c = get_collocators(cdb, 62); |
| 52 | char *produced = get_word(cdb,c[0].w2); |
| 53 | TEST_CHECK(strcmp(produced, expected) == 0); |
| 54 | TEST_MSG("Expected: %s", expected); |
| 55 | TEST_MSG("Produced: %s", produced); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 56 | } |
| 57 | |
/**
 * nftw() callback that deletes the visited entry.
 *
 * Only `fpath` is used; the remaining parameters exist to match the callback
 * signature that nftw() expects.
 *
 * @return the result of remove(): 0 on success, non-zero on failure (in
 *         which case the error is also reported through perror()).
 */
int unlink_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
    (void)sb;
    (void)typeflag;
    (void)ftwbuf;

    const int status = remove(fpath);
    if (status != 0) {
        perror(fpath);
    }
    return status;
}
| 64 | |
| 65 | int rmrf(char *path) { |
| 66 | return nftw(path, unlink_cb, 64, FTW_DEPTH | FTW_PHYS); |
| 67 | } |
| 68 | |
| 69 | void test_writing() { |
Marc Kupietz | b4ade0f | 2024-11-23 10:12:22 +0100 | [diff] [blame] | 70 | char tmp_template[] = "/tmp/tmpfileXXXXXX"; |
| 71 | int fd = mkstemp(tmp_template); |
| 72 | if (fd == -1) { |
| 73 | perror("mkstemp"); |
| 74 | exit(EXIT_FAILURE); |
| 75 | } |
| 76 | close(fd); |
| 77 | char *tmp = strdup(tmp_template); |
| 78 | |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 79 | long size = 0; |
Marc Kupietz | 673bd81 | 2021-03-14 17:27:44 +0100 | [diff] [blame] | 80 | int i; |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 81 | |
Marc Kupietz | b4ade0f | 2024-11-23 10:12:22 +0100 | [diff] [blame] | 82 | char *rocksdbfn = malloc(strlen(tmp) + strlen(".rocksdb") + 1); |
| 83 | strcpy(rocksdbfn, tmp); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 84 | strcat(rocksdbfn, ".rocksdb"); |
| 85 | COLLOCATORDB *cdb = open_collocatordb_for_write(rocksdbfn); |
| 86 | |
Marc Kupietz | b4ade0f | 2024-11-23 10:12:22 +0100 | [diff] [blame] | 87 | char *vocabfn = malloc(strlen(tmp) + strlen(".vocab") + 1); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 88 | strcpy(vocabfn, tmp); |
| 89 | strcat(vocabfn, ".vocab"); |
| 90 | FILE *h = fopen(vocabfn, "w"); |
| 91 | fprintf(h, "word0 2000\n"); |
| 92 | fprintf(h, "word1 2000\n"); |
| 93 | fprintf(h, "word2 2000\n"); |
| 94 | fclose(h); |
| 95 | read_vocab(cdb, vocabfn); |
| 96 | inc_collocator(cdb, 0, 1, 4); size++; |
Marc Kupietz | b4ade0f | 2024-11-23 10:12:22 +0100 | [diff] [blame] | 97 | for (i = 0; i < 1000; i++) { |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 98 | inc_collocator(cdb, 0, 1, i % 5); size++; |
| 99 | inc_collocator(cdb, 0, 1, -i % 5); size++; |
| 100 | inc_collocator(cdb, 1, 0, i % 5); size++; |
| 101 | inc_collocator(cdb, 1, 0, -i % 5); size++; |
| 102 | inc_collocator(cdb, 0, 2, i % 5); size++; |
| 103 | inc_collocator(cdb, 0, 2, -i % 5); size++; |
| 104 | } |
| 105 | inc_collocator(cdb, 1, 2, 4); size++; |
| 106 | COLLOCATOR *c = get_collocators(cdb, 0); |
| 107 | TEST_ASSERT(c != NULL); |
Marc Kupietz | 1b09e4d | 2021-03-14 15:20:19 +0100 | [diff] [blame] | 108 | TEST_CHECK(c[0].w2 == 1); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 109 | TEST_CHECK(c[0].raw == 2001); |
| 110 | TEST_CHECK(c[0].left_raw == 200); |
| 111 | TEST_CHECK(c[0].right_raw == 200); |
| 112 | |
| 113 | rmrf(rocksdbfn); |
| 114 | } |
Marc Kupietz | b4ade0f | 2024-11-23 10:12:22 +0100 | [diff] [blame] | 115 | |
/**
 * Checks that get_version() reports the release this test suite was written
 * for ("1.4.0").
 */
void test_version_function() {
    const char *version = get_version();

    /* Fail cleanly instead of passing NULL to strcmp() / "%s". */
    TEST_ASSERT(version != NULL);

    TEST_CHECK(strcmp(version, "1.4.0") == 0);
    TEST_MSG("Unexpected version: %s", version);
}
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 121 | |
Marc Kupietz | 6e2dbd1 | 2024-11-23 10:11:32 +0100 | [diff] [blame] | 122 | void test_get_word_id() { |
| 123 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
| 124 | TEST_ASSERT(cdb != NULL); |
| 125 | uint64_t id = get_word_id(cdb, "ist"); |
| 126 | TEST_CHECK(id == 10); |
| 127 | TEST_MSG("Unexpected word id: %lu", id); |
| 128 | } |
| 129 | |
/**
 * Runs the collocatordb_query binary from the build directory against the
 * fixture database (query word "ist", output discarded) and checks that
 * system() reports a status of 0.
 */
void test_collocatordb_query_command_line_tool() {
    int result = system("../build/collocatordb_query ../tests/data/wpd19_10000 ist > /dev/null 2>&1");
    TEST_CHECK(result == 0);
    TEST_MSG("collocatordb_query command failed with result: %d", result);
}
| 135 | |
Marc Kupietz | d26b105 | 2024-12-10 16:56:39 +0100 | [diff] [blame] | 136 | void test_get_corpus_size() { |
| 137 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
| 138 | TEST_ASSERT(cdb != NULL); |
| 139 | uint64_t size = get_corpus_size(cdb); |
| 140 | TEST_CHECK(size == 152743); |
| 141 | TEST_MSG("Unexpected corpus size: %lu", size); |
| 142 | } |
| 143 | |
Marc Kupietz | 21b964c | 2024-12-10 17:10:50 +0100 | [diff] [blame] | 144 | void test_get_word_frequency() { |
| 145 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
| 146 | TEST_ASSERT(cdb != NULL); |
| 147 | int w1 = get_word_id(cdb, "Test"); |
| 148 | uint64_t freq = get_word_frequency(cdb, w1); |
| 149 | TEST_CHECK(freq == 3); |
| 150 | TEST_MSG("Unexpected word frequency: %lu", freq); |
| 151 | } |
| 152 | |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 153 | TEST_LIST = { |
| 154 | { "open database for reading", test_open_db }, |
| 155 | { "get word", test_get_word }, |
| 156 | { "collocation scores", test_collocation_scores }, |
| 157 | { "collocation analysis", test_collocation_analysis }, |
| 158 | { "collocation analysis as json", test_collocation_analysis_as_json }, |
| 159 | { "writing", test_writing }, |
Marc Kupietz | 6208fd7 | 2024-11-15 15:46:19 +0100 | [diff] [blame] | 160 | { "version function", test_version_function }, |
Marc Kupietz | 6e2dbd1 | 2024-11-23 10:11:32 +0100 | [diff] [blame] | 161 | { "get word id", test_get_word_id }, |
Marc Kupietz | d26b105 | 2024-12-10 16:56:39 +0100 | [diff] [blame] | 162 | { "get corpus size", test_get_corpus_size}, |
Marc Kupietz | 21b964c | 2024-12-10 17:10:50 +0100 | [diff] [blame] | 163 | { "get word frequency", test_get_word_frequency}, |
Marc Kupietz | 94ea77b | 2024-11-23 12:32:19 +0100 | [diff] [blame] | 164 | { "collocatordb_query command line tool", test_collocatordb_query_command_line_tool}, |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 165 | { NULL, NULL } |
| 166 | }; |