Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 1 | #include <stdio.h> |
| 2 | #include <string.h> |
| 3 | #define __USE_XOPEN_EXTENDED |
| 4 | #include <ftw.h> |
| 5 | #include "../src/collocatordb.h" |
| 6 | #include "acutest.h" |
| 7 | |
/* Location of the database opened by the read-only tests below. */
char dbpath[] = "../tests/data/wpd19_10000";

/* Word id shared by the lookup tests; test_get_word expects it to map to "ist". */
const int testword = 10; // ist
| 10 | |
| 11 | void test_open_db() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 12 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 13 | TEST_ASSERT(cdb != NULL); |
| 14 | } |
| 15 | |
| 16 | void test_get_word() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 17 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 18 | TEST_ASSERT(cdb != NULL); |
| 19 | char *word = get_word(cdb, testword); |
| 20 | char *expected = "ist"; |
| 21 | TEST_CHECK(strcmp(word, expected) == 0); |
| 22 | TEST_MSG("Expected: %s", expected); |
| 23 | TEST_MSG("Produced: %s", word); |
| 24 | } |
| 25 | |
| 26 | void test_collocation_scores() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 27 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 28 | TEST_ASSERT(cdb != NULL); |
„feldmueller“ | 2441f7c | 2024-11-14 16:31:30 +0100 | [diff] [blame] | 29 | char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152743, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n"; |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 30 | char *produced = get_collocation_scores_as_json(cdb, 62, 966); |
| 31 | TEST_CHECK(strcmp(produced, expected) == 0); |
| 32 | TEST_MSG("Expected: %s", expected); |
| 33 | TEST_MSG("Produced: %s", produced); |
| 34 | } |
| 35 | |
| 36 | |
| 37 | void test_collocation_analysis_as_json() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 38 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 39 | TEST_ASSERT(cdb != NULL); |
| 40 | char *json = get_collocators_as_json(cdb, testword); |
„feldmueller“ | 2441f7c | 2024-11-14 16:31:30 +0100 | [diff] [blame] | 41 | char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556349,\"pmi\":-0.958074,\"llr\":2.87723,\"lfmd\":3.68578,\"md\":1.36385,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668"; |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 42 | TEST_CHECK(strstr(json, needle) > 0); |
| 43 | TEST_MSG("Expected to contain: %s", needle); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 44 | } |
| 45 | |
| 46 | void test_collocation_analysis() { |
Marc Kupietz | 5ffc474 | 2024-11-15 15:45:12 +0100 | [diff] [blame] | 47 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 48 | TEST_ASSERT(cdb != NULL); |
| 49 | char *expected = "Anwendungstechnologie"; |
| 50 | const COLLOCATOR *c = get_collocators(cdb, 62); |
| 51 | char *produced = get_word(cdb,c[0].w2); |
| 52 | TEST_CHECK(strcmp(produced, expected) == 0); |
| 53 | TEST_MSG("Expected: %s", expected); |
| 54 | TEST_MSG("Produced: %s", produced); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 55 | } |
| 56 | |
/*
 * nftw() callback: removes the visited entry.
 *
 * Only fpath is used; sb, typeflag and ftwbuf are accepted to match the
 * callback signature. On failure the error is reported through perror().
 * Returns the result of remove().
 */
int unlink_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
    const int status = remove(fpath);

    if (status != 0) {
        perror(fpath);
    }
    return status;
}
| 63 | |
| 64 | int rmrf(char *path) { |
| 65 | return nftw(path, unlink_cb, 64, FTW_DEPTH | FTW_PHYS); |
| 66 | } |
| 67 | |
| 68 | void test_writing() { |
| 69 | char *tmp = tempnam(NULL, NULL); |
| 70 | long size = 0; |
Marc Kupietz | 673bd81 | 2021-03-14 17:27:44 +0100 | [diff] [blame] | 71 | int i; |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 72 | |
| 73 | char *rocksdbfn = malloc(strlen(tmp)+strlen(".rocksdb")); |
| 74 | strcpy (rocksdbfn, tmp); |
| 75 | strcat(rocksdbfn, ".rocksdb"); |
| 76 | COLLOCATORDB *cdb = open_collocatordb_for_write(rocksdbfn); |
| 77 | |
| 78 | char *vocabfn = malloc(strlen(tmp)+strlen(".vocab")); |
| 79 | strcpy(vocabfn, tmp); |
| 80 | strcat(vocabfn, ".vocab"); |
| 81 | FILE *h = fopen(vocabfn, "w"); |
| 82 | fprintf(h, "word0 2000\n"); |
| 83 | fprintf(h, "word1 2000\n"); |
| 84 | fprintf(h, "word2 2000\n"); |
| 85 | fclose(h); |
| 86 | read_vocab(cdb, vocabfn); |
| 87 | inc_collocator(cdb, 0, 1, 4); size++; |
Marc Kupietz | 673bd81 | 2021-03-14 17:27:44 +0100 | [diff] [blame] | 88 | for (i=0; i < 1000; i++) { |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 89 | inc_collocator(cdb, 0, 1, i % 5); size++; |
| 90 | inc_collocator(cdb, 0, 1, -i % 5); size++; |
| 91 | inc_collocator(cdb, 1, 0, i % 5); size++; |
| 92 | inc_collocator(cdb, 1, 0, -i % 5); size++; |
| 93 | inc_collocator(cdb, 0, 2, i % 5); size++; |
| 94 | inc_collocator(cdb, 0, 2, -i % 5); size++; |
| 95 | } |
| 96 | inc_collocator(cdb, 1, 2, 4); size++; |
| 97 | COLLOCATOR *c = get_collocators(cdb, 0); |
| 98 | TEST_ASSERT(c != NULL); |
Marc Kupietz | 1b09e4d | 2021-03-14 15:20:19 +0100 | [diff] [blame] | 99 | TEST_CHECK(c[0].w2 == 1); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 100 | TEST_CHECK(c[0].raw == 2001); |
| 101 | TEST_CHECK(c[0].left_raw == 200); |
| 102 | TEST_CHECK(c[0].right_raw == 200); |
| 103 | |
| 104 | rmrf(rocksdbfn); |
| 105 | } |
/* Checks that get_version() reports the version string "1.3.2". */
void test_version_function() {
    const char *version = get_version();
    /* strcmp() and "%s" must not see NULL. */
    TEST_ASSERT(version != NULL);

    TEST_CHECK(strcmp(version, "1.3.2") == 0);
    TEST_MSG("Unexpected version: %s", version);
}
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 111 | |
Marc Kupietz | 6e2dbd1 | 2024-11-23 10:11:32 +0100 | [diff] [blame^] | 112 | void test_get_word_id() { |
| 113 | COLLOCATORDB* cdb = open_collocatordb(dbpath); |
| 114 | TEST_ASSERT(cdb != NULL); |
| 115 | uint64_t id = get_word_id(cdb, "ist"); |
| 116 | TEST_CHECK(id == 10); |
| 117 | TEST_MSG("Unexpected word id: %lu", id); |
| 118 | } |
| 119 | |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 120 | TEST_LIST = { |
| 121 | { "open database for reading", test_open_db }, |
| 122 | { "get word", test_get_word }, |
| 123 | { "collocation scores", test_collocation_scores }, |
| 124 | { "collocation analysis", test_collocation_analysis }, |
| 125 | { "collocation analysis as json", test_collocation_analysis_as_json }, |
| 126 | { "writing", test_writing }, |
Marc Kupietz | 6208fd7 | 2024-11-15 15:46:19 +0100 | [diff] [blame] | 127 | { "version function", test_version_function }, |
Marc Kupietz | 6e2dbd1 | 2024-11-23 10:11:32 +0100 | [diff] [blame^] | 128 | { "get word id", test_get_word_id }, |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 129 | { NULL, NULL } |
| 130 | }; |