#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define __USE_XOPEN_EXTENDED
#include <ftw.h>
#include "../src/collocatordb.h"
#include "acutest.h"
| 7 | |
/* Path of the fixture database that the read-only tests pass to open_collocatordb(). */
char dbpath[] = "../tests/data/wpd19_10000";

/* Word id used by the lookup tests; test_get_word expects it to resolve to "ist". */
const int testword = 10;
| 10 | |
| 11 | void test_open_db() { |
| 12 | COLLOCATORDB *cdb; |
| 13 | |
| 14 | cdb = open_collocatordb(dbpath); |
| 15 | TEST_ASSERT(cdb != NULL); |
| 16 | } |
| 17 | |
| 18 | void test_get_word() { |
| 19 | COLLOCATORDB *cdb; |
| 20 | |
| 21 | cdb = open_collocatordb(dbpath); |
| 22 | TEST_ASSERT(cdb != NULL); |
| 23 | char *word = get_word(cdb, testword); |
| 24 | char *expected = "ist"; |
| 25 | TEST_CHECK(strcmp(word, expected) == 0); |
| 26 | TEST_MSG("Expected: %s", expected); |
| 27 | TEST_MSG("Produced: %s", word); |
| 28 | } |
| 29 | |
| 30 | void test_collocation_scores() { |
| 31 | COLLOCATORDB *cdb; |
| 32 | |
| 33 | cdb = open_collocatordb(dbpath); |
| 34 | TEST_ASSERT(cdb != NULL); |
| 35 | char *expected = " { \"f1\": 217,\"w1\":\"Aluminium\", \"N\": 152744, \"collocates\": [{\"word\":\"Anwendungstechnologie\",\"f2\":16,\"f\":16,\"npmi\":0.594849,\"pmi\":8.4592,\"llr\":188.227,\"lfmd\":16.4592,\"md\":12.4592,\"dice\":0.0711111,\"ld\":10.1862,\"ln_count\":16,\"rn_count\":0,\"ln_pmi\":9.4592,\"rn_pmi\":-1,\"ldaf\":11.1358,\"win\":32,\"afwin\":32}]}\n"; |
| 36 | char *produced = get_collocation_scores_as_json(cdb, 62, 966); |
| 37 | TEST_CHECK(strcmp(produced, expected) == 0); |
| 38 | TEST_MSG("Expected: %s", expected); |
| 39 | TEST_MSG("Produced: %s", produced); |
| 40 | } |
| 41 | |
| 42 | |
| 43 | void test_collocation_analysis_as_json() { |
| 44 | COLLOCATORDB *cdb; |
| 45 | |
| 46 | cdb = open_collocatordb(dbpath); |
| 47 | TEST_ASSERT(cdb != NULL); |
| 48 | char *json = get_collocators_as_json(cdb, testword); |
| 49 | char *needle = "\"word\":\"um\",\"f2\":264,\"f\":5,\"npmi\":-0.0556343,\"pmi\":-0.958064,\"llr\":2.87717,\"lfmd\":3.68579,\"md\":1.36386,\"dice\":0.00169952,\"ld\":4.79935,\"ln_count\":0,\"rn_count\":1,\"ln_pmi\":-1,\"rn_pmi\":-1,\"ldaf\":4.79935,\"win\":668,\"afwin\":668"; |
| 50 | TEST_CHECK(strstr(json, needle) > 0); |
| 51 | TEST_MSG("Expected to contain: %s", needle); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 52 | } |
| 53 | |
| 54 | void test_collocation_analysis() { |
| 55 | COLLOCATORDB *cdb; |
| 56 | |
| 57 | cdb = open_collocatordb(dbpath); |
| 58 | TEST_ASSERT(cdb != NULL); |
| 59 | char *expected = "Anwendungstechnologie"; |
| 60 | const COLLOCATOR *c = get_collocators(cdb, 62); |
| 61 | char *produced = get_word(cdb,c[0].w2); |
| 62 | TEST_CHECK(strcmp(produced, expected) == 0); |
| 63 | TEST_MSG("Expected: %s", expected); |
| 64 | TEST_MSG("Produced: %s", produced); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 65 | } |
| 66 | |
/*
 * nftw() callback: deletes the entry at `fpath` with remove().
 * On failure the error is reported through perror(). The result of remove()
 * is returned unchanged, so a non-zero value is handed back to the walker.
 * The remaining callback arguments are not needed.
 */
int unlink_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
  (void)sb;
  (void)typeflag;
  (void)ftwbuf;

  const int status = remove(fpath);
  if (status != 0) {
    perror(fpath);
  }
  return status;
}
| 73 | |
| 74 | int rmrf(char *path) { |
| 75 | return nftw(path, unlink_cb, 64, FTW_DEPTH | FTW_PHYS); |
| 76 | } |
| 77 | |
| 78 | void test_writing() { |
| 79 | char *tmp = tempnam(NULL, NULL); |
| 80 | long size = 0; |
Marc Kupietz | 673bd81 | 2021-03-14 17:27:44 +0100 | [diff] [blame] | 81 | int i; |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 82 | |
| 83 | char *rocksdbfn = malloc(strlen(tmp)+strlen(".rocksdb")); |
| 84 | strcpy (rocksdbfn, tmp); |
| 85 | strcat(rocksdbfn, ".rocksdb"); |
| 86 | COLLOCATORDB *cdb = open_collocatordb_for_write(rocksdbfn); |
| 87 | |
| 88 | char *vocabfn = malloc(strlen(tmp)+strlen(".vocab")); |
| 89 | strcpy(vocabfn, tmp); |
| 90 | strcat(vocabfn, ".vocab"); |
| 91 | FILE *h = fopen(vocabfn, "w"); |
| 92 | fprintf(h, "word0 2000\n"); |
| 93 | fprintf(h, "word1 2000\n"); |
| 94 | fprintf(h, "word2 2000\n"); |
| 95 | fclose(h); |
| 96 | read_vocab(cdb, vocabfn); |
| 97 | inc_collocator(cdb, 0, 1, 4); size++; |
Marc Kupietz | 673bd81 | 2021-03-14 17:27:44 +0100 | [diff] [blame] | 98 | for (i=0; i < 1000; i++) { |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 99 | inc_collocator(cdb, 0, 1, i % 5); size++; |
| 100 | inc_collocator(cdb, 0, 1, -i % 5); size++; |
| 101 | inc_collocator(cdb, 1, 0, i % 5); size++; |
| 102 | inc_collocator(cdb, 1, 0, -i % 5); size++; |
| 103 | inc_collocator(cdb, 0, 2, i % 5); size++; |
| 104 | inc_collocator(cdb, 0, 2, -i % 5); size++; |
| 105 | } |
| 106 | inc_collocator(cdb, 1, 2, 4); size++; |
| 107 | COLLOCATOR *c = get_collocators(cdb, 0); |
| 108 | TEST_ASSERT(c != NULL); |
Marc Kupietz | 1b09e4d | 2021-03-14 15:20:19 +0100 | [diff] [blame] | 109 | TEST_CHECK(c[0].w2 == 1); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 110 | TEST_CHECK(c[0].raw == 2001); |
| 111 | TEST_CHECK(c[0].left_raw == 200); |
| 112 | TEST_CHECK(c[0].right_raw == 200); |
| 113 | |
| 114 | rmrf(rocksdbfn); |
| 115 | } |
| 116 | |
| 117 | TEST_LIST = { |
| 118 | { "open database for reading", test_open_db }, |
| 119 | { "get word", test_get_word }, |
| 120 | { "collocation scores", test_collocation_scores }, |
| 121 | { "collocation analysis", test_collocation_analysis }, |
| 122 | { "collocation analysis as json", test_collocation_analysis_as_json }, |
| 123 | { "writing", test_writing }, |
| 124 | { NULL, NULL } |
| 125 | }; |