Marc Kupietz | d6f9c71 | 2016-03-16 11:50:56 +0100 | [diff] [blame] | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | #include <stdio.h> |
| 16 | #include <string.h> |
| 17 | #include <math.h> |
| 18 | #include <stdlib.h> |
| 19 | #include <time.h> |
| 20 | |
// Sizing limits shared by the whole program. They are enum constants rather
// than `const long long` objects so that they are true integer constant
// expressions: arrays dimensioned with them are ordinary fixed-size arrays
// instead of variable-length arrays.
enum {
  max_size = 2000,  // capacity of line/word buffers and upper bound on vector dimensionality
  N = 10,           // number of closest classes/words that will be shown
  max_w = 50        // bytes reserved per vocabulary entry (including the terminating NUL)
};
| 24 | |
#define MAX_STRING 100

// Reads the next token from fin into word (a buffer of at least MAX_STRING bytes).
//
// Tokens are separated by spaces, tabs and newlines; carriage returns are ignored.
// A newline met before any token character yields the marker "</s>". A newline
// that ends a token is pushed back so the following call reports it as "</s>".
// Tokens longer than MAX_STRING - 2 characters are truncated (the last kept
// character is overwritten by later ones). At end of file the characters read so
// far are returned; if there were none, word becomes the empty string.
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  // Test the fgetc() result itself: feof() only becomes true after a failed read,
  // so checking it before reading would let the EOF value leak into the word.
  while ((ch = fgetc(fin)) != EOF) {
    if (ch == 13) continue;  // skip carriage returns
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, "</s>");
        return;
      }
      continue;  // leading blank or tab
    }
    word[a] = (char)ch;
    a++;
    if (a >= MAX_STRING - 1) a--;  // Truncate too long words
  }
  word[a] = 0;
}
| 47 | |
| 48 | int main(int argc, char **argv) { |
| 49 | FILE *f; |
| 50 | char st1[max_size]; |
| 51 | char *bestw[N]; |
| 52 | char file_name[max_size], st[100][max_size]; |
| 53 | float dist, len, bestd[N], bestclasses[N], vec[max_size]; |
| 54 | int bestclasses_ids[N]; |
| 55 | long long words, size, a, b, c, d, e, cn, bi[100]; |
| 56 | float *M; |
| 57 | char *vocab; |
| 58 | char word[MAX_STRING]; |
| 59 | clock_t begin; |
| 60 | if (argc < 2) { |
| 61 | printf("Usage: ./kmeans_txt <FILE>\nwhere FILE contains features\n <number_of_classes>"); |
| 62 | return 0; |
| 63 | } |
| 64 | strcpy(file_name, argv[1]); |
| 65 | int classes = atoi(argv[2]); |
| 66 | f = fopen(file_name, "rb"); |
| 67 | if (f == NULL) { |
| 68 | printf("Input file not found\n"); |
| 69 | return -1; |
| 70 | } |
| 71 | |
| 72 | printf("reading data\n"); |
| 73 | ReadWord(word, f); |
| 74 | words = atoi(word); |
| 75 | ReadWord(word, f); |
| 76 | size = atoi(word); |
| 77 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); |
| 78 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); |
| 79 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); |
| 80 | if (M == NULL) { |
| 81 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); |
| 82 | return -1; |
| 83 | } |
| 84 | for (b = 0; b < words; b++) { |
| 85 | a = 0; |
| 86 | while (1) { |
| 87 | vocab[b * max_w + a] = fgetc(f); |
| 88 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; |
| 89 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; |
| 90 | } |
| 91 | vocab[b * max_w + a] = 0; |
| 92 | for (a = 0; a < size; a++) { |
| 93 | ReadWord(word,f); |
| 94 | M[a + b * size] = atof(word); |
| 95 | } |
| 96 | len = 0; |
| 97 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; |
| 98 | len = sqrt(len); |
| 99 | for (a = 0; a < size; a++) M[a + b * size] /= len; |
| 100 | } |
| 101 | fclose(f); |
| 102 | |
| 103 | //run kmeans |
| 104 | printf("running k-means with %i classes...\n",classes); |
| 105 | int clcn = classes, iter = 10, closeid; |
| 106 | int *centcn = (int *)malloc(classes * sizeof(int)); |
| 107 | int *cl = (int *)calloc(words, sizeof(int)); |
| 108 | float closev, x; |
| 109 | float *cent = (float *)calloc(classes * size, sizeof(float)); |
| 110 | for (a = 0; a < words; a++) cl[a] = a % clcn; |
| 111 | for (a = 0; a < iter; a++) { |
| 112 | for (b = 0; b < clcn * size; b++) cent[b] = 0; |
| 113 | for (b = 0; b < clcn; b++) centcn[b] = 1; |
| 114 | for (c = 0; c < words; c++) { |
| 115 | for (d = 0; d < size; d++) cent[size * cl[c] + d] += M[c * size + d]; |
| 116 | centcn[cl[c]]++; |
| 117 | } |
| 118 | for (b = 0; b < clcn; b++) { |
| 119 | closev = 0; |
| 120 | for (c = 0; c < size; c++) { |
| 121 | cent[size * b + c] /= centcn[b]; |
| 122 | closev += cent[size * b + c] * cent[size * b + c]; |
| 123 | } |
| 124 | closev = sqrt(closev); |
| 125 | for (c = 0; c < size; c++) cent[size * b + c] /= closev; |
| 126 | } |
| 127 | for (c = 0; c < words; c++) { |
| 128 | closev = -10; |
| 129 | closeid = 0; |
| 130 | for (d = 0; d < clcn; d++) { |
| 131 | x = 0; |
| 132 | for (b = 0; b < size; b++) x += cent[size * d + b] * M[c * size + b]; |
| 133 | if (x > closev) { |
| 134 | closev = x; |
| 135 | closeid = d; |
| 136 | } |
| 137 | } |
| 138 | cl[c] = closeid; |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | // build an array of words ordered by class and their offsets (index where each class starts) |
| 143 | int class_words[words]; |
| 144 | int class_offsets[classes]; |
| 145 | for(a = 0; a < classes; a++) class_offsets[a]=0; |
| 146 | for(a = 0; a < words; a++) class_offsets[cl[a]]++; |
| 147 | for(a = 1; a < classes; a++) class_offsets[a] += class_offsets[a-1]; |
| 148 | for(a = 0; a < words; a++) class_words[--class_offsets[cl[a]]] = a; |
| 149 | |
| 150 | //reading from input |
| 151 | while (1) { |
| 152 | for (a = 0; a < N; a++) bestd[a] = 0; |
| 153 | for (a = 0; a < N; a++) bestclasses[a] = 0; |
| 154 | for (a = 0; a < N; a++) bestw[a][0] = 0; |
| 155 | printf("Enter word or sentence (EXIT to break): "); |
| 156 | a = 0; |
| 157 | while (1) { |
| 158 | st1[a] = fgetc(stdin); |
| 159 | if ((st1[a] == '\n') || (a >= max_size - 1)) { |
| 160 | st1[a] = 0; |
| 161 | break; |
| 162 | } |
| 163 | a++; |
| 164 | } |
| 165 | if (!strcmp(st1, "EXIT")) break; |
| 166 | cn = 0; |
| 167 | b = 0; |
| 168 | c = 0; |
| 169 | while (1) { |
| 170 | st[cn][b] = st1[c]; |
| 171 | b++; |
| 172 | c++; |
| 173 | st[cn][b] = 0; |
| 174 | if (st1[c] == 0) break; |
| 175 | if (st1[c] == ' ') { |
| 176 | cn++; |
| 177 | b = 0; |
| 178 | c++; |
| 179 | } |
| 180 | } |
| 181 | cn++; |
| 182 | for (a = 0; a < cn; a++) { |
| 183 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; |
| 184 | if (b == words) b = -1; |
| 185 | bi[a] = b; |
| 186 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); |
| 187 | if (b == -1) { |
| 188 | printf("Out of dictionary word!\n"); |
| 189 | break; |
| 190 | } |
| 191 | } |
| 192 | if (b == -1) continue; |
| 193 | begin = clock(); |
| 194 | |
| 195 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); |
| 196 | |
| 197 | for (a = 0; a < size; a++) vec[a] = 0; |
| 198 | for (b = 0; b < cn; b++) { |
| 199 | if (bi[b] == -1) continue; |
| 200 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; |
| 201 | } |
| 202 | |
| 203 | len = 0; |
| 204 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; |
| 205 | len = sqrt(len); |
| 206 | for (a = 0; a < size; a++) vec[a] /= len; |
| 207 | |
| 208 | // find top N centroids |
| 209 | for (a = 0; a < N; a++) bestclasses[a] = -1; |
| 210 | for (a = 0; a < N; a++) bestclasses_ids[a] = -1; |
| 211 | for (c = 0; c < classes; c++){ |
| 212 | dist = 0; |
| 213 | for (a = 0; a < size; a++) dist += vec[a] * cent[a + size * c]; |
| 214 | for (a = 0; a < N; a++) { |
| 215 | if (dist > bestclasses[a]) { |
| 216 | for(d = N - 1; d > a; d--){ |
| 217 | bestclasses[d] = bestclasses[d-1]; |
| 218 | bestclasses_ids[d] = bestclasses_ids[d-1]; |
| 219 | } |
| 220 | bestclasses[a] = dist; |
| 221 | bestclasses_ids[a] = c; |
| 222 | break; |
| 223 | } |
| 224 | } |
| 225 | } |
| 226 | |
| 227 | // find top N words in the centroids |
| 228 | for (a = 0; a < N; a++) bestd[a] = -1; |
| 229 | for (a = 0; a < N; a++) bestw[a][0] = 0; |
| 230 | for (a = 0; a < N; a++){ |
| 231 | c = words; |
| 232 | if(bestclasses_ids[a] < classes-1) c = class_offsets[bestclasses_ids[a]+1]; |
| 233 | b = class_offsets[bestclasses_ids[a]]; |
| 234 | for(; b < c; b++){ |
| 235 | dist = 0; |
| 236 | for (d = 0; d < size; d++) dist += vec[d] * M[d + class_words[b] * size]; |
| 237 | for (d = 0; d < N; d++){ |
| 238 | if(dist > bestd[d]){ |
| 239 | for (e = N -1; e > d; e--){ |
| 240 | bestd[e] = bestd[e-1]; |
| 241 | strcpy(bestw[e], bestw[e-1]); |
| 242 | } |
| 243 | bestd[d] = dist; |
| 244 | strcpy(bestw[d], &vocab[class_words[b] * max_w]); |
| 245 | break; |
| 246 | } |
| 247 | } |
| 248 | } |
| 249 | } |
| 250 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); |
| 251 | printf("time spent = %f seconds\n", (double)(clock() - begin) / CLOCKS_PER_SEC); |
| 252 | } |
| 253 | // Save the K-means classes |
| 254 | |
| 255 | free(centcn); |
| 256 | free(cent); |
| 257 | free(cl); |
| 258 | |
| 259 | //start running distance |
| 260 | return 0; |
| 261 | } |