Marc Kupietz | d6f9c71 | 2016-03-16 11:50:56 +0100 | [diff] [blame] | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | #include <stdio.h> |
| 16 | #include <string.h> |
| 17 | #include <math.h> |
| 18 | #include <stdlib.h> |
| 19 | |
// File-wide size limits. All three are long long so that index arithmetic
// mixing them with word counts is carried out in 64 bits.
const long long max_size = 2000,  // max length of strings
                N = 40,           // number of closest words that will be shown
                max_w = 50;       // max length of vocabulary entries
| 23 | |
#define MAX_STRING 100

// Reads one whitespace-delimited token from fin into word.
//
// word must have room for at least MAX_STRING bytes; longer tokens are
// truncated to MAX_STRING - 2 characters plus the terminating NUL.
// Space, tab and newline separate tokens; carriage returns (13) are dropped.
// A newline that ends a token is pushed back, so the following call returns
// the marker "</s>" for the end of that line. A newline met before any token
// character likewise yields "</s>".
// At end of file (or on a read error) the characters collected so far are
// returned; if there are none, word is the empty string.
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  // Test the value returned by fgetc instead of feof(): feof() only becomes
  // true after a read has already failed, which would store the EOF sentinel
  // as a bogus last character, and it never becomes true on a read error.
  while ((ch = fgetc(fin)) != EOF) {
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, "</s>");
        return;
      }
      continue;
    }
    word[a] = (char)ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;
}
| 46 | |
| 47 | int main(int argc, char **argv) { |
| 48 | FILE *f; |
| 49 | char file_name[max_size], output_file[max_size]; |
| 50 | float len; |
| 51 | long long words, size, a, b, c, d; |
| 52 | float *M; |
| 53 | char *vocab; |
| 54 | char word[MAX_STRING]; |
| 55 | if (argc < 3) { |
| 56 | printf("Usage: ./kmeans_txt <FILE>\nwhere FILE contains features\n <number_of_classes>"); |
| 57 | return 0; |
| 58 | } |
| 59 | strcpy(file_name, argv[1]); |
| 60 | strcpy(output_file, argv[2]); |
| 61 | int classes = atoi(argv[3]); |
| 62 | f = fopen(file_name, "rb"); |
| 63 | if (f == NULL) { |
| 64 | printf("Input file not found\n"); |
| 65 | return -1; |
| 66 | } |
| 67 | |
| 68 | FILE *fo = fopen(output_file, "wb"); |
| 69 | |
| 70 | ReadWord(word, f); |
| 71 | words = atoi(word); |
| 72 | ReadWord(word, f); |
| 73 | size = atoi(word); |
| 74 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); |
| 75 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); |
| 76 | if (M == NULL) { |
| 77 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); |
| 78 | return -1; |
| 79 | } |
| 80 | for (b = 0; b < words; b++) { |
| 81 | a = 0; |
| 82 | while (1) { |
| 83 | vocab[b * max_w + a] = fgetc(f); |
| 84 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; |
| 85 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; |
| 86 | } |
| 87 | vocab[b * max_w + a] = 0; |
| 88 | for (a = 0; a < size; a++) { |
| 89 | ReadWord(word,f); |
| 90 | M[a + b * size] = atof(word); |
| 91 | } |
| 92 | len = 0; |
| 93 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; |
| 94 | len = sqrt(len); |
| 95 | for (a = 0; a < size; a++) M[a + b * size] /= len; |
| 96 | } |
| 97 | fclose(f); |
| 98 | |
| 99 | //run kmeans |
| 100 | int clcn = classes, iter = 2, closeid; |
| 101 | int *centcn = (int *)malloc(classes * sizeof(int)); |
| 102 | int *cl = (int *)calloc(words, sizeof(int)); |
| 103 | float closev, x; |
| 104 | float *cent = (float *)calloc(classes * size, sizeof(float)); |
| 105 | for (a = 0; a < words; a++) cl[a] = a % clcn; |
| 106 | for (a = 0; a < iter; a++) { |
| 107 | for (b = 0; b < clcn * size; b++) cent[b] = 0; |
| 108 | for (b = 0; b < clcn; b++) centcn[b] = 1; |
| 109 | for (c = 0; c < words; c++) { |
| 110 | for (d = 0; d < size; d++) cent[size * cl[c] + d] += M[c * size + d]; |
| 111 | centcn[cl[c]]++; |
| 112 | } |
| 113 | for (b = 0; b < clcn; b++) { |
| 114 | closev = 0; |
| 115 | for (c = 0; c < size; c++) { |
| 116 | cent[size * b + c] /= centcn[b]; |
| 117 | closev += cent[size * b + c] * cent[size * b + c]; |
| 118 | } |
| 119 | closev = sqrt(closev); |
| 120 | for (c = 0; c < size; c++) cent[size * b + c] /= closev; |
| 121 | } |
| 122 | for (c = 0; c < words; c++) { |
| 123 | closev = -10; |
| 124 | closeid = 0; |
| 125 | for (d = 0; d < clcn; d++) { |
| 126 | x = 0; |
| 127 | for (b = 0; b < size; b++) x += cent[size * d + b] * M[c * size + b]; |
| 128 | if (x > closev) { |
| 129 | closev = x; |
| 130 | closeid = d; |
| 131 | } |
| 132 | } |
| 133 | cl[c] = closeid; |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | // build an array of words ordered by class and their offsets (index where each class starts) |
| 138 | int class_words[words]; |
| 139 | int class_offsets[classes]; |
| 140 | for(a = 0; a < classes; a++) class_offsets[a]=0; |
| 141 | for(a = 0; a < words; a++) class_offsets[cl[a]]++; |
| 142 | for(a = 1; a < classes; a++) class_offsets[a] += class_offsets[a-1]; |
| 143 | for(a = 0; a < words; a++) class_words[--class_offsets[cl[a]]] = a; |
| 144 | |
| 145 | for (a = 0; a < classes; a++){ |
| 146 | c = words; |
| 147 | if(a < classes-1) c = class_offsets[a+1]; |
| 148 | b = class_offsets[a]; |
| 149 | for(; b < c; b++){ |
| 150 | fprintf(fo, "%lld %s\n", a ,&vocab[class_words[b] * max_w]); |
| 151 | } |
| 152 | } |
| 153 | // Save the K-means classes |
| 154 | //for (a = 0; a < words; a++) fprintf(fo, "%s %d\n", &vocab[a * max_w], cl[a]); |
| 155 | free(centcn); |
| 156 | free(cent); |
| 157 | free(cl); |
| 158 | free(M); |
| 159 | free(vocab); |
| 160 | return 0; |
| 161 | } |