Blame - kmeans_txt.c - ids-kl/dereko2vec

blob: 16934abdf0eae3d99a7fe6c63e7397678d252a28 [file] [log] [blame]

Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1	// Copyright 2013 Google Inc. All Rights Reserved.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	#include <stdio.h>
				16	#include <string.h>
				17	#include <math.h>
				18	#include <stdlib.h>
				19
				20	const long long max_size = 2000; // max length of strings
				21	const long long N = 40; // number of closest words that will be shown
				22	const long long max_w = 50; // max length of vocabulary entries
				23
				24	#define MAX_STRING 100
				25	void ReadWord(char word, FILE fin) {
				26	int a = 0, ch;
				27	while (!feof(fin)) {
				28	ch = fgetc(fin);
				29	if (ch == 13) continue;
				30	if ((ch == ' ') \|\| (ch == '\t') \|\| (ch == '\n')) {
				31	if (a > 0) {
				32	if (ch == '\n') ungetc(ch, fin);
				33	break;
				34	}
				35	if (ch == '\n') {
				36	strcpy(word, (char *)"</s>");
				37	return;
				38	} else continue;
				39	}
				40	word[a] = ch;
				41	a++;
				42	if (a >= MAX_STRING - 1) a--; // Truncate too long words
				43	}
				44	word[a] = 0;
				45	}
				46
				47	int main(int argc, char **argv) {
				48	FILE *f;
				49	char file_name[max_size], output_file[max_size];
				50	float len;
				51	long long words, size, a, b, c, d;
				52	float *M;
				53	char *vocab;
				54	char word[MAX_STRING];
				55	if (argc < 3) {
				56	printf("Usage: ./kmeans_txt <FILE>\nwhere FILE contains features\n <number_of_classes>");
				57	return 0;
				58	}
				59	strcpy(file_name, argv[1]);
				60	strcpy(output_file, argv[2]);
				61	int classes = atoi(argv[3]);
				62	f = fopen(file_name, "rb");
				63	if (f == NULL) {
				64	printf("Input file not found\n");
				65	return -1;
				66	}
				67
				68	FILE *fo = fopen(output_file, "wb");
				69
				70	ReadWord(word, f);
				71	words = atoi(word);
				72	ReadWord(word, f);
				73	size = atoi(word);
				74	vocab = (char )malloc((long long)words max_w * sizeof(char));
				75	M = (float )malloc((long long)words (long long)size * sizeof(float));
				76	if (M == NULL) {
				77	printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
				78	return -1;
				79	}
				80	for (b = 0; b < words; b++) {
				81	a = 0;
				82	while (1) {
				83	vocab[b * max_w + a] = fgetc(f);
				84	if (feof(f) \|\| (vocab[b * max_w + a] == ' ')) break;
				85	if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
				86	}
				87	vocab[b * max_w + a] = 0;
				88	for (a = 0; a < size; a++) {
				89	ReadWord(word,f);
				90	M[a + b * size] = atof(word);
				91	}
				92	len = 0;
				93	for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
				94	len = sqrt(len);
				95	for (a = 0; a < size; a++) M[a + b * size] /= len;
				96	}
				97	fclose(f);
				98
				99	//run kmeans
				100	int clcn = classes, iter = 2, closeid;
				101	int centcn = (int )malloc(classes * sizeof(int));
				102	int cl = (int )calloc(words, sizeof(int));
				103	float closev, x;
				104	float cent = (float )calloc(classes * size, sizeof(float));
				105	for (a = 0; a < words; a++) cl[a] = a % clcn;
				106	for (a = 0; a < iter; a++) {
				107	for (b = 0; b < clcn * size; b++) cent[b] = 0;
				108	for (b = 0; b < clcn; b++) centcn[b] = 1;
				109	for (c = 0; c < words; c++) {
				110	for (d = 0; d < size; d++) cent[size * cl[c] + d] += M[c * size + d];
				111	centcn[cl[c]]++;
				112	}
				113	for (b = 0; b < clcn; b++) {
				114	closev = 0;
				115	for (c = 0; c < size; c++) {
				116	cent[size * b + c] /= centcn[b];
				117	closev += cent[size * b + c] * cent[size * b + c];
				118	}
				119	closev = sqrt(closev);
				120	for (c = 0; c < size; c++) cent[size * b + c] /= closev;
				121	}
				122	for (c = 0; c < words; c++) {
				123	closev = -10;
				124	closeid = 0;
				125	for (d = 0; d < clcn; d++) {
				126	x = 0;
				127	for (b = 0; b < size; b++) x += cent[size * d + b] * M[c * size + b];
				128	if (x > closev) {
				129	closev = x;
				130	closeid = d;
				131	}
				132	}
				133	cl[c] = closeid;
				134	}
				135	}
				136
				137	// build an array of words ordered by class and their offsets (index where each class starts)
				138	int class_words[words];
				139	int class_offsets[classes];
				140	for(a = 0; a < classes; a++) class_offsets[a]=0;
				141	for(a = 0; a < words; a++) class_offsets[cl[a]]++;
				142	for(a = 1; a < classes; a++) class_offsets[a] += class_offsets[a-1];
				143	for(a = 0; a < words; a++) class_words[--class_offsets[cl[a]]] = a;
				144
				145	for (a = 0; a < classes; a++){
				146	c = words;
				147	if(a < classes-1) c = class_offsets[a+1];
				148	b = class_offsets[a];
				149	for(; b < c; b++){
				150	fprintf(fo, "%lld %s\n", a ,&vocab[class_words[b] * max_w]);
				151	}
				152	}
				153	// Save the K-means classes
				154	//for (a = 0; a < words; a++) fprintf(fo, "%s %d\n", &vocab[a * max_w], cl[a]);
				155	free(centcn);
				156	free(cent);
				157	free(cl);
				158	free(M);
				159	free(vocab);
				160	return 0;
				161	}