wang2vec: add option -show-cc <int> to show collocations
Show words with their collocators starting from word rank
<int>. Depends on -read-vocab and -read-net.
diff --git a/word2vecExt.c b/word2vecExt.c
index 4e05e96..1cfff99 100644
--- a/word2vecExt.c
+++ b/word2vecExt.c
@@ -59,6 +59,8 @@
const int table_size = 1e8;
int *table;
+long cc = 0; // -show-cc <int>: vocabulary index ShowCollocations() starts from; the listing only runs when cc > 0
+
//constrastive negative sampling
char negative_classes_file[MAX_STRING];
int *word_to_group;
@@ -1345,6 +1347,10 @@
* (EXP_TABLE_SIZE
/ MAX_EXP / 2))])
* alpha;
+ if(debug_mode > 2 && ((long long) id) == 0) {
+ printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
+ printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
+ }
for (c = 0; c < layer1_size; c++)
neu1e[c] +=
g
@@ -1580,6 +1586,63 @@
pthread_exit(NULL);
}
+/*
+ * Print, for every vocabulary word from index cc onwards, the best-scoring
+ * collocate at each window position.
+ *
+ * For word d and each of the 2*window positions around it, every other
+ * vocabulary word is scored by looking up, in expTable, the dot product of
+ * row d of syn0 with that candidate's slice of syn1neg_window for the
+ * position. One output line is written per word: the top candidate and its
+ * score for each position, the word itself in bold at the centre, then the
+ * candidate with the largest score summed over all positions ("max sum",
+ * shown divided by 2*window) and the single best positional candidate
+ * ("max resp."). A position (or word) for which no candidate produced a
+ * score in range is shown as "-" with score -1.00.
+ *
+ * Reads the globals cc, vocab, vocab_size, window, layer1_size,
+ * window_layer_size, syn0, syn1neg_window and expTable; writes to stdout.
+ * Exits with status 1 if the scratch buffer cannot be allocated.
+ */
+void ShowCollocations() {
+  long a, b, c, d, window_offset, target, max_target, maxmax_target;
+  real f, max_f, maxmax_f;
+  real *target_sums = NULL;
+
+  /* posix_memalign signals failure through its return value; the pointer
+   * must not be used unless the call returned 0. */
+  if (posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real)) != 0) {
+    fprintf(stderr, "Memory allocation failed\n");
+    exit(1);
+  }
+
+  for (d = cc; d < vocab_size; d++) {
+    /* Per-word accumulator of each candidate's score over all positions. */
+    for (b = 0; b < vocab_size; b++)
+      target_sums[b] = 0;
+    maxmax_f = -1;
+    maxmax_target = -1; /* -1: no in-range candidate seen for this word yet */
+
+    for (a = 0; a < window * 2 + 1; a++) {
+      if (a == window) {
+        /* Centre position: the word itself, in bold. */
+        printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
+        continue;
+      }
+
+      /* The centre has no slice of its own in the offset arithmetic, so
+       * positions to its right are shifted down by one slice. */
+      window_offset = a * layer1_size;
+      if (a > window)
+        window_offset -= layer1_size;
+
+      /* Reset per position so a stale index from an earlier position or
+       * word is never printed when every candidate is skipped. */
+      max_f = -1;
+      max_target = -1;
+      for (target = 0; target < vocab_size; target++) {
+        if (target == d)
+          continue;
+        f = 0;
+        for (c = 0; c < layer1_size; c++)
+          f += syn0[d * layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
+        /* Only dot products within [-MAX_EXP, MAX_EXP] are looked up.
+         * NOTE(review): candidates above MAX_EXP are dropped rather than
+         * treated as saturated; confirm this is intended. */
+        if (f < -MAX_EXP || f > MAX_EXP)
+          continue;
+        f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+        if (f > max_f) {
+          max_f = f;
+          max_target = target;
+        }
+        target_sums[target] += f;
+      }
+
+      if (max_target >= 0)
+        printf("%s (%.2f) ", vocab[max_target].word, max_f);
+      else
+        printf("- (%.2f) ", max_f);
+      if (max_f > maxmax_f) {
+        maxmax_f = max_f;
+        maxmax_target = max_target;
+      }
+    }
+
+    /* Candidate with the largest score summed over all positions. Sums start
+     * at 0 and max_f at -1, so index 0 is taken unless a later sum is larger. */
+    max_f = -1;
+    max_target = 0;
+    for (b = 0; b < vocab_size; b++) {
+      if (target_sums[b] > max_f) {
+        max_f = target_sums[b];
+        max_target = b;
+      }
+    }
+    printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
+        vocab[max_target].word, max_f/window/2,
+        maxmax_target >= 0 ? vocab[maxmax_target].word : "-", maxmax_f);
+  }
+
+  free(target_sums);
+}
+
void TrainModel() {
long a, b, c, d;
FILE *fo;
@@ -1595,6 +1658,8 @@
if (output_file[0] == 0)
return;
InitNet();
+ if(cc > 0)
+ ShowCollocations();
if (negative > 0 || nce > 0)
InitUnigramTable();
if (negative_classes_file[0] != 0)
@@ -1748,6 +1813,8 @@
"\t\tThe net parameters will be read from <file>, not initialized randomly\n");
printf("\t-save-net <file>\n");
printf("\t\tThe net parameters will be saved to <file>\n");
+ printf("\t-show-cc <int>\n");
+ printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
printf("\t-type <int>\n");
printf(
"\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
@@ -1781,6 +1848,8 @@
debug_mode = atoi(argv[i + 1]);
if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
binary = atoi(argv[i + 1]);
+ if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
+ cc = atoi(argv[i + 1]);
if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
type = atoi(argv[i + 1]);
if ((i = ArgPos((char *) "-output", argc, argv)) > 0)