add script to create vocab counts based on focus corpus
Change-Id: I84dc39ce8230fc61b6e2c31b963dd2f2cce51aab
diff --git a/README.md b/README.md
index 37fd7d9..5fef2fe 100644
--- a/README.md
+++ b/README.md
@@ -47,12 +47,25 @@
## Retrain existing model with new data
-For example:
+### Retrain vectors
+For example:
```bash
dereko2vec -train new.traindata -output new.vecs -save-net new.net -type 3 -size 200 -window 5 -negative 10 -threads 44 -binary 1 -iter 100 -read-vocab old.vocab -read-net old.net
```
+### Create a new RocksDB
+
+```bash
+dereko2vec -train new.traindata -output new.rocksdb -type 5 -window 5 -threads 8 -binary 1 -iter 1 -read-vocab old.vocab -sample 0 -min-count 0
+dereko2vec -train new.traindata -output .temp.rocksdb -type 5 -window 5 -threads 8 -binary 1 -iter 1 -save-vocab new_focus.vocab -sample 0 -min-count 0
+rm -rf .temp.rocksdb
+python scripts/merge_vocabs.py old.vocab new_focus.vocab new.vocab
+```
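+
+The second `dereko2vec` run only serves to save the focus-corpus vocabulary (`new_focus.vocab`); its `.temp.rocksdb` output is deleted again.
+`scripts/merge_vocabs.py` then writes `new.vocab`, keeping the words of `old.vocab` in their original order but replacing each count with the word's frequency in the focus corpus (0 for words that do not occur there).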
+
## References
```bash
diff --git a/scripts/merge_vocabs.py b/scripts/merge_vocabs.py
new file mode 100644
index 0000000..dc2c16d
--- /dev/null
+++ b/scripts/merge_vocabs.py
@@ -0,0 +1,62 @@
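+"""Merge two dereko2vec vocabulary files.
+
+Keep the words of the reference vocabulary in their original order and
+replace each count with the word's frequency in the focus vocabulary,
+using 0 for words that do not occur in the focus corpus.
+"""
+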
+import sys
+
+def load_vocab(file_path):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = {}
+ with open(file_path, 'r', encoding='utf-8', errors="replace") as f:
+ for line in f:
+ word, freq = line.strip("\n").rsplit(" ", maxsplit=1)
+ if word in vocab:
+ print(word)
+ vocab[word] = int(freq)
+ return vocab
+
+def merge_vocab(reference_path, focus_path, merged_path):
+ """Merges vocabularies by replacing reference frequencies with focus frequencies."""
+ # Load reference and focus vocabularies
+ reference_vocab = load_vocab(reference_path)
+ focus_vocab = load_vocab(focus_path)
+
+ # Create the merged vocabulary
+ merged_vocab = {}
+ for word in reference_vocab:
+ # Replace frequency with focus_vocab frequency, or set to 0 if not found
+ merged_vocab[word] = focus_vocab.get(word, 0)
+
+    # Sanity check: the merged vocab must contain exactly the reference words
+
+    len_merged = len(merged_vocab)
+    len_reference = len(reference_vocab)
+
+    if len_reference != len_merged:
+        print(f"Reference vocab length ({len_reference}) and merged vocab length ({len_merged}) differ.")
+        for word in reference_vocab:
+            if word not in merged_vocab:
+                print(f"{word} is missing in merged vocab!")
+
+
+    # Save the merged vocabulary to the output file
+    with open(merged_path, 'w', encoding='utf-8') as f:
+        for word, freq in merged_vocab.items():
+            f.write(f"{word} {freq}\n")
+
+if __name__ == "__main__":
+    # Example usage:
+    # python scripts/merge_vocabs.py reference_vocab.txt focus_vocab.txt merged_vocab.txt
+
+    if len(sys.argv) != 4:
+        print("Usage: python scripts/merge_vocabs.py <reference_vocab> <focus_vocab> <merged_vocab>")
+        sys.exit(1)
+
+    reference_vocab_path = sys.argv[1]
+    focus_vocab_path = sys.argv[2]
+    merged_vocab_path = sys.argv[3]
+
+    merge_vocab(reference_vocab_path, focus_vocab_path, merged_vocab_path)
\ No newline at end of file