# blob: dc2c16d50dabe3281917b2ced67484545f3a55be [file] [log] [blame]
import sys
def load_vocab(file_path):
    """Load a vocabulary file into a dictionary.

    Each non-blank line must have the form ``<word> <frequency>``. The line
    is split on its *last* space, so the word part may itself contain spaces.
    Blank (empty or whitespace-only) lines are skipped. Bytes that are not
    valid UTF-8 are replaced instead of raising (``errors="replace"``).

    If a word occurs more than once, it is printed to stdout and the
    frequency from the later line overwrites the earlier one.

    Args:
        file_path: Path of the vocabulary file to read.

    Returns:
        A dict mapping each word to its integer frequency.

    Raises:
        ValueError: If a non-blank line has no space-separated frequency
            field, or the frequency is not an integer. The message names
            the file and line number.
    """
    vocab = {}
    with open(file_path, 'r', encoding='utf-8', errors="replace") as f:
        for line_no, line in enumerate(f, start=1):
            entry = line.strip("\n")
            if not entry.strip():
                # Tolerate blank lines (e.g. a trailing empty line) instead
                # of failing on the unpack below.
                continue
            try:
                word, freq = entry.rsplit(" ", maxsplit=1)
                count = int(freq)
            except ValueError as err:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<word> <frequency>', "
                    f"got {entry!r}"
                ) from err
            if word in vocab:
                # Report duplicated words; the later entry wins.
                print(word)
            vocab[word] = count
    return vocab
def merge_vocab(reference_path, focus_path, merged_path):
    """Merge vocabularies by replacing reference frequencies with focus frequencies.

    The merged vocabulary contains exactly the words of the reference
    vocabulary, in the order ``load_vocab`` returned them. Each word is paired
    with its frequency from the focus vocabulary, or ``0`` when the focus
    vocabulary does not contain it. Words that appear only in the focus
    vocabulary are not written.

    Args:
        reference_path: Path of the vocabulary file that defines the word set.
        focus_path: Path of the vocabulary file that supplies the frequencies.
        merged_path: Path of the output file; written as UTF-8 with one
            ``<word> <frequency>`` pair per line (overwritten if it exists).
    """
    # Load reference and focus vocabularies
    reference_vocab = load_vocab(reference_path)
    focus_vocab = load_vocab(focus_path)

    # Built by iterating the reference keys, so the result always has the
    # same words (and therefore the same length) as the reference vocabulary;
    # a separate consistency check afterwards could never fail.
    merged_vocab = {word: focus_vocab.get(word, 0) for word in reference_vocab}

    # Save the merged vocabulary to the output file
    with open(merged_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{word} {freq}\n" for word, freq in merged_vocab.items())
if __name__ == "__main__":
    # Command-line entry point. Expected invocation:
    #   python script.py reference_vocab.txt focus_vocab.txt merged_vocab.txt
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        # Wrong number of arguments: show the usage line and exit with status 1.
        print("Usage: python script.py <reference_vocab> <focus_vocab> <merged_vocab>")
        sys.exit(1)
    reference_vocab_path, focus_vocab_path, merged_vocab_path = cli_args
    merge_vocab(reference_vocab_path, focus_vocab_path, merged_vocab_path)