blob: dc2c16d50dabe3281917b2ced67484545f3a55be [file] [log] [blame]
feldmuellerb1032202025-02-11 09:02:09 +01001import sys
2
def load_vocab(file_path):
    """Load a vocabulary file into a dictionary.

    Each entry line must have the form ``<word> <frequency>``. A line is
    split on its last space only, so the word itself may contain spaces.
    Blank (empty or whitespace-only) lines are skipped.

    Args:
        file_path: Path of the vocabulary file. It is read as UTF-8, and
            undecodable bytes are replaced instead of raising.

    Returns:
        A dict mapping each word to its integer frequency. When a word
        occurs more than once, the word is printed and the frequency of
        its last occurrence is kept.

    Raises:
        ValueError: If a non-blank line has no space-separated frequency
            field, or if that field is not an integer.
    """
    vocab = {}
    with open(file_path, 'r', encoding='utf-8', errors="replace") as f:
        for line_number, line in enumerate(f, start=1):
            entry = line.strip("\n")
            if not entry.strip():
                # A blank line holds no entry; skipping it avoids aborting
                # the whole load on a stray empty line.
                continue

            # Split on the LAST space so multi-word entries stay intact.
            word, separator, freq = entry.rpartition(" ")
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<word> <frequency>', "
                    f"got {entry!r}"
                )

            try:
                count = int(freq)
            except ValueError as err:
                raise ValueError(
                    f"{file_path}:{line_number}: invalid frequency {freq!r}"
                ) from err

            if word in vocab:
                # Report duplicated words; the later frequency overrides.
                print(word)
            vocab[word] = count
    return vocab
13
def merge_vocab(reference_path, focus_path, merged_path):
    """Write the reference vocabulary with frequencies from the focus vocabulary.

    The output holds exactly the words of the reference vocabulary, in the
    order ``load_vocab`` returned them. Each word gets its frequency from the
    focus vocabulary, or 0 when the focus vocabulary does not contain it.
    Words that appear only in the focus vocabulary are not written.

    Args:
        reference_path: Path of the vocabulary file that defines the words.
        focus_path: Path of the vocabulary file that supplies frequencies.
        merged_path: Path of the output file. It is written as UTF-8 with
            one ``<word> <frequency>`` line per reference word.
    """
    # Load reference and focus vocabularies
    reference_vocab = load_vocab(reference_path)
    focus_vocab = load_vocab(focus_path)

    # The merged dict is keyed by the reference words themselves, so it
    # always has the same entries as the reference vocabulary and needs
    # no separate consistency check.
    merged_vocab = {word: focus_vocab.get(word, 0) for word in reference_vocab}

    # Save the merged vocabulary to the output file
    with open(merged_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{word} {freq}\n" for word, freq in merged_vocab.items())
42
if __name__ == "__main__":
    # Command-line entry point. Example:
    #   python script.py reference_vocab.txt focus_vocab.txt merged_vocab.txt

    # Exactly three arguments are required after the script name.
    if len(sys.argv) != 4:
        print("Usage: python script.py <reference_vocab> <focus_vocab> <merged_vocab>")
        sys.exit(1)

    reference_vocab_path, focus_vocab_path, merged_vocab_path = sys.argv[1:]

    merge_vocab(reference_vocab_path, focus_vocab_path, merged_vocab_path)
55 merge_vocab(reference_vocab_path, focus_vocab_path, merged_vocab_path)