feldmueller | b103220 | 2025-02-11 09:02:09 +0100 | [diff] [blame] | 1 | import sys |
| 2 | |
def load_vocab(file_path):
    """Load a vocabulary file into a dictionary.

    Each non-blank line is expected to have the form ``<word> <freq>``: the
    text after the last space is parsed as an integer frequency and
    everything before it is the word (so a word may itself contain spaces).
    Lines consisting only of whitespace are ignored.

    Args:
        file_path: Path of the vocabulary file. It is read as UTF-8 and
            undecodable bytes are replaced instead of raising.

    Returns:
        A dict mapping each word to its integer frequency. When a word
        occurs more than once it is printed to stdout and the last
        frequency read wins.

    Raises:
        ValueError: If a non-blank line contains no space separator or its
            frequency field is not a valid integer.
    """
    vocab = {}
    with open(file_path, 'r', encoding='utf-8', errors="replace") as f:
        for line in f:
            # A blank line (typically the trailing one at the end of the
            # file) holds no entry; skip it rather than fail the unpack below.
            if not line.strip():
                continue
            # Only the newline is stripped so that leading/trailing spaces
            # belonging to the word itself are preserved.
            word, freq = line.strip("\n").rsplit(" ", maxsplit=1)
            if word in vocab:
                # Report the duplicate; the later entry overwrites the earlier.
                print(word)
            vocab[word] = int(freq)
    return vocab
| 13 | |
def merge_vocab(reference_path, focus_path, merged_path):
    """Write the reference vocabulary with frequencies taken from the focus vocabulary.

    The output contains exactly the words of the reference vocabulary, in the
    order they were loaded. Each word's frequency is replaced by its frequency
    in the focus vocabulary, or 0 when the focus vocabulary does not contain
    it. Words that appear only in the focus vocabulary are not written.

    Args:
        reference_path: Path of the vocabulary file that defines the word set.
        focus_path: Path of the vocabulary file that supplies the frequencies.
        merged_path: Path of the output file, written as UTF-8 with one
            ``<word> <freq>`` entry per line. An existing file is overwritten.
    """
    reference_vocab = load_vocab(reference_path)
    focus_vocab = load_vocab(focus_path)

    # The merged vocabulary is keyed by the reference words themselves, so it
    # always has the same words as the reference; no separate consistency
    # check is needed.
    merged_vocab = {word: focus_vocab.get(word, 0) for word in reference_vocab}

    with open(merged_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{word} {freq}\n" for word, freq in merged_vocab.items())
| 42 | |
if __name__ == "__main__":
    # Command-line entry point. Exactly three arguments are required, e.g.:
    #   python script.py reference_vocab.txt focus_vocab.txt merged_vocab.txt
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python script.py <reference_vocab> <focus_vocab> <merged_vocab>")
        sys.exit(1)

    # The argument count was validated above, so this unpack cannot fail.
    reference_vocab_path, focus_vocab_path, merged_vocab_path = cli_args
    merge_vocab(reference_vocab_path, focus_vocab_path, merged_vocab_path)