daza | a3c8c8b | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 1 | from collections import defaultdict, OrderedDict |
| 2 | import re |
| 3 | |
| 4 | # CoNLL-U Format - https://universaldependencies.org/format.html |
| 5 | |
| 6 | |
class CoNLLUP_Token():
    """One token row of a CoNLL-U file.

    Column layout (see https://universaldependencies.org/format.html):
    [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    e.g. [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
    """

    def __init__(self, raw_line, word_ix):
        fields = raw_line.split()
        self.info = fields
        self.id = fields[0]  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = fields[1]
        self.lemma = fields[2]
        self.pos_universal = fields[3]
        self.pos_tag = fields[4]
        self.detail_tag = fields[5]
        self.head = fields[6]
        self.dep_tag = fields[7]
        self.blank = fields[8]  # DEPS column; semantics unclear in this corpus (original note: ???)
        self.auto_score = fields[9]

    def get_info(self):
        """Return the ten CoNLL-U columns as a list of strings."""
        return [str(self.id), self.word, self.lemma, self.pos_universal,
                self.pos_tag, self.detail_tag, str(self.head), self.dep_tag,
                self.blank, self.auto_score]

    def get_conllU_line(self, separator="\t"):
        """Serialize the token back into a single CoNLL-U line."""
        return separator.join(self.get_info())
| 32 | |
| 33 | |
daza | e94ab18 | 2020-09-01 16:41:30 +0200 | [diff] [blame^] | 34 | |
class CoNLL09_Token():
    """One token row of a CoNLL-2009 file.

    Example row after splitting:
    ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5',
     'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
    """

    def __init__(self, raw_line, word_ix):
        fields = raw_line.split()
        self.info = fields
        self.id = fields[0]  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = fields[1]
        self.lemma = fields[2]
        self.pos_tag = fields[4]
        self.pos_universal = "_"  # _convert_to_universal(self.pos_tag, self.lemma)
        self.head = fields[8]
        self.dep_tag = fields[10]
        self.detail_tag = "_"
        # Column 13 (FILLPRED) marks whether this token is a predicate.
        self.is_pred = fields[12] == "Y"
        if self.is_pred:
            self.pred_sense = fields[13].strip("[]")
            self.pred_sense_id = str(self.position) + "##" + self.pred_sense
        else:
            self.pred_sense = None
            self.pred_sense_id = ""
        # Remaining columns (if any) are the per-predicate argument labels.
        self.labels = fields[14:] if len(fields) > 14 else []

    def get_conllU_line(self, separator="\t"):
        # Target layout: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
        # NOTE(review): pos_tag ends up in the UPOS slot and pos_universal ("_")
        # in XPOS — looks swapped w.r.t. the layout above; kept unchanged here
        # since downstream consumers may rely on the current column order.
        tok_id = str(self.id)  # .split("_")[0]
        columns = [tok_id, self.word, self.lemma, self.pos_tag, self.pos_universal,
                   self.detail_tag, self.head, self.dep_tag, "_", "_"]
        return separator.join(columns)

    def get_conll09_line(self, delim="\t"):
        """Serialize back into a CoNLL-2009 line, e.g.:
        1	Frau	Frau	Frau	NN	NN	_	nom|sg|fem	5	5	CJ	CJ	_	_	AM-DIS	_
        10	fall	fall	fall	VB	VB	_	_	8	8	VC	VC	Y	fall.01	_	_	_	_	_
        """
        pred_flag = "Y" if self.is_pred else "_"
        sense = self.pred_sense if self.is_pred else "_"
        columns = [self.id, self.word, self.lemma, self.lemma, self.pos_tag,
                   self.pos_tag, "_", self.detail_tag, self.head, self.head,
                   self.dep_tag, self.dep_tag, pred_flag, sense] + self.labels
        return delim.join(columns)
| 77 | |
| 78 | |
| 79 | |
daza | a3c8c8b | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 80 | ################################# GETTING SENTENCE ANNOTATIONS #################################### |
class AnnotatedSentence():
    """A sentence: its metadata lines plus an ordered list of token objects."""

    def __init__(self):
        self.metadata = []  # "###C:"-style comment lines, newline-stripped
        self.tokens = []    # CoNLLUP_Token / CoNLL09_Token instances, in order

    def get_words(self):
        """Return the surface forms of all tokens, in sentence order."""
        return [token.word for token in self.tokens]

    def get_sentence(self):
        """Return the sentence as a single space-joined string."""
        return " ".join(self.get_words())

    def get_pos_tags(self, universal=False):
        """Return POS tags for all tokens; universal=True selects UPOS."""
        attr = "pos_universal" if universal else "pos_tag"
        return [getattr(token, attr) for token in self.tokens]
| 97 | |
| 98 | |
daza | e94ab18 | 2020-09-01 16:41:30 +0200 | [diff] [blame^] | 99 | def get_annotation(raw_lines, raw_meta, token_class): |
daza | a3c8c8b | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 100 | ann = AnnotatedSentence() |
| 101 | ann.metadata = [m.strip("\n") for m in raw_meta] |
| 102 | # Annotate the predicates and senses |
| 103 | real_index = 0 |
| 104 | for i, line in enumerate(raw_lines): |
| 105 | tok = token_class(line, real_index) |
daza | e94ab18 | 2020-09-01 16:41:30 +0200 | [diff] [blame^] | 106 | ann.tokens.append(tok) |
daza | a3c8c8b | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 107 | real_index += 1 |
| 108 | return ann |
daza | e94ab18 | 2020-09-01 16:41:30 +0200 | [diff] [blame^] | 109 | |
| 110 | |
def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token):
    """Read sentences from an iterable of CoNLL-formatted lines.

    Sentences are separated by blank lines; lines starting with "###C:" are
    collected as sentence metadata. Stops early once chunk_size sentences
    have been read (chunk_size <= 0 means no limit).
    Returns (annotated_sentences, n_sents).
    """
    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if line.startswith("###C:"):
            buffer_meta.append(line)
            continue
        if line.split():
            # Non-blank line: one token row of the current sentence.
            buffer_lst.append(line)
        else:
            # Blank line: the current sentence is complete.
            ann = get_annotation(buffer_lst, buffer_meta, token_class)
            n_sents += 1
            buffer_lst, buffer_meta = [], []
            annotated_sentences.append(ann)
            if chunk_size > 0 and n_sents == chunk_size:
                break
    else:
        # Fix: flush a trailing sentence when the input does not end with a
        # blank line (the original silently dropped it). Only reached when
        # the loop was NOT cut short by chunk_size.
        if buffer_lst:
            annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
            n_sents += 1
    # logger.info("Read {} Sentences!".format(n_sents))
    return annotated_sentences, n_sents
| 128 | |
| 129 | |
def read_conll_generator(filepath, token_class=CoNLLUP_Token):
    """Lazily yield AnnotatedSentence objects from a CoNLL file.

    NOTE: unlike read_conll(), "###C:" metadata lines are skipped and NOT
    attached to the yielded sentences (behavior kept from the original).
    """
    buffer_meta, buffer_lst = [], []
    with open(filepath) as f:
        # Iterate the file lazily instead of materializing readlines().
        for line in f:
            if line.startswith("###C:"):
                continue
            if line.split():
                buffer_lst.append(line)
            else:
                # Blank line terminates the current sentence.
                yield get_annotation(buffer_lst, buffer_meta, token_class)
                buffer_lst, buffer_meta = [], []
    # Fix: emit the final sentence when the file lacks a trailing blank
    # line (the original silently dropped it).
    if buffer_lst:
        yield get_annotation(buffer_lst, buffer_meta, token_class)