Handle KorAP-CoNLLU comment metadata (foundry, filename) for spaCy output
Change-Id: I9e41e7369126aaa5a0043dca7e4fc7d8609d750b
diff --git a/lib/CoNLL_Annotation.py b/lib/CoNLL_Annotation.py
index 02fae7d..69bf4a7 100644
--- a/lib/CoNLL_Annotation.py
+++ b/lib/CoNLL_Annotation.py
@@ -182,12 +182,14 @@
return ann
-def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:"):
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:", our_foundry="spacy"):
n_sents = 0
annotated_sentences, buffer_meta, buffer_lst = [], [], []
for i, line in enumerate(line_generator):
if line.startswith(comment_str):
- buffer_meta.append(line)
+ line = re.sub("(foundry\s*=\s*).*", r"\1" + our_foundry, line)
+ line = re.sub("(filename\s*=\s* .[^/]*/[^/]+/[^/]+/).*", r"\1" + our_foundry + "/morpho.xml", line)
+ buffer_meta.append(line)
continue
if len(line.split()) > 0:
buffer_lst.append(line)