Handle KorAP-CoNLLU comment metadata (foundry, filename) in the case of spacy
Change-Id: I9e41e7369126aaa5a0043dca7e4fc7d8609d750b
diff --git a/lib/CoNLL_Annotation.py b/lib/CoNLL_Annotation.py
index 02fae7d..69bf4a7 100644
--- a/lib/CoNLL_Annotation.py
+++ b/lib/CoNLL_Annotation.py
@@ -182,12 +182,14 @@
return ann
-def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:"):
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:", our_foundry="spacy"):
n_sents = 0
annotated_sentences, buffer_meta, buffer_lst = [], [], []
for i, line in enumerate(line_generator):
if line.startswith(comment_str):
- buffer_meta.append(line)
+ line = re.sub(r"(foundry\s*=\s*).*", r"\1" + our_foundry, line)
+ line = re.sub(r"(filename\s*=\s* .[^/]*/[^/]+/[^/]+/).*", r"\1" + our_foundry + "/morpho.xml", line)
+ buffer_meta.append(line)
continue
if len(line.split()) > 0:
buffer_lst.append(line)
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index 4dbe1b4..c5eb8ed 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -45,9 +45,9 @@
yield line
-def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:", our_foundry="spacy"):
file_has_next = True
- chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str, our_foundry=our_foundry)
if n_sents == 0: file_has_next = False
sents, gld, meta = [], [], []
return chunk, file_has_next
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index b39e2f2..36f6de3 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -86,7 +86,7 @@
total_processed_sents = 0
while file_has_next:
- annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
+ annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
if len(annos) == 0: break
total_processed_sents += len(annos)
logger.info(f"Already processed {total_processed_sents} sentences...")