Handle KorAP-CoNLLU comment metadata (foundry, filename) in the case of spacy
Change-Id: I9e41e7369126aaa5a0043dca7e4fc7d8609d750b
diff --git a/lib/CoNLL_Annotation.py b/lib/CoNLL_Annotation.py
index 02fae7d..69bf4a7 100644
--- a/lib/CoNLL_Annotation.py
+++ b/lib/CoNLL_Annotation.py
@@ -182,12 +182,14 @@
return ann
-def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:"):
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:", our_foundry="spacy"):
n_sents = 0
annotated_sentences, buffer_meta, buffer_lst = [], [], []
for i, line in enumerate(line_generator):
if line.startswith(comment_str):
- buffer_meta.append(line)
+ line = re.sub(r"(foundry\s*=\s*).*", r"\1" + our_foundry, line)
+ line = re.sub(r"(filename\s*=\s* .[^/]*/[^/]+/[^/]+/).*", r"\1" + our_foundry + "/morpho.xml", line)
+ buffer_meta.append(line)
continue
if len(line.split()) > 0:
buffer_lst.append(line)
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index 4dbe1b4..c5eb8ed 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -45,9 +45,9 @@
yield line
-def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:", our_foundry="spacy"):
file_has_next = True
- chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str, our_foundry=our_foundry)
if n_sents == 0: file_has_next = False
sents, gld, meta = [], [], []
return chunk, file_has_next
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index b39e2f2..36f6de3 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -86,7 +86,7 @@
total_processed_sents = 0
while file_has_next:
- annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
+ annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
if len(annos) == 0: break
total_processed_sents += len(annos)
logger.info(f"Already processed {total_processed_sents} sentences...")