Add morphological features (FEATS) to CoNLL-U output
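
Populate the FEATS column (6th CoNLL-U column) from spaCy's morphological
analysis instead of always writing "_". The new helper
format_morphological_features() reads token.morph, sorts the features by
name and joins them as Feature=Value|Feature2=Value2; tokens without
morphology (or pipelines whose tokens expose no .morph attribute) still
get "_".

Minimal sketch of the intended behaviour (assumes a spaCy 3.x German model
such as de_core_news_sm is installed; model name and the exact feature
values shown are illustrative, not part of this change):

    import spacy
    nlp = spacy.load("de_core_news_sm")
    token = nlp("Der Hund schläft.")[1]   # "Hund"
    print(token.morph.to_dict())
    # e.g. {'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}
    print("|".join(f"{k}={v}" for k, v in sorted(token.morph.to_dict().items())))
    # e.g. Case=Nom|Gender=Masc|Number=Sing

which is the string format_morphological_features(token) now writes into
FEATS.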

Change-Id: Iccb044fd31f93c681de5036f37720304e6b5c2c1
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index 90ca237..e4259bf 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py
@@ -6,6 +6,31 @@
 import my_utils.file_utils as fu
 from germalemma import GermaLemma
 
+def format_morphological_features(token):
+	"""
+	Extract and format morphological features from a spaCy token for CoNLL-U output.
+	
+	Args:
+		token: spaCy token object
+		
+	Returns:
+		str: Formatted morphological features string for the CoNLL-U FEATS (6th) column.
+			 Returns "_" if no features are available.
+	"""
+	if not hasattr(token, 'morph') or not token.morph:
+		return "_"
+	
+	morph_dict = token.morph.to_dict()
+	if not morph_dict:
+		return "_"
+	
+	# Build the CoNLL-U FEATS string: Feature=Value|Feature2=Value2, sorted by feature name
+	features = []
+	for feature, value in sorted(morph_dict.items()):
+		features.append(f"{feature}={value}")
+	
+	return "|".join(features)
+
 
 class WhitespaceTokenizer(object):
 	def __init__(self, vocab):
@@ -22,10 +47,11 @@
 	#  First lines are comments. (metadata)
 	conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
 	for ix, token in enumerate(spacy_doc):
+		morph_features = format_morphological_features(token)
 		if use_germalemma == "True":
-			content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+			content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
 		else:
-			content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+			content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
 		conll_lines.append("\t".join(content))
 	return "\n".join(conll_lines)
 
diff --git a/systems/parse_spacy3.py b/systems/parse_spacy3.py
index 14e5a9f..9696a9d 100644
--- a/systems/parse_spacy3.py
+++ b/systems/parse_spacy3.py
@@ -7,6 +7,31 @@
 import my_utils.file_utils as fu
 from germalemma import GermaLemma
 
+def format_morphological_features(token):
+	"""
+	Extract and format morphological features from a spaCy token for CoNLL-U output.
+	
+	Args:
+		token: spaCy token object
+		
+	Returns:
+		str: Formatted morphological features string for the CoNLL-U FEATS (6th) column.
+			 Returns "_" if no features are available.
+	"""
+	if not hasattr(token, 'morph') or not token.morph:
+		return "_"
+	
+	morph_dict = token.morph.to_dict()
+	if not morph_dict:
+		return "_"
+	
+	# Build the CoNLL-U FEATS string: Feature=Value|Feature2=Value2, sorted by feature name
+	features = []
+	for feature, value in sorted(morph_dict.items()):
+		features.append(f"{feature}={value}")
+	
+	return "|".join(features)
+
 
 @Language.factory("my_component")
 class WhitespaceTokenizer(object):
@@ -24,10 +49,11 @@
 	#  First lines are comments. (metadata)
 	conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
 	for ix, token in enumerate(spacy_doc):
+		morph_features = format_morphological_features(token)
 		if use_germalemma == "True":
-			content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+			content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
 		else:
-			content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+			content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
 		conll_lines.append("\t".join(content))
 	return "\n".join(conll_lines)
 
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index 0b66e4b..eeb1efa 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -7,6 +7,31 @@
 import my_utils.file_utils as fu
 from germalemma import GermaLemma
 
+def format_morphological_features(token):
+	"""
+	Extract and format morphological features from a spaCy token for CoNLL-U output.
+	
+	Args:
+		token: spaCy token object
+		
+	Returns:
+		str: Formatted morphological features string for the CoNLL-U FEATS (6th) column.
+			 Returns "_" if no features are available.
+	"""
+	if not hasattr(token, 'morph') or not token.morph:
+		return "_"
+	
+	morph_dict = token.morph.to_dict()
+	if not morph_dict:
+		return "_"
+	
+	# Build the CoNLL-U FEATS string: Feature=Value|Feature2=Value2, sorted by feature name
+	features = []
+	for feature, value in sorted(morph_dict.items()):
+		features.append(f"{feature}={value}")
+	
+	return "|".join(features)
+
 
 class WhitespaceTokenizer(object):
 	def __init__(self, vocab):
@@ -23,10 +48,11 @@
 	#  First lines are comments. (metadata)
 	conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
 	for ix, token in enumerate(spacy_doc):
+		morph_features = format_morphological_features(token)
 		if use_germalemma == "True":
-			content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+			content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
 		else:
-			content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+			content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
 		conll_lines.append("\t".join(content))
 	return "\n".join(conll_lines)