Add morphological features
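
The new helper format_morphological_features() fills the FEATS (6th)
column of the CoNLL-U output with spaCy's morphological analysis instead
of the "_" placeholder. A sketch of the expected behaviour, with
illustrative feature values rather than output from an actual model run:

    token.morph                            ->  Case=Nom|Gender=Fem|Number=Sing
    FEATS column                           ->  Case=Nom|Gender=Fem|Number=Sing
    token without a morphological analysis ->  "_"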
Change-Id: Iccb044fd31f93c681de5036f37720304e6b5c2c1
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index 90ca237..e4259bf 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py
@@ -6,6 +6,31 @@
import my_utils.file_utils as fu
from germalemma import GermaLemma
+def format_morphological_features(token):
+    """
+    Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+    Args:
+        token: spaCy token object
+
+    Returns:
+        str: Morphological features formatted for the CoNLL-U FEATS (6th) column.
+             Returns "_" if no features are available.
+    """
+    if not hasattr(token, 'morph') or not token.morph:
+        return "_"
+
+    morph_dict = token.morph.to_dict()
+    if not morph_dict:
+        return "_"
+
+    # Format as a CoNLL-U FEATS string: Feature=Value|Feature2=Value2
+    features = []
+    for feature, value in sorted(morph_dict.items()):
+        features.append(f"{feature}={value}")
+
+    return "|".join(features)
+
class WhitespaceTokenizer(object):
    def __init__(self, vocab):
@@ -22,10 +47,11 @@
    # First lines are comments. (metadata)
    conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    for ix, token in enumerate(spacy_doc):
+        morph_features = format_morphological_features(token)
        if use_germalemma == "True":
-            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
        else:
-            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)
diff --git a/systems/parse_spacy3.py b/systems/parse_spacy3.py
index 14e5a9f..9696a9d 100644
--- a/systems/parse_spacy3.py
+++ b/systems/parse_spacy3.py
@@ -7,6 +7,31 @@
import my_utils.file_utils as fu
from germalemma import GermaLemma
+def format_morphological_features(token):
+    """
+    Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+    Args:
+        token: spaCy token object
+
+    Returns:
+        str: Morphological features formatted for the CoNLL-U FEATS (6th) column.
+             Returns "_" if no features are available.
+    """
+    if not hasattr(token, 'morph') or not token.morph:
+        return "_"
+
+    morph_dict = token.morph.to_dict()
+    if not morph_dict:
+        return "_"
+
+    # Format as a CoNLL-U FEATS string: Feature=Value|Feature2=Value2
+    features = []
+    for feature, value in sorted(morph_dict.items()):
+        features.append(f"{feature}={value}")
+
+    return "|".join(features)
+
@Language.factory("my_component")
class WhitespaceTokenizer(object):
@@ -24,10 +49,11 @@
    # First lines are comments. (metadata)
    conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    for ix, token in enumerate(spacy_doc):
+        morph_features = format_morphological_features(token)
        if use_germalemma == "True":
-            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
        else:
-            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index 0b66e4b..eeb1efa 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -7,6 +7,31 @@
import my_utils.file_utils as fu
from germalemma import GermaLemma
+def format_morphological_features(token):
+    """
+    Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+    Args:
+        token: spaCy token object
+
+    Returns:
+        str: Morphological features formatted for the CoNLL-U FEATS (6th) column.
+             Returns "_" if no features are available.
+    """
+    if not hasattr(token, 'morph') or not token.morph:
+        return "_"
+
+    morph_dict = token.morph.to_dict()
+    if not morph_dict:
+        return "_"
+
+    # Format as a CoNLL-U FEATS string: Feature=Value|Feature2=Value2
+    features = []
+    for feature, value in sorted(morph_dict.items()):
+        features.append(f"{feature}={value}")
+
+    return "|".join(features)
+
class WhitespaceTokenizer(object):
    def __init__(self, vocab):
@@ -23,10 +48,11 @@
    # First lines are comments. (metadata)
    conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    for ix, token in enumerate(spacy_doc):
+        morph_features = format_morphological_features(token)
        if use_germalemma == "True":
-            content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+            content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
        else:
-            content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+            content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)