Add morphological features
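
The new helper format_morphological_features() fills the FEATS (6th)
column of the CoNLL-U output with spaCy's morphological analysis instead
of the "_" placeholder. A sketch of the expected behaviour, with
illustrative feature values rather than output from an actual model run:

    token.morph                            ->  Case=Nom|Gender=Fem|Number=Sing
    FEATS column                           ->  Case=Nom|Gender=Fem|Number=Sing
    token without a morphological analysis ->  "_"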
Change-Id: Iccb044fd31f93c681de5036f37720304e6b5c2c1
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index 90ca237..e4259bf 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py
@@ -6,6 +6,31 @@
import my_utils.file_utils as fu
from germalemma import GermaLemma
+def format_morphological_features(token):
+    """
+    Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+    Args:
+        token: spaCy token object
+
+    Returns:
+        str: Morphological features formatted for the CoNLL-U FEATS (6th) column.
+             Returns "_" if no features are available.
+    """
+    if not hasattr(token, 'morph') or not token.morph:
+        return "_"
+
+    morph_dict = token.morph.to_dict()
+    if not morph_dict:
+        return "_"
+
+    # Format as a CoNLL-U FEATS string: Feature=Value|Feature2=Value2
+    features = []
+    for feature, value in sorted(morph_dict.items()):
+        features.append(f"{feature}={value}")
+
+    return "|".join(features)
+
class WhitespaceTokenizer(object):
    def __init__(self, vocab):
@@ -22,10 +47,11 @@
    # First lines are comments. (metadata)
    conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    for ix, token in enumerate(spacy_doc):
+        morph_features = format_morphological_features(token)
        if use_germalemma == "True":
-            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
        else:
-            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)
diff --git a/systems/parse_spacy3.py b/systems/parse_spacy3.py
index 14e5a9f..9696a9d 100644
--- a/systems/parse_spacy3.py
+++ b/systems/parse_spacy3.py
@@ -7,6 +7,31 @@
import my_utils.file_utils as fu
from germalemma import GermaLemma
+def format_morphological_features(token):
+    """
+    Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+    Args:
+        token: spaCy token object
+
+    Returns:
+        str: Morphological features formatted for the CoNLL-U FEATS (6th) column.
+             Returns "_" if no features are available.
+    """
+    if not hasattr(token, 'morph') or not token.morph:
+        return "_"
+
+    morph_dict = token.morph.to_dict()
+    if not morph_dict:
+        return "_"
+
+    # Format as a CoNLL-U FEATS string: Feature=Value|Feature2=Value2
+    features = []
+    for feature, value in sorted(morph_dict.items()):
+        features.append(f"{feature}={value}")
+
+    return "|".join(features)
+
@Language.factory("my_component")
class WhitespaceTokenizer(object):
@@ -24,10 +49,11 @@
    # First lines are comments. (metadata)
    conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    for ix, token in enumerate(spacy_doc):
+        morph_features = format_morphological_features(token)
        if use_germalemma == "True":
-            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+            content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
        else:
-            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+            content = (str(ix), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index 0b66e4b..eeb1efa 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -7,6 +7,31 @@
import my_utils.file_utils as fu
from germalemma import GermaLemma
+def format_morphological_features(token):
+    """
+    Extract and format morphological features from a spaCy token for CoNLL-U output.
+
+    Args:
+        token: spaCy token object
+
+    Returns:
+        str: Morphological features formatted for the CoNLL-U FEATS (6th) column.
+             Returns "_" if no features are available.
+    """
+    if not hasattr(token, 'morph') or not token.morph:
+        return "_"
+
+    morph_dict = token.morph.to_dict()
+    if not morph_dict:
+        return "_"
+
+    # Format as a CoNLL-U FEATS string: Feature=Value|Feature2=Value2
+    features = []
+    for feature, value in sorted(morph_dict.items()):
+        features.append(f"{feature}={value}")
+
+    return "|".join(features)
+
class WhitespaceTokenizer(object):
    def __init__(self, vocab):
@@ -23,10 +48,11 @@
    # First lines are comments. (metadata)
    conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    for ix, token in enumerate(spacy_doc):
+        morph_features = format_morphological_features(token)
        if use_germalemma == "True":
-            content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
+            content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
        else:
-            content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, "_", "_", "_", "_", "_") # Pure SpaCy!
+            content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)