Add dependency parses
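
Run the spaCy dependency parser and write its output to the HEAD and
DEPREL columns of the CoNLL-U output (DEPS and MISC remain "_").
Parsing can be switched off for faster processing, either with the new
--use_dependencies CLI flag or, in Docker, via the
SPACY_USE_DEPENDENCIES environment variable, e.g. (image name is
illustrative):

    docker run -e SPACY_USE_DEPENDENCIES=False conll-tagger ...
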
Change-Id: Ideaf4a03e4071634e9d55bcdacadf8f25fd1c98c
diff --git a/Dockerfile b/Dockerfile
index cde20de..aad4d5c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,10 @@
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV MAKEFLAGS="-j$(nproc)"
+# spaCy processing configuration
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="True"
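+# Both variables are read at runtime by systems/parse_spacy_pipe.py and can be
+# overridden per container, e.g. `docker run -e SPACY_USE_DEPENDENCIES=False ...`.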
+
# Set the working directory
COPY lib /app/lib
COPY requirements.txt /app/requirements.txt
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index eeb1efa..74b110b 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -33,6 +33,39 @@
return "|".join(features)
+def format_dependency_relations(doc):
+ """
+ Extract and format dependency relations from a spaCy doc for CoNLL-U output.
+
+ Args:
+ doc: spaCy Doc object
+
+ Returns:
+ list: List of tuples (head_id, deprel) for each token
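+
+    Example (hypothetical parse of "Der Hund schläft", where "schläft"
+    is the root): returns [(2, "nk"), (3, "sb"), (0, "ROOT")].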
+ """
+ dependencies = []
+    for token in doc:
+ # HEAD column: 1-based index of the head token (0 for root)
+ if token.dep_ == "ROOT":
+ head_id = 0
+ else:
+            # token.head.i is the 0-based index of the head within the doc
+            head_id = token.head.i + 1
+
+ # DEPREL column: dependency relation
+ deprel = token.dep_ if token.dep_ else "_"
+
+ dependencies.append((head_id, deprel))
+
+ return dependencies
+
+
class WhitespaceTokenizer(object):
def __init__(self, vocab):
self.vocab = vocab
@@ -44,15 +77,26 @@
return Doc(self.vocab, words=words, spaces=spaces)
-def get_conll_str(anno_obj, spacy_doc, use_germalemma):
+def get_conll_str(anno_obj, spacy_doc, use_germalemma, use_dependencies):
# First lines are comments. (metadata)
conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+
+ # Get dependency relations if enabled
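+    # (use_dependencies is a string flag, "True"/"False", mirroring use_germalemma)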
+ dependencies = format_dependency_relations(spacy_doc) if use_dependencies == "True" else None
+
for ix, token in enumerate(spacy_doc):
morph_features = format_morphological_features(token)
- if use_germalemma == "True":
- content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
+
+ # Get HEAD and DEPREL columns
+ if dependencies:
+ head_id, deprel = dependencies[ix]
else:
- content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
+ head_id, deprel = "_", "_"
+
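+    # DEPS and MISC stay "_": spaCy assigns exactly one head per token and
+    # emits no enhanced-dependency graph, so only HEAD/DEPREL are filled in.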
+ if use_germalemma == "True":
+ content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")
+ else:
+ content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_") # Pure SpaCy!
conll_lines.append("\t".join(content))
return "\n".join(conll_lines)
@@ -84,6 +128,7 @@
parser.add_argument("-sm", "--spacy_model", help="Spacy model containing the pipeline to tag", default="de_core_news_lg")
parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLLUP_Token")
parser.add_argument("-ugl", "--use_germalemma", help="Use Germalemma lemmatizer on top of SpaCy", default="True")
+    parser.add_argument("-udp", "--use_dependencies", help="Include dependency parsing (adds HEAD/DEPREL columns; set to False for faster processing)", default="True")
parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
args = parser.parse_args()
@@ -99,12 +144,31 @@
console_hdlr = logging.StreamHandler(sys.stderr)
file_hdlr = logging.FileHandler(filename=f"logs/Parse_{args.corpus_name}.SpaCy.log")
logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
+
+ # Override with environment variables if set (useful for Docker)
+ import os
+    env_udp = os.getenv("SPACY_USE_DEPENDENCIES")
+    if env_udp is not None:
+        args.use_dependencies = env_udp
+        logger.info(f"Using SPACY_USE_DEPENDENCIES environment variable: {args.use_dependencies}")
+
+    env_ugl = os.getenv("SPACY_USE_GERMALEMMA")
+    if env_ugl is not None:
+        args.use_germalemma = env_ugl
+        logger.info(f"Using SPACY_USE_GERMALEMMA environment variable: {args.use_germalemma}")
+
logger.info(f"Chunking {args.corpus_name} Corpus in chunks of {CHUNK_SIZE} Sentences")
# =====================================================================================
# POS TAG DOCUMENTS
# =====================================================================================
- spacy_de = spacy.load(args.spacy_model, disable=["ner", "parser"])
+ # Configure which components to disable based on dependency parsing option
+ disabled_components = ["ner"]
+ if args.use_dependencies != "True":
+ disabled_components.append("parser")
+ logger.info("Dependency parsing disabled for faster processing")
+ else:
+ logger.info("Dependency parsing enabled (slower but includes HEAD/DEPREL)")
+
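+    # NER stays disabled in both cases; it is not needed for the CoNLL-U columns.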
+ spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab)  # We won't re-tokenize, to respect how the source CoNLL files are tokenized!
lemmatizer = GermaLemma()
@@ -118,7 +182,7 @@
logger.info(f"Already processed {total_processed_sents} sentences...")
sents = [a.get_sentence() for a in annos]
for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
- conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma)
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
print(conll_str + "\n")
end = time.time()