Add dependency parses
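
Run the spaCy dependency parser and write its output to the HEAD and
DEPREL columns of the CoNLL-U output (DEPS and MISC remain "_").
Parsing can be switched off for faster processing, either with the new
--use_dependencies CLI flag or, in Docker, via the
SPACY_USE_DEPENDENCIES environment variable, e.g. (image name is
illustrative):

    docker run -e SPACY_USE_DEPENDENCIES=False conll-tagger ...
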
Change-Id: Ideaf4a03e4071634e9d55bcdacadf8f25fd1c98c
diff --git a/Dockerfile b/Dockerfile
index cde20de..aad4d5c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,10 @@
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV MAKEFLAGS="-j$(nproc)"
+# spaCy processing configuration
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="True"
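+# Both variables are read at runtime by systems/parse_spacy_pipe.py and can be
+# overridden per container, e.g. `docker run -e SPACY_USE_DEPENDENCIES=False ...`.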
+
# Set the working directory
COPY lib /app/lib
COPY requirements.txt /app/requirements.txt
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index eeb1efa..74b110b 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -33,6 +33,39 @@
return "|".join(features)
+def format_dependency_relations(doc):
+ """
+ Extract and format dependency relations from a spaCy doc for CoNLL-U output.
+
+ Args:
+ doc: spaCy Doc object
+
+ Returns:
+ list: List of tuples (head_id, deprel) for each token
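+
+    Example (hypothetical parse of "Der Hund schläft", where "schläft"
+    is the root): returns [(2, "nk"), (3, "sb"), (0, "ROOT")].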
+ """
+ dependencies = []
+    for token in doc:
+ # HEAD column: 1-based index of the head token (0 for root)
+ if token.dep_ == "ROOT":
+ head_id = 0
+ else:
+            # token.head.i is the 0-based index of the head within the doc
+            head_id = token.head.i + 1
+
+ # DEPREL column: dependency relation
+ deprel = token.dep_ if token.dep_ else "_"
+
+ dependencies.append((head_id, deprel))
+
+ return dependencies
+
+
class WhitespaceTokenizer(object):
def __init__(self, vocab):
self.vocab = vocab
@@ -44,15 +77,26 @@
return Doc(self.vocab, words=words, spaces=spaces)
-def get_conll_str(anno_obj, spacy_doc, use_germalemma):
+def get_conll_str(anno_obj, spacy_doc, use_germalemma, use_dependencies):
# First lines are comments. (metadata)
conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+
+ # Get dependency relations if enabled
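+    # (use_dependencies is a string flag, "True"/"False", mirroring use_germalemma)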
+ dependencies = format_dependency_relations(spacy_doc) if use_dependencies == "True" else None
+
for ix, token in enumerate(spacy_doc):
morph_features = format_morphological_features(token)
- if use_germalemma == "True":
- content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, "_", "_", "_", "_")
+
+ # Get HEAD and DEPREL columns
+ if dependencies:
+ head_id, deprel = dependencies[ix]
else:
- content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, "_", "_", "_", "_") # Pure SpaCy!
+ head_id, deprel = "_", "_"
+
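+    # DEPS and MISC stay "_": spaCy assigns exactly one head per token and
+    # emits no enhanced-dependency graph, so only HEAD/DEPREL are filled in.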
+ if use_germalemma == "True":
+ content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")
+ else:
+ content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_") # Pure SpaCy!
conll_lines.append("\t".join(content))
return "\n".join(conll_lines)
@@ -84,6 +128,7 @@
parser.add_argument("-sm", "--spacy_model", help="Spacy model containing the pipeline to tag", default="de_core_news_lg")
parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLLUP_Token")
parser.add_argument("-ugl", "--use_germalemma", help="Use Germalemma lemmatizer on top of SpaCy", default="True")
+    parser.add_argument("-udp", "--use_dependencies", help="Include dependency parsing (adds HEAD/DEPREL columns; set to False for faster processing)", default="True")
parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
args = parser.parse_args()
@@ -99,12 +144,31 @@
console_hdlr = logging.StreamHandler(sys.stderr)
file_hdlr = logging.FileHandler(filename=f"logs/Parse_{args.corpus_name}.SpaCy.log")
logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
+
+ # Override with environment variables if set (useful for Docker)
+ import os
+    env_udp = os.getenv("SPACY_USE_DEPENDENCIES")
+    if env_udp is not None:
+        args.use_dependencies = env_udp
+        logger.info(f"Using SPACY_USE_DEPENDENCIES environment variable: {args.use_dependencies}")
+
+    env_ugl = os.getenv("SPACY_USE_GERMALEMMA")
+    if env_ugl is not None:
+        args.use_germalemma = env_ugl
+        logger.info(f"Using SPACY_USE_GERMALEMMA environment variable: {args.use_germalemma}")
+
logger.info(f"Chunking {args.corpus_name} Corpus in chunks of {CHUNK_SIZE} Sentences")
# =====================================================================================
# POS TAG DOCUMENTS
# =====================================================================================
- spacy_de = spacy.load(args.spacy_model, disable=["ner", "parser"])
+ # Configure which components to disable based on dependency parsing option
+ disabled_components = ["ner"]
+ if args.use_dependencies != "True":
+ disabled_components.append("parser")
+ logger.info("Dependency parsing disabled for faster processing")
+ else:
+ logger.info("Dependency parsing enabled (slower but includes HEAD/DEPREL)")
+
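+    # NER stays disabled in both cases; it is not needed for the CoNLL-U columns.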
+ spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab)  # We won't re-tokenize, to respect how the source CoNLL files are tokenized!
lemmatizer = GermaLemma()
@@ -118,7 +182,7 @@
logger.info(f"Already processed {total_processed_sents} sentences...")
sents = [a.get_sentence() for a in annos]
for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
- conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma)
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
print(conll_str + "\n")
end = time.time()