Add safety limits for dependency parsing

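Long or pathological sentences can cause spaCy's dependency parser to
hang. Guard each parse with a SIGALRM-based timeout and a token-count
cap, falling back to processing without the "parser" and "ner"
components when either limit is hit. Affected sentences are logged as
warnings and tallied at the end of the run.

The limits default to 30 seconds and 500 tokens and can be overridden
through the SPACY_PARSE_TIMEOUT and SPACY_MAX_SENTENCE_LENGTH
environment variables, e.g. (remaining flags elided; they depend on
the script's argparse setup):

    SPACY_PARSE_TIMEOUT=10 SPACY_MAX_SENTENCE_LENGTH=200 \
        python systems/parse_spacy_pipe.py ... < input.conllu
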
Change-Id: If901f9d7f03bc7926a812dfa55bf18a58149e583
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index d2bf1b1..ac27058 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py
@@ -1,11 +1,60 @@
-import argparse
+import argparse, os
 import spacy
 from spacy.tokens import Doc
-import logging, sys, time
+import logging, sys, time, signal
 from lib.CoNLL_Annotation import get_token_type
 import my_utils.file_utils as fu
 from germalemma import GermaLemma
 
+# Dependency parsing safety limits
+DEFAULT_PARSE_TIMEOUT = 30  # seconds per sentence
+DEFAULT_MAX_SENTENCE_LENGTH = 500  # tokens
+
+class TimeoutException(Exception):
+	pass
+
+def timeout_handler(signum, frame):
+	raise TimeoutException("Dependency parsing timeout")
+
+def safe_dependency_parse(spacy_model, text, timeout=DEFAULT_PARSE_TIMEOUT, max_length=DEFAULT_MAX_SENTENCE_LENGTH):
+	"""
+	Safely parse a sentence with timeout and length limits.
+	
+	Args:
+		spacy_model: Loaded spaCy model
+		text: Text to parse
+		timeout: Maximum seconds to wait for parsing
+		max_length: Maximum sentence length in tokens
+		
+	Returns:
+		tuple: (spacy_doc, success, warning_message)
+	"""
+	# Check sentence length
+	n_tokens = len(text.split())
+	if n_tokens > max_length:
+		# Process without dependency parsing for long sentences
+		doc = spacy_model(text, disable=["ner", "parser"])
+		return doc, False, f"Sentence too long ({n_tokens} tokens > {max_length}), dependency parsing skipped"
+	
+	# Set up the timeout (SIGALRM-based, so Unix-only and main-thread-only)
+	old_handler = signal.signal(signal.SIGALRM, timeout_handler)
+	signal.alarm(timeout)
+	
+	try:
+		doc = spacy_model(text)
+		return doc, True, None
+	except TimeoutException:
+		failure = f"timeout after {timeout}s"
+	except Exception as e:
+		failure = f"error: {e}"
+	finally:
+		signal.alarm(0)  # Cancel any pending alarm
+		signal.signal(signal.SIGALRM, old_handler)  # Restore the previous handler
+	
+	# Retry without dependency parsing
+	doc = spacy_model(text, disable=["ner", "parser"])
+	return doc, False, f"Dependency parsing {failure}, processed without dependencies"
+
 def format_morphological_features(token):
 	"""
 	Extract and format morphological features from a spaCy token for CoNLL-U output.
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index a4f1693..95fb898 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -1,12 +1,61 @@
 from sys import stdin
-import argparse
+import argparse, os
 import spacy
 from spacy.tokens import Doc
-import logging, sys, time
+import logging, sys, time, signal
 from lib.CoNLL_Annotation import get_token_type
 import my_utils.file_utils as fu
 from germalemma import GermaLemma
 
+# Dependency parsing safety limits
+DEFAULT_PARSE_TIMEOUT = 30  # seconds per sentence
+DEFAULT_MAX_SENTENCE_LENGTH = 500  # tokens
+
+class TimeoutException(Exception):
+	pass
+
+def timeout_handler(signum, frame):
+	raise TimeoutException("Dependency parsing timeout")
+
+def safe_dependency_parse(spacy_model, text, timeout=DEFAULT_PARSE_TIMEOUT, max_length=DEFAULT_MAX_SENTENCE_LENGTH):
+	"""
+	Safely parse a sentence with timeout and length limits.
+	
+	Args:
+		spacy_model: Loaded spaCy model
+		text: Text to parse
+		timeout: Maximum seconds to wait for parsing
+		max_length: Maximum sentence length in tokens
+		
+	Returns:
+		tuple: (spacy_doc, success, warning_message)
+	"""
+	# Check sentence length
+	n_tokens = len(text.split())
+	if n_tokens > max_length:
+		# Process without dependency parsing for long sentences
+		doc = spacy_model(text, disable=["ner", "parser"])
+		return doc, False, f"Sentence too long ({n_tokens} tokens > {max_length}), dependency parsing skipped"
+	
+	# Set up the timeout (SIGALRM-based, so Unix-only and main-thread-only)
+	old_handler = signal.signal(signal.SIGALRM, timeout_handler)
+	signal.alarm(timeout)
+	
+	try:
+		doc = spacy_model(text)
+		return doc, True, None
+	except TimeoutException:
+		failure = f"timeout after {timeout}s"
+	except Exception as e:
+		failure = f"error: {e}"
+	finally:
+		signal.alarm(0)  # Cancel any pending alarm
+		signal.signal(signal.SIGALRM, old_handler)  # Restore the previous handler
+	
+	# Retry without dependency parsing
+	doc = spacy_model(text, disable=["ner", "parser"])
+	return doc, False, f"Dependency parsing {failure}, processed without dependencies"
+
 def format_morphological_features(token):
 	"""
 	Extract and format morphological features from a spaCy token for CoNLL-U output.
@@ -182,8 +231,15 @@
 	except AttributeError:
 		logger.info("GermaLemma version: unknown (no __version__ attribute)")
 	
+	# Read the parse timeout and sentence-length limit from environment variables
+	parse_timeout = int(os.getenv("SPACY_PARSE_TIMEOUT", DEFAULT_PARSE_TIMEOUT))
+	max_sentence_length = int(os.getenv("SPACY_MAX_SENTENCE_LENGTH", DEFAULT_MAX_SENTENCE_LENGTH))
+	
+	logger.info(f"Dependency parsing limits: timeout={parse_timeout}s, max_length={max_sentence_length} tokens")
+	
 	start = time.time()
 	total_processed_sents = 0
+	dependency_warnings = 0
 	
 	while file_has_next:
 		annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
@@ -191,10 +247,29 @@
 		total_processed_sents += len(annos)
 		logger.info(f"Already processed {total_processed_sents} sentences...")
 		sents = [a.get_sentence() for a in annos]
-		for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
-			conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
-			print(conll_str+ "\n")
+		
+		# Process sentences individually when dependency parsing is enabled for timeout protection
+		if args.use_dependencies == "True":
+			for ix, sent in enumerate(sents):
+				doc, dependency_success, warning = safe_dependency_parse(
+					spacy_de, sent, timeout=parse_timeout, max_length=max_sentence_length
+				)
+				if warning:
+					dependency_warnings += 1
+					logger.warning(f"Sentence {total_processed_sents - len(sents) + ix + 1}: {warning}")
+				
+				# Override use_dependencies based on actual parsing success
+				actual_use_dependencies = dependency_success
+				conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=actual_use_dependencies)
+				print(conll_str + "\n")
+		else:
+			# Batch processing is faster when dependency parsing is disabled
+			for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
+				conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+				print(conll_str + "\n")
 			
 	end = time.time()
 	logger.info(f"Processing {args.corpus_name} took {(end - start)} seconds!")
+	if dependency_warnings > 0:
+		logger.info(f"Dependency parsing warnings: {dependency_warnings} sentences processed without dependencies")
 			
\ No newline at end of file