Add per-sentence timeout and max-length safety limits for dependency parsing
Change-Id: If901f9d7f03bc7926a812dfa55bf18a58149e583
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index a4f1693..95fb898 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -1,12 +1,66 @@
from sys import stdin
-import argparse
+import argparse, os
import spacy
from spacy.tokens import Doc
-import logging, sys, time
+import logging, sys, time, signal
from lib.CoNLL_Annotation import get_token_type
import my_utils.file_utils as fu
from germalemma import GermaLemma
# ---------------------------------------------------------------------------
# Dependency parsing safety limits
# ---------------------------------------------------------------------------
# Defaults; main() overrides them via the SPACY_PARSE_TIMEOUT and
# SPACY_MAX_SENTENCE_LENGTH environment variables.
DEFAULT_PARSE_TIMEOUT = 30  # seconds allowed per sentence
DEFAULT_MAX_SENTENCE_LENGTH = 500  # whitespace-separated tokens

class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a parse exceeds the time budget."""

def timeout_handler(signum, frame):
    """SIGALRM handler: abort the in-progress parse by raising TimeoutException."""
    raise TimeoutException("Dependency parsing timeout")

def safe_dependency_parse(spacy_model, text, timeout=DEFAULT_PARSE_TIMEOUT, max_length=DEFAULT_MAX_SENTENCE_LENGTH):
    """
    Parse *text* with *spacy_model*, guarding against pathological sentences.

    Two guards are applied:
      1. Sentences longer than *max_length* whitespace tokens are processed
         with the "parser" and "ner" pipeline components disabled.
      2. A SIGALRM-based timeout of *timeout* seconds around the full
         pipeline; on timeout — or any other parsing error (deliberately
         broad, best-effort) — the sentence is re-processed without the
         parser/NER components.

    NOTE(review): SIGALRM is Unix-only and only works in the main thread,
    and it cannot interrupt C-level code that never returns control to the
    interpreter — TODO confirm this is acceptable for deployment targets.

    Args:
        spacy_model: Loaded spaCy model (called as ``spacy_model(text, ...)``).
        text: Sentence text to parse.
        timeout: Maximum whole seconds to wait for the full pipeline.
        max_length: Maximum sentence length in whitespace tokens.

    Returns:
        tuple: ``(doc, success, warning_message)`` where *success* is True
        only when the full pipeline (including the dependency parser) ran,
        and *warning_message* is None on success.
    """
    fallback_components = ["ner", "parser"]

    # Guard 1: skip dependency parsing outright for over-long sentences.
    n_tokens = len(text.split())  # computed once (original split the text twice)
    if n_tokens > max_length:
        doc = spacy_model(text, disable=fallback_components)
        return doc, False, f"Sentence too long ({n_tokens} tokens > {max_length}), dependency parsing skipped"

    # Guard 2: alarm-based timeout. The finally-block guarantees the alarm is
    # cancelled and the previous handler restored on *every* path, and it runs
    # before the fallback re-parse below — so a still-pending alarm from the
    # generic-exception branch can never fire inside the fallback. (The
    # original duplicated this cleanup in all four branches by hand.)
    warning = None
    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout)
    try:
        doc = spacy_model(text)
        return doc, True, None
    except TimeoutException:
        warning = f"Dependency parsing timeout after {timeout}s, processed without dependencies"
    except Exception as e:  # deliberate best-effort: any parse failure falls back
        warning = f"Dependency parsing error: {str(e)}, processed without dependencies"
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)

    # Fallback: re-run the pipeline without the dependency parser (and NER).
    doc = spacy_model(text, disable=fallback_components)
    return doc, False, warning
+
def format_morphological_features(token):
"""
Extract and format morphological features from a spaCy token for CoNLL-U output.
@@ -182,8 +236,15 @@
except AttributeError:
logger.info("GermaLemma version: unknown (no __version__ attribute)")
+ # Parse timeout and sentence length limits from environment variables
+ parse_timeout = int(os.getenv("SPACY_PARSE_TIMEOUT", DEFAULT_PARSE_TIMEOUT))
+ max_sentence_length = int(os.getenv("SPACY_MAX_SENTENCE_LENGTH", DEFAULT_MAX_SENTENCE_LENGTH))
+
+ logger.info(f"Dependency parsing limits: timeout={parse_timeout}s, max_length={max_sentence_length} tokens")
+
start = time.time()
total_processed_sents = 0
+ dependency_warnings = 0
while file_has_next:
annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
@@ -191,10 +252,29 @@
total_processed_sents += len(annos)
logger.info(f"Already processed {total_processed_sents} sentences...")
sents = [a.get_sentence() for a in annos]
- for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
- conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
- print(conll_str+ "\n")
+
        # Process sentences individually when dependency parsing is enabled,
        # so each sentence gets its own timeout / length guard; batch pipe()
        # cannot be interrupted per sentence.
        if args.use_dependencies == "True":
            for ix, sent in enumerate(sents):
                doc, dependency_success, warning = safe_dependency_parse(
                    spacy_de, sent, timeout=parse_timeout, max_length=max_sentence_length
                )
                if warning:
                    dependency_warnings += 1
                    # Report the 1-based global sentence index of the affected sentence.
                    logger.warning(f"Sentence {total_processed_sents - len(sents) + ix + 1}: {warning}")

                # Override use_dependencies based on actual parsing success.
                # NOTE(review): dependency_success is a *bool*, but this flag is
                # compared as the string "True" above and passed through as a
                # string in the else-branch below — confirm get_conll_str treats
                # a bool True the same as the string "True", otherwise
                # dependencies are silently dropped for every successful parse.
                actual_use_dependencies = dependency_success
                conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=actual_use_dependencies)
                print(conll_str+ "\n")
        else:
            # Use batch processing for faster processing when dependencies are disabled
            for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
                conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
                print(conll_str+ "\n")
end = time.time()
logger.info(f"Processing {args.corpus_name} took {(end - start)} seconds!")
+ if dependency_warnings > 0:
+ logger.info(f"Dependency parsing warnings: {dependency_warnings} sentences processed without dependencies")
\ No newline at end of file