Add safety limits for dependency parsing
Change-Id: If901f9d7f03bc7926a812dfa55bf18a58149e583
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index d2bf1b1..ac27058 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py
@@ -1,11 +1,65 @@
-import argparse
+import argparse, os
import spacy
from spacy.tokens import Doc
-import logging, sys, time
+import logging, sys, time, signal
from lib.CoNLL_Annotation import get_token_type
import my_utils.file_utils as fu
from germalemma import GermaLemma
+# Safety limits for dependency parsing; safe_dependency_parse uses these as its defaults.
+DEFAULT_PARSE_TIMEOUT = 30  # seconds allowed for one full parse before retrying without the parser
+DEFAULT_MAX_SENTENCE_LENGTH = 500  # whitespace-separated tokens; longer texts skip the parser
+
+class TimeoutException(Exception):
+    """Raised by timeout_handler when the SIGALRM timer set for a parse expires."""
+
+def timeout_handler(signum, frame):  # installed as the SIGALRM handler by safe_dependency_parse
+    raise TimeoutException("Dependency parsing timeout")  # aborts the parse that is still running
+
+def safe_dependency_parse(spacy_model, text, timeout=DEFAULT_PARSE_TIMEOUT, max_length=DEFAULT_MAX_SENTENCE_LENGTH):
+    """
+    Parse text with spaCy, retrying without "ner"/"parser" when a limit is hit.
+
+    The timeout relies on SIGALRM, which the signal module documents as
+    Unix-only and usable from the main thread only.
+
+    Args:
+        spacy_model: Loaded spaCy model, called as spacy_model(text, disable=...)
+        text: Text to parse
+        timeout: Maximum whole seconds to wait for parsing (0 schedules no alarm)
+        max_length: Maximum number of whitespace-separated tokens to fully parse
+
+    Returns:
+        tuple: (spacy_doc, success, warning_message); success is False and
+        warning_message is a string whenever the fallback parse was used
+    """
+    def parse_without_dependencies():
+        # Single fallback for every failure path: rerun with the parser and NER off.
+        return spacy_model(text, disable=["ner", "parser"])
+
+    # Count once; whitespace splitting is only a cheap stand-in for real tokenization.
+    token_count = len(text.split())
+    if token_count > max_length:
+        return parse_without_dependencies(), False, f"Sentence too long ({token_count} tokens > {max_length}), dependency parsing skipped"
+
+    failure = None
+    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
+    try:
+        signal.alarm(timeout)  # outside the inner try: a bad timeout value must still propagate
+        try:
+            doc = spacy_model(text)
+        except Exception as exc:  # TimeoutException is an Exception subclass and lands here too
+            failure = exc
+    finally:
+        # Always disarm and restore (even on KeyboardInterrupt) so no stale alarm fires later.
+        signal.alarm(0)
+        signal.signal(signal.SIGALRM, old_handler)
+    if failure is None:
+        return doc, True, None
+    if isinstance(failure, TimeoutException):
+        return parse_without_dependencies(), False, f"Dependency parsing timeout after {timeout}s, processed without dependencies"
+    return parse_without_dependencies(), False, f"Dependency parsing error: {failure}, processed without dependencies"
+
def format_morphological_features(token):
"""
Extract and format morphological features from a spaCy token for CoNLL-U output.