Default SPACY_N_PROCESS to 1 and add per-sentence fallback to stabilize batch parsing
Change-Id: Iabbbca52e6c30a7b989d23fde4e320e13900af0d
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index b532456..f6dd7fa 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -126,6 +126,11 @@
def __call__(self, text):
words = text.split(' ')
+ # Filter out empty strings to avoid spaCy errors
+ words = [w for w in words if w]
+ # Handle edge case of empty input
+ if not words:
+ words = ['']
# All tokens 'own' a subsequent space character in this tokenizer
spaces = [True] * len(words)
return Doc(self.vocab, words=words, spaces=spaces)
@@ -189,7 +194,7 @@
file_has_next, chunk_ix = True, 0
CHUNK_SIZE = int(os.getenv("SPACY_CHUNK_SIZE", "20000"))
SPACY_BATCH = int(os.getenv("SPACY_BATCH_SIZE", "2000"))
- SPACY_PROC = int(os.getenv("SPACY_N_PROCESS", "10"))
+ SPACY_PROC = int(os.getenv("SPACY_N_PROCESS", "1"))
# =====================================================================================
# LOGGING INFO ...
@@ -231,6 +236,10 @@
spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
+
+ # Increase max_length to handle very long sentences (especially when parser is disabled)
+ spacy_de.max_length = 10000000 # 10M characters
+
lemmatizer = GermaLemma()
# Log version information
@@ -283,9 +292,26 @@
print(conll_str+ "\n")
else:
# Use batch processing for faster processing when dependencies are disabled
- for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
- conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
- print(conll_str+ "\n")
+ # Use n_process=1 to avoid multiprocessing deadlocks and memory issues with large files
+ try:
+ for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=1)):
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as e:
+ logger.error(f"Batch processing failed: {str(e)}")
+ logger.info("Falling back to individual sentence processing...")
+ # Fallback: process sentences individually
+ for ix, sent in enumerate(sents):
+ try:
+ doc = spacy_de(sent)
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as sent_error:
+ logger.error(f"Failed to process sentence {total_processed_sents - len(sents) + ix + 1}: {str(sent_error)}")
+ logger.error(f"Sentence preview: {sent[:100]}...")
+ # Output a placeholder to maintain alignment
+ conll_str = get_conll_str(annos[ix], spacy_de("ERROR"), use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
end = time.time()
total_time = end - start