Default SPACY_N_PROCESS to 1 and add per-sentence fallback to stabilize batch parsing
Change-Id: Iabbbca52e6c30a7b989d23fde4e320e13900af0d
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index b532456..f6dd7fa 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -126,6 +126,11 @@
def __call__(self, text):
words = text.split(' ')
+ # Filter out empty strings to avoid spaCy errors
+ words = [w for w in words if w]
+ # Handle edge case of empty input
+ if not words:
+ words = ['']
# All tokens 'own' a subsequent space character in this tokenizer
spaces = [True] * len(words)
return Doc(self.vocab, words=words, spaces=spaces)
@@ -189,7 +194,7 @@
file_has_next, chunk_ix = True, 0
CHUNK_SIZE = int(os.getenv("SPACY_CHUNK_SIZE", "20000"))
SPACY_BATCH = int(os.getenv("SPACY_BATCH_SIZE", "2000"))
- SPACY_PROC = int(os.getenv("SPACY_N_PROCESS", "10"))
+ SPACY_PROC = int(os.getenv("SPACY_N_PROCESS", "1"))
# =====================================================================================
# LOGGING INFO ...
@@ -231,6 +236,10 @@
spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
+
+ # Increase max_length to handle very long sentences (especially when parser is disabled)
+ spacy_de.max_length = 10000000 # 10M characters
+
lemmatizer = GermaLemma()
# Log version information
@@ -283,9 +292,26 @@
print(conll_str+ "\n")
else:
# Use batch processing for faster processing when dependencies are disabled
- for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
- conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
- print(conll_str+ "\n")
+ # Use n_process=1 to avoid multiprocessing deadlocks and memory issues with large files
+ try:
+ for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=1)):
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as e:
+ logger.error(f"Batch processing failed: {str(e)}")
+ logger.info("Falling back to individual sentence processing...")
+ # Fallback: process sentences individually
+ for ix, sent in enumerate(sents):
+ try:
+ doc = spacy_de(sent)
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
+ except Exception as sent_error:
+ logger.error(f"Failed to process sentence {total_processed_sents - len(sents) + ix + 1}: {str(sent_error)}")
+ logger.error(f"Sentence preview: {sent[:100]}...")
+ # Output a placeholder to maintain alignment
+ conll_str = get_conll_str(annos[ix], spacy_de("ERROR"), use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
+ print(conll_str+ "\n")
end = time.time()
total_time = end - start