from sys import stdin
import argparse, os
import spacy
from spacy.tokens import Doc
import logging, sys, time, signal
from lib.CoNLL_Annotation import get_token_type
import my_utils.file_utils as fu
from germalemma import GermaLemma

# Dependency parsing safety limits (overridable via environment variables below)
DEFAULT_PARSE_TIMEOUT = 30  # seconds per sentence
DEFAULT_MAX_SENTENCE_LENGTH = 500  # tokens

class TimeoutException(Exception):
    pass

# SIGALRM-based timeout: POSIX-only, and handlers can only be installed from the main thread
def timeout_handler(signum, frame):
    raise TimeoutException("Dependency parsing timeout")

def safe_dependency_parse(spacy_model, text, timeout=DEFAULT_PARSE_TIMEOUT, max_length=DEFAULT_MAX_SENTENCE_LENGTH):
    """
    Safely parse a sentence with timeout and length limits.

    Args:
        spacy_model: Loaded spaCy model
        text: Text to parse
        timeout: Maximum seconds to wait for parsing
        max_length: Maximum sentence length in tokens

    Returns:
        tuple: (spacy_doc, success, warning_message)
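
    Example (illustrative; `nlp` is a loaded spaCy model):
        doc, ok, warning = safe_dependency_parse(nlp, "Ein kurzer Satz .")
        # ok is False and warning is set whenever the parser was skipped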
    """
    # Skip dependency parsing outright for overlong sentences
    n_tokens = len(text.split())
    if n_tokens > max_length:
        doc = spacy_model(text, disable=["ner", "parser"])
        return doc, False, f"Sentence too long ({n_tokens} tokens > {max_length}), dependency parsing skipped"

    # Arm the SIGALRM timeout so a pathological sentence cannot hang the parser
    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout)

    try:
        doc = spacy_model(text)
        return doc, True, None
    except TimeoutException:
        warning = f"Dependency parsing timeout after {timeout}s, processed without dependencies"
    except Exception as e:
        warning = f"Dependency parsing error: {e}, processed without dependencies"
    finally:
        signal.alarm(0)                             # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)  # restore the previous handler

    # Retry without dependency parsing
    doc = spacy_model(text, disable=["ner", "parser"])
    return doc, False, warning

def format_morphological_features(token):
    """
    Extract and format morphological features from a spaCy token for CoNLL-U output.

    Args:
        token: spaCy token object

    Returns:
        str: Formatted morphological features string for the CoNLL-U FEATS column (6th)
             Returns "_" if no features are available
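
    Example (illustrative):
        a token whose morph is Case=Nom|Number=Sing yields "Case=Nom|Number=Sing";
        a token without morphological features yields "_"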
    """
    if not hasattr(token, 'morph') or not token.morph:
        return "_"

    morph_dict = token.morph.to_dict()
    if not morph_dict:
        return "_"

    # CoNLL-U FEATS syntax: Feature=Value|Feature2=Value2, sorted by feature name
    features = []
    for feature, value in sorted(morph_dict.items()):
        features.append(f"{feature}={value}")

    return "|".join(features)


def format_dependency_relations(doc):
    """
    Extract and format dependency relations from a spaCy doc for CoNLL-U output.

    Args:
        doc: spaCy Doc object

    Returns:
        list: List of tuples (head_id, deprel) for each token
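
    Example (illustrative, labels depend on the loaded model):
        for a parsed "Das ist gut", a possible result is
        [(2, "sb"), (0, "ROOT"), (2, "pd")]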
    """
    dependencies = []
    for token in doc:
        # HEAD column: 1-based index of the head token (0 for the root)
        if token.dep_ == "ROOT":
            head_id = 0
        else:
            head_id = token.head.i + 1  # Token.i is the 0-based index within the doc

        # DEPREL column: dependency relation
        deprel = token.dep_ if token.dep_ else "_"
        dependencies.append((head_id, deprel))

    return dependencies


class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
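
# Illustrative: WhitespaceTokenizer(nlp.vocab)("Ein Test .") yields a 3-token Doc,
# so the tagger sees exactly the tokenization of the pre-tokenized CoNLL input.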


def get_conll_str(anno_obj, spacy_doc, use_germalemma, use_dependencies):
    # First lines are comments (sentence metadata)
    conll_lines = anno_obj.metadata  # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]

    # Get dependency relations if enabled; use_dependencies may arrive as the CLI
    # string "True" or as a bool from the per-sentence safety wrapper below
    dependencies = format_dependency_relations(spacy_doc) if use_dependencies in (True, "True") else None

    for ix, token in enumerate(spacy_doc):
        morph_features = format_morphological_features(token)

        # Get HEAD and DEPREL columns
        if dependencies:
            head_id, deprel = dependencies[ix]
        else:
            head_id, deprel = "_", "_"

        if use_germalemma == "True":
            content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")
        else:
            content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")  # Pure spaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)
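
# Illustrative CoNLL-U row as produced above (tab-separated; values are examples):
#   1   Häuser   Haus   NOUN   NN   Case=Nom|Number=Plur   2   sb   _   _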


def find_germalemma(word, pos, spacy_lemma):
    simplify_pos = {"ADJA":"ADJ", "ADJD":"ADJ",
                    "NA":"N", "NE":"N", "NN":"N",
                    "ADV":"ADV", "PAV":"ADV", "PROAV":"ADV", "PAVREL":"ADV", "PWAV":"ADV", "PWAVREL":"ADV",
                    "VAFIN":"V", "VAIMP":"V", "VAINF":"V", "VAPP":"V", "VMFIN":"V", "VMINF":"V",
                    "VMPP":"V", "VVFIN":"V", "VVIMP":"V", "VVINF":"V", "VVIZU":"V", "VVPP":"V"
                    }
    # simplify_pos = {"VERB": "V", "ADV": "ADV", "ADJ": "ADJ", "NOUN":"N", "PROPN": "N"}
    try:
        # `lemmatizer` is the module-level GermaLemma instance created in __main__
        return lemmatizer.find_lemma(word, simplify_pos.get(pos, "UNK"))
    except Exception:
        # Fall back to spaCy's lemma if GermaLemma cannot lemmatize the word
        return spacy_lemma
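
# Illustrative: find_germalemma("ging", "VVFIN", "gehen") maps VVFIN -> "V" and
# asks GermaLemma for the lemma, falling back to the given spaCy lemma on any error.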


if __name__ == "__main__":
    """
    --- Example Real Data TEST ---

    cat /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/zca18.conllu | python systems/parse_spacy_pipe.py \
        --corpus_name DeReKo_zca18 --comment_str "#" > output_zca18.conll
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
    parser.add_argument("-sm", "--spacy_model", help="spaCy model containing the pipeline to tag", default="de_core_news_lg")
    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLLUP_Token")
    parser.add_argument("-ugl", "--use_germalemma", help="Use GermaLemma lemmatizer on top of spaCy", default="True")
    parser.add_argument("-udp", "--use_dependencies", help="Include dependency parsing (adds HEAD/DEPREL columns; set to False for faster processing)", default="True")
    parser.add_argument("-c", "--comment_str", help="Prefix of comment lines inside the CoNLL file", default="#")
    args = parser.parse_args()

    file_has_next, chunk_ix = True, 0
    CHUNK_SIZE = 20000   # sentences per chunk read from stdin
    SPACY_BATCH = 2000   # batch size for spacy_de.pipe()
    SPACY_PROC = 10      # worker processes for spacy_de.pipe()

    # =====================================================================================
    #                    LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stderr)
    os.makedirs("logs", exist_ok=True)  # FileHandler below fails if the directory is missing
    file_hdlr = logging.FileHandler(filename=f"logs/Parse_{args.corpus_name}.SpaCy.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])

    # Override with environment variables if set (useful for Docker);
    # `os` is already imported at the top of the module
    if os.getenv("SPACY_USE_DEPENDENCIES") is not None:
        args.use_dependencies = os.getenv("SPACY_USE_DEPENDENCIES")
        logger.info(f"Using SPACY_USE_DEPENDENCIES environment variable: {args.use_dependencies}")

    if os.getenv("SPACY_USE_GERMALEMMA") is not None:
        args.use_germalemma = os.getenv("SPACY_USE_GERMALEMMA")
        logger.info(f"Using SPACY_USE_GERMALEMMA environment variable: {args.use_germalemma}")
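
    # Illustrative Docker invocation (image name is assumed, not defined here):
    #   docker run -i -e SPACY_USE_DEPENDENCIES=False some/spacy-pipe-image \
    #       < input.conllu > output.conll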

    logger.info(f"Chunking {args.corpus_name} Corpus in chunks of {CHUNK_SIZE} Sentences")

    # =====================================================================================
    #                       POS TAG DOCUMENTS
    # =====================================================================================
    # Configure which components to disable based on the dependency parsing option
    disabled_components = ["ner"]
    if args.use_dependencies != "True":
        disabled_components.append("parser")
        logger.info("Dependency parsing disabled for faster processing")
    else:
        logger.info("Dependency parsing enabled (slower but includes HEAD/DEPREL)")

    spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
    spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab)  # We don't re-tokenize, to respect the tokenization of the source CoNLL files!
    lemmatizer = GermaLemma()

    # Log version information
    logger.info(f"spaCy version: {spacy.__version__}")
    logger.info(f"spaCy model: {args.spacy_model}")
    logger.info(f"spaCy model version: {spacy_de.meta.get('version', 'unknown')}")
    try:
        import germalemma
        logger.info(f"GermaLemma version: {germalemma.__version__}")
    except AttributeError:
        logger.info("GermaLemma version: unknown (no __version__ attribute)")

    # Parse timeout and sentence length limits from environment variables
    parse_timeout = int(os.getenv("SPACY_PARSE_TIMEOUT", DEFAULT_PARSE_TIMEOUT))
    max_sentence_length = int(os.getenv("SPACY_MAX_SENTENCE_LENGTH", DEFAULT_MAX_SENTENCE_LENGTH))

    logger.info(f"Dependency parsing limits: timeout={parse_timeout}s, max_length={max_sentence_length} tokens")
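
    # Illustrative override (shell):
    #   SPACY_PARSE_TIMEOUT=60 SPACY_MAX_SENTENCE_LENGTH=1000 \
    #       python systems/parse_spacy_pipe.py --corpus_name MyCorpus < in.conllu > out.conll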
| 244 | |
| daza | 48606ba | 2021-02-10 14:16:41 +0100 | [diff] [blame] | 245 | start = time.time() |
| 246 | total_processed_sents = 0 |
| Marc Kupietz | 095185b | 2025-10-27 14:41:43 +0100 | [diff] [blame^] | 247 | dependency_warnings = 0 |
| daza | 48606ba | 2021-02-10 14:16:41 +0100 | [diff] [blame] | 248 | |
    while file_has_next:
        annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
        if len(annos) == 0: break
        total_processed_sents += len(annos)
        logger.info(f"Already processed {total_processed_sents} sentences...")
        sents = [a.get_sentence() for a in annos]

        # Process sentences individually when dependency parsing is enabled, so a
        # single pathological sentence cannot stall the whole chunk
        if args.use_dependencies == "True":
            for ix, sent in enumerate(sents):
                doc, dependency_success, warning = safe_dependency_parse(
                    spacy_de, sent, timeout=parse_timeout, max_length=max_sentence_length
                )
                if warning:
                    dependency_warnings += 1
                    logger.warning(f"Sentence {total_processed_sents - len(sents) + ix + 1}: {warning}")

                # Pass the actual parsing success (a bool) so failed sentences get "_" columns
                conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=dependency_success)
                print(conll_str + "\n")
        else:
            # Use batch processing for speed when dependencies are disabled
            for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
                conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
                print(conll_str + "\n")

    end = time.time()
    logger.info(f"Processing {args.corpus_name} took {(end - start)} seconds!")
    if dependency_warnings > 0:
        logger.info(f"Dependency parsing warnings: {dependency_warnings} sentences processed without dependencies")
