| margaretha | 4d8f8e2 | 2026-03-19 11:58:29 +0100 | [diff] [blame^] | 1 | #!/usr/bin/env python3 |
| 2 | """ |
| 3 | @author margaretha with AI assistance |
| 4 | |
| 5 | |
| 6 | import-c2-project-vc.py – Parse cdef.all.txt and create project VCs in Kustvakt. |
| 7 | |
| 8 | Input file format (tab-separated, one entry per line): |
| 9 | [vcDescription]\tLOAD('[vcName]')\tG_[groupName] |
| 10 | |
| 11 | For each line the script: |
| 12 | 1. Creates (or updates) a PROJECT virtual corpus named <groupName>-VC |
| 13 | owned by --user, whose corpus query ORs together corpusSigle="<sigle>" for every sigle listed in LOAD('...') |
| 14 | 2. Shares that VC with the user group <groupName> |
| 15 | |
| 16 | Lines starting with '#' or blank lines are ignored. |
| 17 | |
| 18 | The script obtains a Bearer token via the OAuth2 password grant and calls: |
| 19 | PUT <base-url>/<api-version>/vc/~<user>/<groupName>-VC (JSON body) |
| 20 | POST <base-url>/<api-version>/vc/~<user>/<groupName>-VC/share/@<groupName> |
| 21 | |
| 22 | Usage: |
| 23 | python3 import-c2-project-vc.py --file data/cdef.all.txt \\ |
| 24 | --url http://localhost:8080/api \\ |
| 25 | --api-version v1.0 \\ |
| 26 | --user admin --password pass \\ |
| 27 | --client-id <client_id> [--client-secret <client_secret>] |
| 28 | |
| 29 | Run with --help for all options. |
| 30 | """ |
| 31 | |
| 32 | import argparse |
| 33 | import json |
| 34 | import logging |
| 35 | import re |
| 36 | import sys |
| 37 | import urllib.error |
| 38 | import urllib.parse |
| 39 | import urllib.request |
| 40 | from dataclasses import dataclass |
| 41 | from pathlib import Path |
| 42 | |
# Root logging setup: INFO threshold, each record rendered as
# "<LEVEL> <message>". `log` is the module-level logger used throughout.
logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)
| 48 | |
# A data line of cdef.all.txt has three tab-separated columns:
#   <vcDescription><TAB>LOAD('<vcName>')<TAB>G_<groupName>
# Named groups: vcDesc (may be empty), vcName (text between the quotes),
# groupName (the last column without its "G_" prefix).
# Trailing whitespace after the group column is tolerated, matching what
# _NO_GROUP_RE already allows after LOAD(); without it a stray trailing
# space would make an otherwise valid line fail to match.
_LINE_RE = re.compile(
    r"^(?P<vcDesc>[^\t]*?)"
    r"\tLOAD\('(?P<vcName>[^']+)'\)"
    r"\tG_(?P<groupName>\S+)\s*$"
)

# Matches lines that end right after the LOAD() column, i.e. that carry no
# G_<groupName> column. Used to tell such lines apart from malformed ones.
_NO_GROUP_RE = re.compile(r"\tLOAD\('[^']+'\)\s*$")
| 59 | |
| 60 | |
| 61 | # --------------------------------------------------------------------------- |
| 62 | # Data class |
| 63 | # --------------------------------------------------------------------------- |
| 64 | |
@dataclass
class CdefEntry:
    """One user group from cdef.all.txt together with the corpora of its VC.

    Fields, in constructor order:
      vc_sigles  -- corpus sigles taken from LOAD('sigle1 sigle2 ...')
      vc_desc    -- description, taken from the first occurrence of the group
      group_name -- name of the user group
    """

    vc_sigles: list[str]
    vc_desc: str
    group_name: str

    @property
    def kustvakt_vc_name(self) -> str:
        """Return the name used for the VC in Kustvakt: ``<groupName>-VC``."""
        return "{}-VC".format(self.group_name)
| 75 | |
| 76 | |
| 77 | # --------------------------------------------------------------------------- |
| 78 | # Parsing |
| 79 | # --------------------------------------------------------------------------- |
| 80 | |
def parse_cdef(path: Path) -> list[CdefEntry]:
    """Parse a cdef.all.txt file into one CdefEntry per user group.

    Blank lines and lines whose first non-blank character is '#' are ignored.
    Lines matching _NO_GROUP_RE (LOAD() without a group column) are skipped
    with a DEBUG message; any other line that does not match _LINE_RE is
    skipped with a WARNING. Repeated lines for the same group are merged:
    the description of the first line is kept and sigles are accumulated
    without duplicates, in order of first appearance.

    Args:
        path: Location of the input file, read as UTF-8 (a leading byte
            order mark is accepted and dropped).

    Returns:
        The entries in order of each group's first occurrence.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Plain dicts preserve insertion order, i.e. first occurrence per group.
    by_group: dict[str, CdefEntry] = {}

    # "utf-8-sig" decodes ordinary UTF-8 and additionally removes a leading
    # BOM, which would otherwise stick to the first line and break both the
    # '#' comment check and the description of the first entry.
    text = path.read_text(encoding="utf-8-sig")

    for lineno, line in enumerate(text.splitlines(), 1):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        m = _LINE_RE.match(line)
        if m is None:
            # Lines with LOAD() but no G_<groupName> are expected; only
            # anything else is worth a warning.
            if _NO_GROUP_RE.search(line):
                log.debug("Line %d: no G_<groupName>, skipping: %s", lineno, line)
            else:
                log.warning("Line %d: unexpected format, skipping: %s", lineno, line)
            continue

        group_name = m.group("groupName")
        # LOAD() contains a space-separated list of corpus sigles.
        new_sigles = m.group("vcName").split()
        if not new_sigles:
            # e.g. LOAD('   '): would otherwise produce an empty corpus query.
            log.warning("Line %d: LOAD() contains no corpus sigle, skipping: %s",
                        lineno, line)
            continue

        existing = by_group.get(group_name)
        if existing is None:
            by_group[group_name] = CdefEntry(
                # dict.fromkeys drops repeats within the line but keeps order,
                # consistent with the de-duplication done when merging.
                vc_sigles=list(dict.fromkeys(new_sigles)),
                vc_desc=m.group("vcDesc").strip(),
                group_name=group_name,
            )
            continue

        # Merge: append only sigles that are not already present.
        seen = set(existing.vc_sigles)
        added = 0
        for sigle in new_sigles:
            if sigle not in seen:
                existing.vc_sigles.append(sigle)
                seen.add(sigle)
                added += 1
        log.debug("Line %d: merged %d sigle(s) into group '%s'",
                  lineno, added, group_name)

    return list(by_group.values())
| 120 | |
| 121 | |
| 122 | # --------------------------------------------------------------------------- |
| 123 | # HTTP helpers |
| 124 | # --------------------------------------------------------------------------- |
| 125 | |
def fetch_bearer_token(
    base_url: str,
    api_version: str,
    user: str,
    password: str,
    client_id: str,
    client_secret: str,
) -> str:
    """Obtain an OAuth2 Bearer token via the resource-owner password grant.

    Sends a form-encoded POST to ``<base_url>/<api_version>/oauth2/token``
    and returns the ``access_token`` field of the JSON response.

    Args:
        base_url: Service base URL without a trailing slash.
        api_version: API version path segment, e.g. ``v1.0``.
        user: Resource-owner username.
        password: Resource-owner password.
        client_id: OAuth2 client id.
        client_secret: OAuth2 client secret (sent as given, even if empty).

    Returns:
        The access token string.

    Raises:
        SystemExit: With status 1, after logging the reason, when the request
            fails, the response is not a JSON object, or it carries no
            ``access_token``.
    """
    token_url = f"{base_url}/{api_version}/oauth2/token"
    body = urllib.parse.urlencode({
        "grant_type": "password",
        "username": user,
        "password": password,
        "client_id": client_id,
        "client_secret": client_secret,
    }).encode()
    req = urllib.request.Request(
        token_url,
        data=body,
        method="POST",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
    )

    # HTTPError is a subclass of URLError, so it must be handled first.
    try:
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as exc:
        log.error("Token request failed (HTTP %s): %s",
                  exc.code, exc.read().decode(errors="replace"))
        sys.exit(1)
    except urllib.error.URLError as exc:
        log.error("Token request network error: %s", exc.reason)
        sys.exit(1)

    # A proxy error page or similar non-JSON answer must end in the same
    # logged exit as every other failure, not in a traceback.
    try:
        data = json.loads(raw.decode())
    except ValueError as exc:  # covers JSONDecodeError and UnicodeDecodeError
        log.error("Token response is not valid JSON: %s", exc)
        sys.exit(1)

    # Valid JSON need not be an object (e.g. a bare list or string).
    token = data.get("access_token") if isinstance(data, dict) else None
    if not token:
        log.error("No access_token in response: %s", data)
        sys.exit(1)

    log.debug("Bearer token obtained successfully.")
    return token
| 167 | |
| 168 | |
def _put_json(url: str, payload: dict, auth_header: str, dry_run: bool) -> int:
    """Send *payload* as a JSON document via HTTP PUT and report the status.

    Returns the HTTP status code of the response. An HTTP error response is
    logged and its code is returned; a network-level failure is logged and
    reported as 0. With *dry_run* set, nothing is sent: the call is logged
    and 201 is returned.
    """
    encoded_payload = json.dumps(payload).encode()
    request_headers = {
        "Authorization": auth_header,
        "Content-Type": "application/json;charset=utf-8",
    }
    # The request object is constructed before the dry-run check, exactly as
    # it would be for a real call.
    request = urllib.request.Request(
        url,
        data=encoded_payload,
        headers=request_headers,
        method="PUT",
    )

    if dry_run:
        log.info("[DRY-RUN] PUT %s body=%s", url, payload)
        return 201

    # HTTPError derives from URLError, hence the handler order.
    try:
        with urllib.request.urlopen(request) as response:
            status_code = response.status
    except urllib.error.HTTPError as err:
        error_text = err.read().decode(errors="replace")
        log.error("HTTP %s for PUT %s – %s", err.code, url, error_text)
        return err.code
    except urllib.error.URLError as err:
        log.error("Network error for PUT %s – %s", url, err.reason)
        return 0
    return status_code
| 194 | |
| 195 | |
def _post(url: str, auth_header: str, dry_run: bool) -> int:
    """Send an HTTP POST with an empty body and report the status.

    Returns the HTTP status code of the response. An HTTP error response is
    logged and its code is returned; a network-level failure is logged and
    reported as 0. With *dry_run* set, nothing is sent: the call is logged
    and 200 is returned.
    """
    # The request object is constructed before the dry-run check, exactly as
    # it would be for a real call.
    request = urllib.request.Request(
        url,
        data=b"",
        headers={"Authorization": auth_header},
        method="POST",
    )

    if dry_run:
        log.info("[DRY-RUN] POST %s", url)
        return 200

    # HTTPError derives from URLError, hence the handler order.
    try:
        with urllib.request.urlopen(request) as response:
            status_code = response.status
    except urllib.error.HTTPError as err:
        error_text = err.read().decode(errors="replace")
        log.error("HTTP %s for POST %s – %s", err.code, url, error_text)
        return err.code
    except urllib.error.URLError as err:
        log.error("Network error for POST %s – %s", url, err.reason)
        return 0
    return status_code
| 217 | |
| 218 | |
| 219 | # --------------------------------------------------------------------------- |
| 220 | # QueryJson builder |
| 221 | # --------------------------------------------------------------------------- |
| 222 | |
def _build_query_json(entry: CdefEntry) -> dict:
    """Build the JSON request body for creating/updating the VC of *entry*.

    The returned dict has four keys:

      type         always "PROJECT"
      queryType    always "VIRTUAL_CORPUS"
      corpusQuery  one corpusSigle="<sigle>" clause per sigle of the entry,
                   joined with " | " (OR)
      description  the entry's description

    According to the notes that accompanied this function, the keys map to
    the fields of Kustvakt's QueryJson class, corpusQuery is a KorAP
    collection query that Kustvakt serializes to KoralQuery itself, and
    queryType could be omitted because the server defaults it to
    VIRTUAL_CORPUS; it is sent anyway for clarity.
    """
    # NOTE(review): sigles are inserted verbatim; a sigle containing a double
    # quote would yield a malformed query - confirm the input never has one.
    clauses = ['corpusSigle="{}"'.format(sigle) for sigle in entry.vc_sigles]

    body = {}
    body["type"] = "PROJECT"
    body["queryType"] = "VIRTUAL_CORPUS"
    body["corpusQuery"] = " | ".join(clauses)
    body["description"] = entry.vc_desc
    return body
| 252 | |
| 253 | |
| 254 | # --------------------------------------------------------------------------- |
| 255 | # Import |
| 256 | # --------------------------------------------------------------------------- |
| 257 | |
def import_vc(
    entries: list[CdefEntry],
    base_url: str,
    api_version: str,
    creator: str,
    auth_header: str,
    share: bool,
    dry_run: bool,
) -> None:
    """Create or update one VC per entry and optionally share it.

    For every entry a PUT is sent to
    ``<base_url>/<api_version>/vc/~<creator>/<groupName>-VC``. Status 201 is
    counted as created and 204 as updated; any other status is logged as a
    failure and the entry is not shared. When *share* is true, a POST to
    ``<vc-url>/share/@<groupName>`` follows, with 200 counted as success.
    A summary of all counters is logged at the end.

    Args:
        entries: Parsed groups to import.
        base_url: Service base URL without a trailing slash.
        api_version: API version path segment.
        creator: Username owning the VCs.
        auth_header: Value for the Authorization header.
        share: Whether to share each VC with its group.
        dry_run: Passed through to the HTTP helpers, which then only log.
    """
    created = updated = failed = shared_ok = shared_err = 0

    # User, VC and group names become URL path segments, so they must be
    # percent-encoded: non-ASCII characters make urllib raise
    # UnicodeEncodeError, and '/', '?' or '#' would change the request target.
    # Plain ASCII names made of letters, digits and "_.-~" are unchanged.
    owner_segment = urllib.parse.quote(creator, safe="")

    for e in entries:
        vc_segment = urllib.parse.quote(e.kustvakt_vc_name, safe="")
        vc_url = f"{base_url}/{api_version}/vc/~{owner_segment}/{vc_segment}"
        payload = _build_query_json(e)

        # --- create / update VC ---
        status = _put_json(vc_url, payload, auth_header, dry_run)
        if status == 201:
            log.info("Created VC '%s' (group '%s')", e.kustvakt_vc_name, e.group_name)
            created += 1
        elif status == 204:
            log.info("Updated VC '%s' (group '%s')", e.kustvakt_vc_name, e.group_name)
            updated += 1
        else:
            log.error("Failed to create/update VC '%s' (HTTP %s) – skipping share",
                      e.kustvakt_vc_name, status)
            failed += 1
            continue

        if not share:
            continue

        # --- share with group ---
        group_segment = urllib.parse.quote(e.group_name, safe="")
        share_url = f"{vc_url}/share/@{group_segment}"
        status = _post(share_url, auth_header, dry_run)
        if status == 200:
            log.info(" Shared '%s' with group '%s'",
                     e.kustvakt_vc_name, e.group_name)
            shared_ok += 1
        else:
            log.error(" Failed to share '%s' with group '%s' (HTTP %s)",
                      e.kustvakt_vc_name, e.group_name, status)
            shared_err += 1

    log.info(
        "Done: %d created, %d updated, %d shared OK, %d share failed.",
        created, updated, shared_ok, shared_err,
    )
    # Failed creations are otherwise absent from the summary above.
    if failed:
        log.error("%d VC(s) could not be created or updated.", failed)
| 303 | |
| 304 | |
| 305 | # --------------------------------------------------------------------------- |
| 306 | # CLI |
| 307 | # --------------------------------------------------------------------------- |
| 308 | |
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the import script.

    The module docstring is used verbatim as the help description. Options
    are registered in the order listed below, which is also the order in
    which they appear in ``--help``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )

    # Options that take a value: (flag, keyword arguments for add_argument).
    valued_options = (
        ("--file", dict(
            default="data/cdef.all.txt",
            metavar="PATH",
            help="Path to cdef.all.txt (default: data/cdef.all.txt)",
        )),
        ("--url", dict(
            default="http://localhost:8080/api",
            metavar="URL",
            help="Kustvakt base URL, without trailing slash "
                 "(default: http://localhost:8080/api)",
        )),
        ("--api-version", dict(
            default="v1.0",
            metavar="VER",
            help="API version segment, e.g. v1.0 or v1.1 (default: v1.0)",
        )),
        ("--user", dict(
            default="admin",
            metavar="USERNAME",
            help="VC creator / resource-owner username (default: admin)",
        )),
        ("--password", dict(
            default=None,
            metavar="PASSWORD",
            help="Resource-owner password (prompted if omitted)",
        )),
        ("--client-id", dict(
            required=True,
            metavar="CLIENT_ID",
            help="OAuth2 client_id for the password grant",
        )),
        ("--client-secret", dict(
            default="",
            metavar="CLIENT_SECRET",
            help="OAuth2 client_secret (default: empty, for public clients)",
        )),
    )
    for flag, options in valued_options:
        parser.add_argument(flag, **options)

    # Boolean switches: (flag, help text); all default to False.
    switches = (
        ("--skip-share", "Create VCs only, do not share them with the groups"),
        ("--dry-run", "Print what would be done without sending any requests"),
        ("--verbose", "Enable DEBUG-level logging"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser
| 373 | |
| 374 | |
def main() -> None:
    """Run the importer from the command line.

    Steps: parse arguments, read and parse the input file, obtain a Bearer
    token (skipped for --dry-run), then create/update and share the VCs.

    The input file is parsed before authenticating, so a missing, unreadable
    or empty file is reported without prompting for a password or requesting
    a token.

    Raises:
        SystemExit: With status 1 when the input file is missing or cannot be
            read, or when the token request fails.
    """
    args = build_parser().parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    path = Path(args.file)
    # is_file() also rejects directories, which exists() would let through
    # only to fail later while reading.
    if not path.is_file():
        log.error("Input file not found: %s", path)
        sys.exit(1)

    try:
        entries = parse_cdef(path)
    except (OSError, UnicodeDecodeError) as exc:
        # e.g. permission denied or a file that is not valid UTF-8
        log.error("Cannot read %s: %s", path, exc)
        sys.exit(1)

    log.info("Parsed %d entr%s from %s",
             len(entries), "y" if len(entries) == 1 else "ies", path)

    if not entries:
        log.warning("No entries found – nothing to do.")
        return

    base_url = args.url.rstrip("/")

    if args.dry_run:
        # No request is sent in dry-run mode, so a placeholder suffices.
        auth_header = "Bearer <dry-run>"
    else:
        password = args.password
        if password is None:
            import getpass
            password = getpass.getpass(f"Password for '{args.user}': ")
        token = fetch_bearer_token(
            base_url=base_url,
            api_version=args.api_version,
            user=args.user,
            password=password,
            client_id=args.client_id,
            client_secret=args.client_secret,
        )
        auth_header = f"Bearer {token}"

    import_vc(
        entries=entries,
        base_url=base_url,
        api_version=args.api_version,
        creator=args.user,
        auth_header=auth_header,
        share=not args.skip_share,
        dry_run=args.dry_run,
    )
| 422 | |
| 423 | |
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()