Import C2 project VCs to Kustvakt.
Require cdef.all.txt
Change-Id: I9e07dc8fcf1fde925b7bb62eda8eccceeda28918
diff --git a/bin/import-c2-project-vc.py b/bin/import-c2-project-vc.py
new file mode 100644
index 0000000..d776da2
--- /dev/null
+++ b/bin/import-c2-project-vc.py
@@ -0,0 +1,425 @@
+#!/usr/bin/env python3
+"""
+@author margaretha with AI assistance
+
+
+import-c2-project-vc.py – Parse cdef.all.txt and create project VCs in Kustvakt.
+
+Input file format (tab-separated, one entry per line):
+ [vcDescription]\tLOAD('[vcName]')\tG_[groupName]
+
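+For each group (lines sharing the same G_<groupName> are merged) the script:
+ 1. Creates (or updates) a PROJECT virtual corpus named <groupName>-VC
+ owned by --user, whose corpus query is corpusSigle="<sigle>" | ... over the LOAD() sigles
+ 2. Shares that VC with the user group <groupName> (unless --skip-share is given)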
+
+Lines starting with '#' or blank lines are ignored.
+
+The script obtains a Bearer token via the OAuth2 password grant and calls:
+ PUT <base-url>/<api-version>/vc/~<user>/<groupName>-VC (JSON body)
+ POST <base-url>/<api-version>/vc/~<user>/<groupName>-VC/share/@<groupName>
+
+Usage:
+ python3 import-c2-project-vc.py --file data/cdef.all.txt \\
+ --url http://localhost:8080/api \\
+ --api-version v1.0 \\
+ --user admin --password pass \\
+ --client-id <client_id> [--client-secret <client_secret>]
+
+Run with --help for all options.
+"""
+
+import argparse
+import json
+import logging
+import re
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+from dataclasses import dataclass
+from pathlib import Path
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(levelname)s %(message)s",
+)
+log = logging.getLogger(__name__)
+
+# A complete cdef.all.txt data line has three TAB-separated columns:
+#   <vcDescription><TAB>LOAD('<vcName>')<TAB>G_<groupName>
+_LINE_RE = re.compile(
+    r"^(?P<vcDesc>[^\t]*?)"
+    r"\tLOAD\('(?P<vcName>[^']+)'\)"
+    r"\tG_(?P<groupName>\S+)$"
+)
+
+# LOAD('...') at end of line, i.e. no G_<groupName> column – skipped (DEBUG log only).
+_NO_GROUP_RE = re.compile(r"\tLOAD\('[^']+'\)\s*$")
+
+
+# ---------------------------------------------------------------------------
+# Data class
+# ---------------------------------------------------------------------------
+
+@dataclass
+class CdefEntry:
+    """One project VC to import: all sigles collected for a single user group."""
+    vc_sigles: list[str]  # merged from every LOAD('sigle1 sigle2 ...') of the group
+    vc_desc: str  # description column of the group's first line
+    group_name: str  # text after the G_ prefix
+
+    @property
+    def kustvakt_vc_name(self) -> str:
+        return "{}-VC".format(self.group_name)  # VC name in Kustvakt: <groupName>-VC
+
+
+# ---------------------------------------------------------------------------
+# Parsing
+# ---------------------------------------------------------------------------
+
+def parse_cdef(path: Path) -> list[CdefEntry]:
+    """
+    Read cdef.all.txt and return one CdefEntry per group, in first-seen order.
+
+    Lines for the same group are merged: their sigles are appended without
+    duplicates and the description of the first line is kept. Blank lines and
+    '#' comments are ignored; other non-matching lines are logged and skipped.
+    A leading UTF-8 byte-order mark is tolerated (utf-8-sig).
+    """
+    # Plain dicts keep insertion order, so groups come out as first seen.
+    by_group: dict[str, CdefEntry] = {}
+
+    text = path.read_text(encoding="utf-8-sig")
+    for lineno, line in enumerate(text.splitlines(), 1):
+        stripped = line.strip()
+        if not stripped or stripped.startswith("#"):
+            continue
+        m = _LINE_RE.match(line)
+        if m is None:
+            # LOAD() without a G_<groupName> column is expected: DEBUG only.
+            if _NO_GROUP_RE.search(line):
+                log.debug("Line %d: no G_<groupName>, skipping: %s", lineno, line)
+            else:
+                log.warning("Line %d: unexpected format, skipping: %s", lineno, line)
+            continue
+
+        group_name = m.group("groupName")
+        # LOAD() holds space-separated sigles; drop repeats, keep their order.
+        new_sigles = list(dict.fromkeys(m.group("vcName").split()))
+        entry = by_group.setdefault(group_name, CdefEntry(
+            vc_sigles=[], vc_desc=m.group("vcDesc").strip(), group_name=group_name))
+        # Append only the sigles this group does not have yet.
+        known = set(entry.vc_sigles)
+        added = [s for s in new_sigles if s not in known]
+        entry.vc_sigles.extend(added)
+        log.debug("Line %d: added %d new sigle(s) to group '%s'",
+                  lineno, len(added), group_name)
+
+    return list(by_group.values())
+
+
+# ---------------------------------------------------------------------------
+# HTTP helpers
+# ---------------------------------------------------------------------------
+
+def fetch_bearer_token(
+    base_url: str,
+    api_version: str,
+    user: str,
+    password: str,
+    client_id: str,
+    client_secret: str,
+) -> str:
+    """Obtain an OAuth2 Bearer token via the resource-owner password grant.
+
+    POSTs the form-encoded credentials to <base_url>/<api_version>/oauth2/token
+    and returns the access_token field of the JSON reply. Any failure,
+    including a reply that is not a JSON object, is logged and exits with 1.
+    """
+    token_url = f"{base_url}/{api_version}/oauth2/token"
+    form = {"grant_type": "password", "username": user, "password": password,
+            "client_id": client_id, "client_secret": client_secret}
+    req = urllib.request.Request(
+        token_url, data=urllib.parse.urlencode(form).encode(), method="POST",
+        headers={"Content-Type": "application/x-www-form-urlencoded"})
+    try:
+        with urllib.request.urlopen(req) as resp:
+            raw = resp.read()
+    except urllib.error.HTTPError as exc:
+        log.error("Token request failed (HTTP %s): %s",
+                  exc.code, exc.read().decode(errors="replace"))
+        sys.exit(1)
+    except urllib.error.URLError as exc:
+        log.error("Token request network error: %s", exc.reason)
+        sys.exit(1)
+    try:
+        data = json.loads(raw.decode())
+    except ValueError as exc:  # JSONDecodeError and UnicodeDecodeError
+        log.error("Token response is not valid JSON: %s", exc)
+        sys.exit(1)
+    token = data.get("access_token") if isinstance(data, dict) else None
+    if not token:
+        log.error("No access_token in response: %s", data)
+        sys.exit(1)
+    log.debug("Bearer token obtained successfully.")
+    return token
+
+
+def _put_json(url: str, payload: dict, auth_header: str, dry_run: bool) -> int:
+    """PUT *payload* as JSON to *url* and return the HTTP status code.
+
+    An HTTP error yields its code, a network error 0 (both are logged); with
+    *dry_run* nothing is sent and 201 is returned.
+    """
+    headers = {
+        "Authorization": auth_header,
+        "Content-Type": "application/json;charset=utf-8",
+    }
+    encoded = json.dumps(payload).encode()
+    request = urllib.request.Request(url, data=encoded, method="PUT", headers=headers)
+    if dry_run:
+        log.info("[DRY-RUN] PUT %s body=%s", url, payload)
+        return 201
+    try:
+        with urllib.request.urlopen(request) as response:
+            return response.status
+    except urllib.error.HTTPError as http_err:
+        log.error("HTTP %s for PUT %s – %s",
+                  http_err.code, url, http_err.read().decode(errors="replace"))
+        return http_err.code
+    except urllib.error.URLError as net_err:
+        log.error("Network error for PUT %s – %s", url, net_err.reason)
+        return 0
+
+
+def _post(url: str, auth_header: str, dry_run: bool) -> int:
+    """POST an empty body to *url* and return the HTTP status code.
+
+    An HTTP error yields its code, a network error 0 (both are logged); with
+    *dry_run* nothing is sent and 200 is returned.
+    """
+    request = urllib.request.Request(
+        url, data=b"", method="POST", headers={"Authorization": auth_header})
+    if dry_run:
+        log.info("[DRY-RUN] POST %s", url)
+        return 200
+    try:
+        with urllib.request.urlopen(request) as response:
+            return response.status
+    except urllib.error.HTTPError as http_err:
+        log.error("HTTP %s for POST %s – %s",
+                  http_err.code, url, http_err.read().decode(errors="replace"))
+        return http_err.code
+    except urllib.error.URLError as net_err:
+        log.error("Network error for POST %s – %s", url, net_err.reason)
+        return 0
+
+
+# ---------------------------------------------------------------------------
+# QueryJson builder
+# ---------------------------------------------------------------------------
+
+def _build_query_json(entry: CdefEntry) -> dict:
+    """
+    Build the JSON request body for creating/updating the VC of *entry*.
+
+    The keys correspond to the fields of Kustvakt's QueryJson POJO:
+
+      field        │ Java type     │ value sent
+      ─────────────┼───────────────┼──────────────────────────────
+      type         │ ResourceType  │ "PROJECT"
+      queryType    │ QueryType     │ "VIRTUAL_CORPUS"
+      corpusQuery  │ String        │ corpusSigle="A" | corpusSigle="B" ...
+      description  │ String        │ the entry's description
+
+    corpusQuery is a KorAP collection query string; Kustvakt itself turns it
+    into KoralQuery JSON (via Koral QuerySerializer). All sigles collected
+    for the group are OR-ed together with " | ".
+
+    queryType may be omitted, since VirtualCorpusController.createUpdateVC
+    falls back to VIRTUAL_CORPUS when it is null; it is sent for clarity.
+    """
+    clauses = []
+    for sigle in entry.vc_sigles:
+        clauses.append(f'corpusSigle="{sigle}"')
+    return dict(
+        type="PROJECT",
+        queryType="VIRTUAL_CORPUS",
+        corpusQuery=" | ".join(clauses),
+        description=entry.vc_desc,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Import
+# ---------------------------------------------------------------------------
+
+def import_vc(
+    entries: list[CdefEntry], base_url: str, api_version: str, creator: str,
+    auth_header: str, share: bool, dry_run: bool,
+) -> None:
+    """Create or update one PROJECT VC per entry and optionally share it.
+
+    Each VC is PUT to <base_url>/<api_version>/vc/~<creator>/<groupName>-VC
+    (201 counts as created, 204 as updated, anything else as failed and the
+    share step is skipped). With *share*, a POST to .../share/@<groupName>
+    follows (200 = shared). User, VC and group names are percent-encoded so
+    that non-ASCII or reserved characters cannot corrupt the request URL.
+    A summary of all counters is logged at the end.
+    """
+    created = updated = failed = shared_ok = shared_err = 0
+    vc_root = f"{base_url}/{api_version}/vc/~{urllib.parse.quote(creator, safe='')}"
+
+    for e in entries:
+        vc_url = f"{vc_root}/{urllib.parse.quote(e.kustvakt_vc_name, safe='')}"
+        status = _put_json(vc_url, _build_query_json(e), auth_header, dry_run)
+        if status == 201:
+            log.info("Created VC '%s' (group '%s')", e.kustvakt_vc_name, e.group_name)
+            created += 1
+        elif status == 204:
+            log.info("Updated VC '%s' (group '%s')", e.kustvakt_vc_name, e.group_name)
+            updated += 1
+        else:
+            log.error("Failed to create/update VC '%s' (HTTP %s) – skipping share",
+                      e.kustvakt_vc_name, status)
+            failed += 1
+            continue
+        if not share:
+            continue
+
+        share_url = f"{vc_url}/share/@{urllib.parse.quote(e.group_name, safe='')}"
+        status = _post(share_url, auth_header, dry_run)
+        if status == 200:
+            log.info(" Shared '%s' with group '%s'", e.kustvakt_vc_name, e.group_name)
+            shared_ok += 1
+        else:
+            log.error(" Failed to share '%s' with group '%s' (HTTP %s)",
+                      e.kustvakt_vc_name, e.group_name, status)
+            shared_err += 1
+
+    log.info("Done: %d created, %d updated, %d failed, %d shared OK, %d share failed.",
+             created, updated, failed, shared_ok, shared_err)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser for this script.
+
+    The module docstring is printed verbatim as the --help description; only
+    --client-id is required, every other option has a default.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+    )
+    # (flag, add_argument keywords), listed in the order shown by --help.
+    options = (
+        ("--file", {
+            "default": "data/cdef.all.txt",
+            "metavar": "PATH",
+            "help": "Path to cdef.all.txt (default: data/cdef.all.txt)",
+        }),
+        ("--url", {
+            "default": "http://localhost:8080/api",
+            "metavar": "URL",
+            "help": "Kustvakt base URL, without trailing slash "
+                    "(default: http://localhost:8080/api)",
+        }),
+        ("--api-version", {
+            "default": "v1.0",
+            "metavar": "VER",
+            "help": "API version segment, e.g. v1.0 or v1.1 (default: v1.0)",
+        }),
+        ("--user", {
+            "default": "admin",
+            "metavar": "USERNAME",
+            "help": "VC creator / resource-owner username (default: admin)",
+        }),
+        ("--password", {
+            "default": None,
+            "metavar": "PASSWORD",
+            "help": "Resource-owner password (prompted if omitted)",
+        }),
+        ("--client-id", {
+            "required": True,
+            "metavar": "CLIENT_ID",
+            "help": "OAuth2 client_id for the password grant",
+        }),
+        ("--client-secret", {
+            "default": "",
+            "metavar": "CLIENT_SECRET",
+            "help": "OAuth2 client_secret (default: empty, for public clients)",
+        }),
+        ("--skip-share", {
+            "action": "store_true",
+            "help": "Create VCs only, do not share them with the groups",
+        }),
+        ("--dry-run", {
+            "action": "store_true",
+            "help": "Print what would be done without sending any requests",
+        }),
+        ("--verbose", {
+            "action": "store_true",
+            "help": "Enable DEBUG-level logging",
+        }),
+    )
+    for flag, keywords in options:
+        parser.add_argument(flag, **keywords)
+    return parser
+
+
+def main() -> None:
+    """Command-line entry point.
+
+    Parses the arguments and the cdef file first, so that a missing or empty
+    input file is reported before any password prompt or token request, then
+    authenticates (skipped with --dry-run) and imports the entries.
+    """
+    args = build_parser().parse_args()
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    path = Path(args.file)
+    if not path.is_file():  # unlike exists(), this also rejects directories
+        log.error("Input file not found: %s", path)
+        sys.exit(1)
+
+    # Parse before authenticating: reporting a bad file needs no credentials.
+    entries = parse_cdef(path)
+    log.info("Parsed %d entr%s from %s",
+             len(entries), "y" if len(entries) == 1 else "ies", path)
+    if not entries:
+        log.warning("No entries found – nothing to do.")
+        return
+
+    base_url = args.url.rstrip("/")
+    if args.dry_run:
+        auth_header = "Bearer <dry-run>"  # placeholder, never sent
+    else:
+        password = args.password
+        if password is None:
+            import getpass
+            password = getpass.getpass(f"Password for '{args.user}': ")
+        token = fetch_bearer_token(
+            base_url=base_url,
+            api_version=args.api_version,
+            user=args.user,
+            password=password,
+            client_id=args.client_id,
+            client_secret=args.client_secret,
+        )
+        auth_header = f"Bearer {token}"
+
+    import_vc(
+        entries=entries, base_url=base_url, api_version=args.api_version,
+        creator=args.user, auth_header=auth_header,
+        share=not args.skip_share, dry_run=args.dry_run,
+    )
+
+
+if __name__ == "__main__":
+    main()