blob: d776da2a61174c667aaab98388d8e91dc539e892 [file] [log] [blame]
margaretha4d8f8e22026-03-19 11:58:29 +01001#!/usr/bin/env python3
2"""
3@author margaretha with AI assistance
4
5
6import-c2-project-vc.py – Parse cdef.all.txt and create project VCs in Kustvakt.
7
8Input file format (tab-separated, one entry per line):
9 [vcDescription]\tLOAD('[vcName]')\tG_[groupName]
10
11For each line the script:
12 1. Creates (or updates) a PROJECT virtual corpus named <groupName>-VC
13 owned by --user, using the corpus query corpusSigle="<sigle1>" | corpusSigle="<sigle2>" | ... built from the sigles in LOAD()
14 2. Shares that VC with the user group <groupName>
15
16Lines starting with '#' or blank lines are ignored.
17
18The script obtains a Bearer token via the OAuth2 password grant and calls:
19 PUT <base-url>/<api-version>/vc/~<user>/<groupName>-VC (JSON body)
20 POST <base-url>/<api-version>/vc/~<user>/<groupName>-VC/share/@<groupName>
21
22Usage:
23 python3 import-c2-project-vc.py --file data/cdef.all.txt \\
24 --url http://localhost:8080/api \\
25 --api-version v1.0 \\
26 --user admin --password pass \\
27 --client-id <client_id> [--client-secret <client_secret>]
28
29Run with --help for all options.
30"""
31
32import argparse
33import json
34import logging
35import re
36import sys
37import urllib.error
38import urllib.parse
39import urllib.request
40from dataclasses import dataclass
41from pathlib import Path
42
# Configure the root logger once at import time: INFO level, with each record
# rendered as "<LEVEL> <message>". main() lowers the level to DEBUG when
# --verbose is given.
_LOG_FORMAT = "%(levelname)s %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)
48
# A complete cdef.all.txt record consists of three tab-separated columns:
#   <vcDescription><TAB>LOAD('<vcName>')<TAB>G_<groupName>
_LINE_RE = re.compile(
    "".join((
        r"^(?P<vcDesc>[^\t]*?)",            # column 1: description (may be empty)
        r"\tLOAD\('(?P<vcName>[^']+)'\)",    # column 2: LOAD('...') payload
        r"\tG_(?P<groupName>\S+)$",          # column 3: group name after "G_"
    ))
)

# A record whose last column is LOAD('...'), i.e. the G_<groupName> column is
# missing. parse_cdef() uses this to tell such lines apart from malformed ones.
_NO_GROUP_RE = re.compile(
    r"\tLOAD\('[^']+'\)\s*$"
)
59
60
61# ---------------------------------------------------------------------------
62# Data class
63# ---------------------------------------------------------------------------
64
@dataclass
class CdefEntry:
    """One project VC to import, aggregated over all cdef lines of a group.

    Attributes:
        vc_sigles: corpus sigles taken from LOAD('sigle1 sigle2 ...').
        vc_desc: description, taken from the group's first occurrence.
        group_name: name of the user group (the text following ``G_``).
    """

    vc_sigles: list[str]
    vc_desc: str
    group_name: str

    @property
    def kustvakt_vc_name(self) -> str:
        """Return the name under which the VC is stored in Kustvakt.

        The name is the group name with the suffix ``-VC``.
        """
        return "{}-VC".format(self.group_name)
75
76
77# ---------------------------------------------------------------------------
78# Parsing
79# ---------------------------------------------------------------------------
80
def parse_cdef(path: Path) -> list[CdefEntry]:
    """Parse a cdef file into one :class:`CdefEntry` per user group.

    Each relevant line has the form
    ``<vcDescription><TAB>LOAD('<sigles>')<TAB>G_<groupName>``.
    Blank lines and lines starting with ``#`` are ignored. Lines without a
    ``G_<groupName>`` column are skipped with a DEBUG message, and any other
    non-matching line is skipped with a WARNING.

    When a group occurs on several lines, the sigles of the later lines are
    appended to the first entry (duplicates dropped); the description of the
    first occurrence is kept.

    Args:
        path: location of the cdef file (UTF-8, an optional BOM is tolerated).

    Returns:
        The entries in order of each group's first occurrence.
    """
    # Plain dicts preserve insertion order, so groups come out first-seen.
    by_group: dict[str, CdefEntry] = {}

    # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading BOM,
    # which would otherwise end up in the first line's description or hide a
    # leading "#".
    text = path.read_text(encoding="utf-8-sig")

    for lineno, raw in enumerate(text.splitlines(), 1):
        # Strip trailing whitespace so that stray blanks/tabs after the last
        # column do not make a valid line fail the "$"-anchored pattern.
        line = raw.rstrip()
        if not line or line.lstrip().startswith("#"):
            continue

        m = _LINE_RE.match(line)
        if m is None:
            if _NO_GROUP_RE.search(line):
                # LOAD() without a group column: nothing to share with.
                log.debug("Line %d: no G_<groupName>, skipping: %s", lineno, line)
            else:
                log.warning("Line %d: unexpected format, skipping: %s", lineno, line)
            continue

        group_name = m.group("groupName")
        # LOAD() holds a whitespace-separated sigle list; dict.fromkeys drops
        # repeats within the line while keeping their order.
        new_sigles = list(dict.fromkeys(m.group("vcName").split()))
        if not new_sigles:
            # e.g. LOAD('   '): would otherwise produce an empty corpus query.
            log.warning("Line %d: LOAD() contains no corpus sigle, skipping: %s",
                        lineno, line)
            continue

        entry = by_group.get(group_name)
        if entry is None:
            by_group[group_name] = CdefEntry(
                vc_sigles=new_sigles,
                vc_desc=m.group("vcDesc").strip(),
                group_name=group_name,
            )
            continue

        # Merge into the existing entry, appending only unseen sigles.
        seen = set(entry.vc_sigles)
        added = [s for s in new_sigles if s not in seen]
        entry.vc_sigles.extend(added)
        log.debug("Line %d: merged %d sigle(s) into group '%s'",
                  lineno, len(added), group_name)

    return list(by_group.values())
120
121
122# ---------------------------------------------------------------------------
123# HTTP helpers
124# ---------------------------------------------------------------------------
125
def fetch_bearer_token(
    base_url: str,
    api_version: str,
    user: str,
    password: str,
    client_id: str,
    client_secret: str,
) -> str:
    """Obtain an OAuth2 Bearer token via the resource-owner password grant.

    Sends a form-encoded POST to ``<base_url>/<api_version>/oauth2/token``
    and returns the ``access_token`` field of the JSON response.

    Any failure (HTTP error status, network error, a response that is not a
    JSON object, or a missing/empty ``access_token``) is logged and
    terminates the process via ``sys.exit(1)``.
    """
    token_url = f"{base_url}/{api_version}/oauth2/token"
    body = urllib.parse.urlencode({
        "grant_type": "password",
        "username": user,
        "password": password,
        "client_id": client_id,
        "client_secret": client_secret,
    }).encode()
    req = urllib.request.Request(
        token_url,
        data=body,
        method="POST",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
    )
    try:
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as exc:
        log.error("Token request failed (HTTP %s): %s",
                  exc.code, exc.read().decode(errors="replace"))
        sys.exit(1)
    except urllib.error.URLError as exc:
        log.error("Token request network error: %s", exc.reason)
        sys.exit(1)

    # A 2xx answer is not necessarily JSON (e.g. an HTML page from a proxy).
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError
    # subclasses, so one handler covers undecodable and unparsable bodies.
    try:
        data = json.loads(raw.decode())
    except ValueError as exc:
        log.error("Token response is not valid JSON: %s", exc)
        sys.exit(1)

    # Valid JSON may still be a list/string/number; only objects have fields.
    token = data.get("access_token") if isinstance(data, dict) else None
    if not token:
        log.error("No access_token in response: %s", data)
        sys.exit(1)

    log.debug("Bearer token obtained successfully.")
    return token
167
168
def _put_json(url: str, payload: dict, auth_header: str, dry_run: bool) -> int:
    """Send *payload* as a JSON body in a PUT request to *url*.

    Returns the HTTP status code of the response. An HTTP error status is
    logged and returned as-is; a network-level error is logged and reported
    as ``0``. With *dry_run* set nothing is sent: the request is only logged
    and ``201`` is returned.
    """
    request_headers = {
        "Authorization": auth_header,
        "Content-Type": "application/json;charset=utf-8",
    }
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        method="PUT",
        headers=request_headers,
    )

    if dry_run:
        log.info("[DRY-RUN] PUT %s body=%s", url, payload)
        return 201

    try:
        with urllib.request.urlopen(request) as response:
            status_code = response.status
    except urllib.error.HTTPError as http_err:
        error_body = http_err.read().decode(errors="replace")
        log.error("HTTP %s for PUT %s – %s", http_err.code, url, error_body)
        status_code = http_err.code
    except urllib.error.URLError as net_err:
        log.error("Network error for PUT %s – %s", url, net_err.reason)
        status_code = 0
    return status_code
194
195
def _post(url: str, auth_header: str, dry_run: bool) -> int:
    """Send a POST request with an empty body to *url*.

    Returns the HTTP status code of the response. An HTTP error status is
    logged and returned as-is; a network-level error is logged and reported
    as ``0``. With *dry_run* set nothing is sent: the request is only logged
    and ``200`` is returned.
    """
    request = urllib.request.Request(
        url,
        data=b"",
        method="POST",
        headers={"Authorization": auth_header},
    )

    if dry_run:
        log.info("[DRY-RUN] POST %s", url)
        return 200

    try:
        with urllib.request.urlopen(request) as response:
            status_code = response.status
    except urllib.error.HTTPError as http_err:
        error_body = http_err.read().decode(errors="replace")
        log.error("HTTP %s for POST %s – %s", http_err.code, url, error_body)
        status_code = http_err.code
    except urllib.error.URLError as net_err:
        log.error("Network error for POST %s – %s", url, net_err.reason)
        status_code = 0
    return status_code
217
218
219# ---------------------------------------------------------------------------
220# QueryJson builder
221# ---------------------------------------------------------------------------
222
def _build_query_json(entry: CdefEntry) -> dict:
    """Build the JSON request body used to create or update the VC of *entry*.

    The returned dict has four keys, in this order:

    * ``type``        - always ``"PROJECT"``
    * ``queryType``   - always ``"VIRTUAL_CORPUS"``
    * ``corpusQuery`` - one ``corpusSigle="<sigle>"`` term per sigle of the
      entry, joined with ``" | "`` (OR)
    * ``description`` - the entry's description

    The original author notes that Kustvakt defaults ``queryType`` to
    VIRTUAL_CORPUS when it is omitted; it is sent explicitly for clarity.
    """
    sigle_terms = ['corpusSigle="{}"'.format(sigle) for sigle in entry.vc_sigles]
    query_json = {
        "type": "PROJECT",
        "queryType": "VIRTUAL_CORPUS",
    }
    query_json["corpusQuery"] = " | ".join(sigle_terms)
    query_json["description"] = entry.vc_desc
    return query_json
252
253
254# ---------------------------------------------------------------------------
255# Import
256# ---------------------------------------------------------------------------
257
def import_vc(
    entries: list[CdefEntry],
    base_url: str,
    api_version: str,
    creator: str,
    auth_header: str,
    share: bool,
    dry_run: bool,
) -> None:
    """Create or update one project VC per entry and optionally share it.

    For every entry a PUT is sent to
    ``<base_url>/<api_version>/vc/~<creator>/<groupName>-VC``. Status 201
    counts as created and 204 as updated; any other status is logged as a
    failure and the entry is not shared. When *share* is true, a POST to
    ``<vc-url>/share/@<groupName>`` follows, which is expected to return 200.

    Args:
        entries: parsed cdef entries, one per user group.
        base_url: service base URL without trailing slash.
        api_version: API version path segment.
        creator: username that owns the VCs.
        auth_header: value for the ``Authorization`` request header.
        share: whether to share each VC with its group.
        dry_run: passed through to the HTTP helpers, which then only log.

    A summary of all counters is logged at the end.
    """
    created = updated = failed = shared_ok = shared_err = 0

    # Names are percent-encoded as single path segments (safe=""), so that
    # reserved characters such as "/", "?" or "#" cannot change the request
    # target and non-ASCII names do not make urllib fail while encoding the
    # request line. Names made of letters, digits and "_.-~" are unchanged.
    owner_url = f"{base_url}/{api_version}/vc/~{urllib.parse.quote(creator, safe='')}"

    for e in entries:
        vc_url = f"{owner_url}/{urllib.parse.quote(e.kustvakt_vc_name, safe='')}"
        payload = _build_query_json(e)

        # --- create / update VC ---
        status = _put_json(vc_url, payload, auth_header, dry_run)
        if status == 201:
            log.info("Created VC '%s' (group '%s')", e.kustvakt_vc_name, e.group_name)
            created += 1
        elif status == 204:
            log.info("Updated VC '%s' (group '%s')", e.kustvakt_vc_name, e.group_name)
            updated += 1
        else:
            log.error("Failed to create/update VC '%s' (HTTP %s) – skipping share",
                      e.kustvakt_vc_name, status)
            failed += 1
            continue

        if not share:
            continue

        # --- share with group ---
        share_url = f"{vc_url}/share/@{urllib.parse.quote(e.group_name, safe='')}"
        status = _post(share_url, auth_header, dry_run)
        if status == 200:
            log.info(" Shared '%s' with group '%s'",
                     e.kustvakt_vc_name, e.group_name)
            shared_ok += 1
        else:
            log.error(" Failed to share '%s' with group '%s' (HTTP %s)",
                      e.kustvakt_vc_name, e.group_name, status)
            shared_err += 1

    log.info(
        "Done: %d created, %d updated, %d shared OK, %d share failed.",
        created, updated, shared_ok, shared_err,
    )
    # Create/update failures are not part of the summary line above; report
    # them separately so they are not lost in a long log.
    if failed:
        log.warning("%d VC(s) could not be created or updated.", failed)
303
304
305# ---------------------------------------------------------------------------
306# CLI
307# ---------------------------------------------------------------------------
308
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    The module docstring is used verbatim as the help description. All
    options are declared in one table and registered in table order, which
    is also the order in which they appear in ``--help``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flag, keyword arguments for add_argument)
    option_table = (
        ("--file", {
            "default": "data/cdef.all.txt",
            "metavar": "PATH",
            "help": "Path to cdef.all.txt (default: data/cdef.all.txt)",
        }),
        ("--url", {
            "default": "http://localhost:8080/api",
            "metavar": "URL",
            "help": "Kustvakt base URL, without trailing slash "
                    "(default: http://localhost:8080/api)",
        }),
        ("--api-version", {
            "default": "v1.0",
            "metavar": "VER",
            "help": "API version segment, e.g. v1.0 or v1.1 (default: v1.0)",
        }),
        ("--user", {
            "default": "admin",
            "metavar": "USERNAME",
            "help": "VC creator / resource-owner username (default: admin)",
        }),
        ("--password", {
            "default": None,
            "metavar": "PASSWORD",
            "help": "Resource-owner password (prompted if omitted)",
        }),
        ("--client-id", {
            "required": True,
            "metavar": "CLIENT_ID",
            "help": "OAuth2 client_id for the password grant",
        }),
        ("--client-secret", {
            "default": "",
            "metavar": "CLIENT_SECRET",
            "help": "OAuth2 client_secret (default: empty, for public clients)",
        }),
        ("--skip-share", {
            "action": "store_true",
            "help": "Create VCs only, do not share them with the groups",
        }),
        ("--dry-run", {
            "action": "store_true",
            "help": "Print what would be done without sending any requests",
        }),
        ("--verbose", {
            "action": "store_true",
            "help": "Enable DEBUG-level logging",
        }),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser
373
374
def main() -> None:
    """Command-line entry point: parse the cdef file and import its VCs.

    Steps:
      1. Parse the command line; ``--verbose`` switches the root logger to
         DEBUG.
      2. Parse the input file. The process exits with status 1 if the path
         is not an existing regular file, and returns early if the file
         yields no entries.
      3. Unless ``--dry-run`` is given, obtain a Bearer token (prompting for
         the password when ``--password`` was omitted).
      4. Create/update and, unless ``--skip-share`` is given, share the VCs.
    """
    args = build_parser().parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    path = Path(args.file)
    # is_file() instead of exists(): a directory would pass an existence
    # check and then fail with an unhandled error when it is read.
    if not path.is_file():
        log.error("Input file not found: %s", path)
        sys.exit(1)

    # Parse before authenticating, so that an empty or unusable input file
    # neither prompts for a password nor contacts the server.
    entries = parse_cdef(path)
    log.info("Parsed %d entr%s from %s",
             len(entries), "y" if len(entries) == 1 else "ies", path)

    if not entries:
        log.warning("No entries found – nothing to do.")
        return

    base_url = args.url.rstrip("/")

    if args.dry_run:
        # No request is sent in dry-run mode, so a placeholder is sufficient.
        auth_header = "Bearer <dry-run>"
    else:
        password = args.password
        if password is None:
            import getpass
            password = getpass.getpass(f"Password for '{args.user}': ")
        token = fetch_bearer_token(
            base_url=base_url,
            api_version=args.api_version,
            user=args.user,
            password=password,
            client_id=args.client_id,
            client_secret=args.client_secret,
        )
        auth_header = f"Bearer {token}"

    import_vc(
        entries=entries,
        base_url=base_url,
        api_version=args.api_version,
        creator=args.user,
        auth_header=auth_header,
        share=not args.skip_share,
        dry_run=args.dry_run,
    )


if __name__ == "__main__":
    main()