blob: 934799411bf5b08f788a9cd055577d51ae26c086 [file] [log] [blame]
Akron57ee5582025-05-21 15:25:13 +02001package config
2
3import (
4 "fmt"
Akrondaca3142026-05-21 13:00:45 +02005 "net/url"
Akron57ee5582025-05-21 15:25:13 +02006 "os"
Akroned787d02026-05-20 12:31:07 +02007 "path/filepath"
Akronf98ba282026-02-24 11:13:30 +01008 "strconv"
Akroned787d02026-05-20 12:31:07 +02009 "strings"
Akron57ee5582025-05-21 15:25:13 +020010
Akron2ef703c2025-07-03 15:57:42 +020011 "github.com/KorAP/Koral-Mapper/ast"
12 "github.com/KorAP/Koral-Mapper/parser"
Akron7e8da932025-07-01 11:56:46 +020013 "github.com/rs/zerolog/log"
Akron57ee5582025-05-21 15:25:13 +020014 "gopkg.in/yaml.v3"
15)
16
// Fallback values that ApplyDefaults assigns to configuration fields which
// are still empty (strings) or zero (numbers).
const (
	defaultServer     = "https://korap.ids-mannheim.de/"
	defaultSDK        = "https://korap.ids-mannheim.de/js/korap-plugin-latest.js"
	defaultStylesheet = "https://korap.ids-mannheim.de/css/kalamar-plugin-latest.css"
	defaultServiceURL = "https://korap.ids-mannheim.de/plugin/koralmapper"
	defaultCookieName = "km-config"
	defaultPort       = 5725
	defaultLogLevel   = "warn"
	defaultRateLimit  = 100
)
27
// MappingRule represents a single mapping rule in the configuration.
// It holds the raw, unparsed rule text; ParseMappings and
// ParseCorpusMappings hand it to the respective parser as a string.
type MappingRule string
30
// MappingList represents a list of mapping rules with metadata.
//
// FoundryA/LayerA and FoundryB/LayerB are defaults applied by ParseMappings
// to the upper and lower side of each rule; FieldA and FieldB are the
// default keys applied by ParseCorpusMappings. Rewrites is a pointer so that
// "not set" (nil) can be told apart from an explicit false; see
// EffectiveRewrites.
type MappingList struct {
	ID          string        `yaml:"id"`
	Type        string        `yaml:"type,omitempty"` // "annotation" (default) or "corpus"
	Description string        `yaml:"desc,omitempty"`
	FoundryA    string        `yaml:"foundryA,omitempty"`
	LayerA      string        `yaml:"layerA,omitempty"`
	FoundryB    string        `yaml:"foundryB,omitempty"`
	LayerB      string        `yaml:"layerB,omitempty"`
	FieldA      string        `yaml:"fieldA,omitempty"`
	FieldB      string        `yaml:"fieldB,omitempty"`
	Rewrites    *bool         `yaml:"rewrites,omitempty"`
	Mappings    []MappingRule `yaml:"mappings"`
}
45
Akron2f93c582026-02-19 16:49:13 +010046// IsCorpus returns true if the mapping list type is "corpus".
47func (list *MappingList) IsCorpus() bool {
48 return list.Type == "corpus"
49}
50
Akronf7bba072026-05-21 12:36:19 +020051// EffectiveRewrites returns the resolved rewrites setting for this list.
52// If the list has an explicit per-list override, it is used; otherwise the
53// global default is returned.
54func (list *MappingList) EffectiveRewrites(globalDefault bool) bool {
55 if list.Rewrites != nil {
56 return *list.Rewrites
57 }
58 return globalDefault
59}
60
Akron2f93c582026-02-19 16:49:13 +010061// ParseCorpusMappings parses all mapping rules as corpus rules.
Akrona67de8f2026-02-23 17:54:26 +010062// Bare values (without key=) are always allowed and receive the default
63// field name from the mapping list header (FieldA/FieldB) when set.
Akron2f93c582026-02-19 16:49:13 +010064func (list *MappingList) ParseCorpusMappings() ([]*parser.CorpusMappingResult, error) {
65 corpusParser := parser.NewCorpusParser()
Akrona67de8f2026-02-23 17:54:26 +010066 corpusParser.AllowBareValues = true
67
Akron2f93c582026-02-19 16:49:13 +010068 results := make([]*parser.CorpusMappingResult, len(list.Mappings))
69 for i, rule := range list.Mappings {
70 if rule == "" {
71 return nil, fmt.Errorf("empty corpus mapping rule at index %d in list '%s'", i, list.ID)
72 }
73 result, err := corpusParser.ParseMapping(string(rule))
74 if err != nil {
75 return nil, fmt.Errorf("failed to parse corpus mapping rule %d in list '%s': %w", i, list.ID, err)
76 }
Akrona67de8f2026-02-23 17:54:26 +010077
78 if list.FieldA != "" {
79 applyDefaultCorpusKey(result.Upper, list.FieldA)
80 }
81 if list.FieldB != "" {
82 applyDefaultCorpusKey(result.Lower, list.FieldB)
83 }
84
Akron2f93c582026-02-19 16:49:13 +010085 results[i] = result
86 }
87 return results, nil
88}
89
Akrona67de8f2026-02-23 17:54:26 +010090// applyDefaultCorpusKey recursively fills in empty keys on CorpusField nodes.
91func applyDefaultCorpusKey(node parser.CorpusNode, defaultKey string) {
92 switch n := node.(type) {
93 case *parser.CorpusField:
94 if n.Key == "" {
95 n.Key = defaultKey
96 }
97 case *parser.CorpusGroup:
98 for _, op := range n.Operands {
99 applyDefaultCorpusKey(op, defaultKey)
100 }
101 }
102}
103
// MappingConfig represents the root configuration containing multiple mapping lists.
//
// String and numeric settings left at their zero value are filled in by
// ApplyDefaults; most settings can also be overridden through
// KORAL_MAPPER_* environment variables (see ApplyEnvOverrides).
type MappingConfig struct {
	SDK          string        `yaml:"sdk,omitempty"`
	Stylesheet   string        `yaml:"stylesheet,omitempty"`
	Server       string        `yaml:"server,omitempty"`
	ServiceURL   string        `yaml:"serviceURL,omitempty"`
	CookieName   string        `yaml:"cookieName,omitempty"`
	BasePath     string        `yaml:"basePath,omitempty"`     // restricts config file loading to this directory tree
	AllowOrigins string        `yaml:"allowOrigins,omitempty"` // comma-separated list of allowed CORS origins
	Port         int           `yaml:"port,omitempty"`
	LogLevel     string        `yaml:"loglevel,omitempty"`
	RateLimit    int           `yaml:"rateLimit,omitempty"` // max requests per minute per IP (0 = use default 100)
	Rewrites     bool          `yaml:"rewrites,omitempty"`  // global default for koral:rewrite annotations
	Lists        []MappingList `yaml:"lists,omitempty"`
}
119
// AllowedBasePath restricts file loading to a specific directory tree.
// When set, all file paths must resolve to a location at or below this
// directory (or under the system temp directory). When empty, no
// confinement is applied by sanitizeFilePath.
//
// Defaults to the CWD at application startup; can be overridden via the
// "basePath" YAML config field or the KORAL_MAPPER_BASE_PATH environment
// variable. In Docker (WORKDIR /), the default "/" naturally allows all
// paths.
// NOTE(review): this file only reads the variable; the assignment described
// above happens elsewhere — confirm against the application entry point.
var AllowedBasePath string
127
// isWithinDir checks whether absPath is at or below the given directory.
//
// The comparison is purely lexical and separator-aware, so sibling
// directories sharing a name prefix do not match (e.g. /home/username is
// not within /home/user). Both arguments are expected to be cleaned,
// absolute paths.
//
// Trailing separators on dir are ignored, so "/srv/" behaves like "/srv".
// A dir that is "/" or consists only of separators is treated as the
// filesystem root and contains every path. An empty dir contains nothing.
func isWithinDir(absPath, dir string) bool {
	if dir == "" {
		// An unset directory must never act as a wildcard.
		return false
	}

	sep := string(filepath.Separator)
	root := strings.TrimRight(dir, sep)
	if dir == "/" || root == "" {
		return true
	}

	return absPath == root || strings.HasPrefix(absPath, root+sep)
}
137
138// sanitizeFilePath cleans a file path, resolves it to an absolute path, and
139// (when AllowedBasePath is set) verifies it resides at or below the allowed
140// base directory or the system temp directory. This prevents path
141// traversal attacks by ensuring os.ReadFile never receives
142// unsanitized user input and cannot access files outside the application's
143// working tree.
144func sanitizeFilePath(path string) (string, error) {
145 if path == "" {
146 return "", fmt.Errorf("empty file path")
147 }
148
149 // Clean the path to remove redundant separators and resolve "." and ".."
150 cleaned := filepath.Clean(path)
151
152 // Convert to absolute path so all traversal is resolved against the CWD
153 absPath, err := filepath.Abs(cleaned)
154 if err != nil {
155 return "", fmt.Errorf("failed to resolve absolute path for '%s': %w", path, err)
156 }
157
158 // If a base path is configured, confine access to that tree or temp dir
159 if AllowedBasePath != "" {
160 base := filepath.Clean(AllowedBasePath)
161 tmpDir := filepath.Clean(os.TempDir())
162
163 if !isWithinDir(absPath, base) && !isWithinDir(absPath, tmpDir) {
164 return "", fmt.Errorf(
165 "path traversal detected: '%s' resolves to '%s' which is outside the allowed base '%s'",
166 path, absPath, base)
167 }
168 }
169
170 return absPath, nil
171}
172
Akrone1cff7c2025-06-04 18:43:32 +0200173// LoadFromSources loads configuration from multiple sources and merges them:
174// - A main configuration file (optional) containing global settings and lists
175// - Individual mapping files (optional) containing single mapping lists each
176// At least one source must be provided
177func LoadFromSources(configFile string, mappingFiles []string) (*MappingConfig, error) {
178 var allLists []MappingList
179 var globalConfig MappingConfig
Akron57ee5582025-05-21 15:25:13 +0200180
Akrone1cff7c2025-06-04 18:43:32 +0200181 // Track seen IDs across all sources to detect duplicates
182 seenIDs := make(map[string]bool)
Akrona5d88142025-05-22 14:42:09 +0200183
Akrone1cff7c2025-06-04 18:43:32 +0200184 // Load main configuration file if provided
185 if configFile != "" {
Akroned787d02026-05-20 12:31:07 +0200186 safePath, err := sanitizeFilePath(configFile)
187 if err != nil {
188 return nil, err
189 }
190 data, err := os.ReadFile(safePath) // #nosec G304 -- path sanitized above
Akrone1cff7c2025-06-04 18:43:32 +0200191 if err != nil {
192 return nil, fmt.Errorf("failed to read config file '%s': %w", configFile, err)
Akron06d21f02025-06-04 14:36:07 +0200193 }
Akrone1cff7c2025-06-04 18:43:32 +0200194
195 if len(data) == 0 {
196 return nil, fmt.Errorf("EOF: config file '%s' is empty", configFile)
197 }
198
199 // Try to unmarshal as new format first (object with optional sdk/server and lists)
Akron813780f2025-06-05 15:44:28 +0200200 if err := yaml.Unmarshal(data, &globalConfig); err == nil {
201 // Successfully parsed as new format - accept it regardless of whether it has lists
Akrone1cff7c2025-06-04 18:43:32 +0200202 for _, list := range globalConfig.Lists {
203 if seenIDs[list.ID] {
204 return nil, fmt.Errorf("duplicate mapping list ID found: %s", list.ID)
205 }
206 seenIDs[list.ID] = true
207 }
208 allLists = append(allLists, globalConfig.Lists...)
209 } else {
210 // Fall back to old format (direct list)
211 var lists []MappingList
212 if err := yaml.Unmarshal(data, &lists); err != nil {
213 return nil, fmt.Errorf("failed to parse YAML config file '%s': %w", configFile, err)
214 }
215
216 for _, list := range lists {
217 if seenIDs[list.ID] {
218 return nil, fmt.Errorf("duplicate mapping list ID found: %s", list.ID)
219 }
220 seenIDs[list.ID] = true
221 }
222 allLists = append(allLists, lists...)
223 // Clear the lists from globalConfig since we got them from the old format
224 globalConfig.Lists = nil
225 }
Akron06d21f02025-06-04 14:36:07 +0200226 }
227
Akrone1cff7c2025-06-04 18:43:32 +0200228 // Load individual mapping files
229 for _, file := range mappingFiles {
Akroned787d02026-05-20 12:31:07 +0200230 safePath, err := sanitizeFilePath(file)
231 if err != nil {
232 return nil, err
233 }
234 data, err := os.ReadFile(safePath) // #nosec G304 -- path sanitized above
Akrone1cff7c2025-06-04 18:43:32 +0200235 if err != nil {
Akron7e8da932025-07-01 11:56:46 +0200236 log.Error().Err(err).Str("file", file).Msg("Failed to read mapping file")
237 continue
Akrone1cff7c2025-06-04 18:43:32 +0200238 }
239
240 if len(data) == 0 {
Akron7e8da932025-07-01 11:56:46 +0200241 log.Error().Err(err).Str("file", file).Msg("EOF: mapping file is empty")
242 continue
Akrone1cff7c2025-06-04 18:43:32 +0200243 }
244
245 var list MappingList
246 if err := yaml.Unmarshal(data, &list); err != nil {
Akron7e8da932025-07-01 11:56:46 +0200247 log.Error().Err(err).Str("file", file).Msg("Failed to parse YAML mapping file")
248 continue
Akrone1cff7c2025-06-04 18:43:32 +0200249 }
250
251 if seenIDs[list.ID] {
Akron7e8da932025-07-01 11:56:46 +0200252 log.Error().Err(err).Str("file", file).Str("list-id", list.ID).Msg("Duplicate mapping list ID found")
253 continue
Akrone1cff7c2025-06-04 18:43:32 +0200254 }
255 seenIDs[list.ID] = true
256 allLists = append(allLists, list)
Akron57ee5582025-05-21 15:25:13 +0200257 }
258
Akrone1cff7c2025-06-04 18:43:32 +0200259 // Ensure we have at least some configuration
260 if len(allLists) == 0 {
261 return nil, fmt.Errorf("no mapping lists found: provide either a config file (-c) with lists or mapping files (-m)")
262 }
263
Akron585f50f2025-07-03 13:55:47 +0200264 // Validate all mapping lists (skip duplicate ID check since we already did it)
Akrone1cff7c2025-06-04 18:43:32 +0200265 if err := validateMappingLists(allLists); err != nil {
Akron06d21f02025-06-04 14:36:07 +0200266 return nil, err
267 }
268
Akrone1cff7c2025-06-04 18:43:32 +0200269 // Create final configuration
270 result := &MappingConfig{
Akronf1ca8822026-05-20 15:44:00 +0200271 SDK: globalConfig.SDK,
272 Stylesheet: globalConfig.Stylesheet,
273 Server: globalConfig.Server,
274 ServiceURL: globalConfig.ServiceURL,
275 BasePath: globalConfig.BasePath,
276 AllowOrigins: globalConfig.AllowOrigins,
277 Port: globalConfig.Port,
278 LogLevel: globalConfig.LogLevel,
279 RateLimit: globalConfig.RateLimit,
Akronf7bba072026-05-21 12:36:19 +0200280 Rewrites: globalConfig.Rewrites,
Akronf1ca8822026-05-20 15:44:00 +0200281 Lists: allLists,
Akrone1cff7c2025-06-04 18:43:32 +0200282 }
283
Akronf98ba282026-02-24 11:13:30 +0100284 // Apply environment variable overrides (ENV > config file)
285 ApplyEnvOverrides(result)
286
Akron06d21f02025-06-04 14:36:07 +0200287 // Apply defaults if not specified
Akron2ac2ec02025-06-05 15:26:42 +0200288 ApplyDefaults(result)
Akrone1cff7c2025-06-04 18:43:32 +0200289
290 return result, nil
291}
292
Akron585f50f2025-07-03 13:55:47 +0200293// ApplyDefaults sets default values for configuration fields if they are empty
Akron2ac2ec02025-06-05 15:26:42 +0200294func ApplyDefaults(config *MappingConfig) {
Akron585f50f2025-07-03 13:55:47 +0200295 defaults := map[*string]string{
296 &config.SDK: defaultSDK,
Akron43fb1022026-02-20 11:38:49 +0100297 &config.Stylesheet: defaultStylesheet,
Akron585f50f2025-07-03 13:55:47 +0200298 &config.Server: defaultServer,
299 &config.ServiceURL: defaultServiceURL,
Akron43fb1022026-02-20 11:38:49 +0100300 &config.CookieName: defaultCookieName,
Akron585f50f2025-07-03 13:55:47 +0200301 &config.LogLevel: defaultLogLevel,
Akron06d21f02025-06-04 14:36:07 +0200302 }
Akron585f50f2025-07-03 13:55:47 +0200303
304 for field, defaultValue := range defaults {
305 if *field == "" {
306 *field = defaultValue
307 }
Akron06d21f02025-06-04 14:36:07 +0200308 }
Akron585f50f2025-07-03 13:55:47 +0200309
Akrondaca3142026-05-21 13:00:45 +0200310 // AllowOrigins defaults to the Server value. This avoids duplicating
311 // the server URL string and keeps CORS in sync with the deployment.
Akronf1ca8822026-05-20 15:44:00 +0200312 if config.AllowOrigins == "" {
Akrondaca3142026-05-21 13:00:45 +0200313 config.AllowOrigins = config.Server
Akronf1ca8822026-05-20 15:44:00 +0200314 }
Akrondaca3142026-05-21 13:00:45 +0200315 config.AllowOrigins = normalizeOrigins(config.AllowOrigins)
Akronf1ca8822026-05-20 15:44:00 +0200316
Akrona8a66ce2025-06-05 10:50:17 +0200317 if config.Port == 0 {
318 config.Port = defaultPort
319 }
Akrone6767de2026-05-20 10:06:24 +0200320 if config.RateLimit == 0 {
321 config.RateLimit = defaultRateLimit
322 }
Akron06d21f02025-06-04 14:36:07 +0200323}
324
// normalizeOrigins takes a comma-separated list of origin URLs and strips
// any path components, returning only scheme + host (+ port when present).
// The CORS middleware requires bare origins without paths; URLs like
// "https://example.com/instance/test" are pruned to "https://example.com".
//
// Entries are trimmed of surrounding whitespace. Entries that do not parse
// as a URL with a host (for example "*") are kept, minus trailing slashes.
// Entries that are empty after normalization, such as those produced by
// stray or trailing commas, are dropped so the result never contains an
// empty origin. The result is joined with "," without spaces.
func normalizeOrigins(raw string) string {
	parts := strings.Split(raw, ",")
	origins := make([]string, 0, len(parts))

	for _, part := range parts {
		part = strings.TrimSpace(part)
		if part == "" {
			continue
		}

		if u, err := url.Parse(part); err == nil && u.Host != "" {
			part = u.Scheme + "://" + u.Host
		} else {
			part = strings.TrimRight(part, "/")
		}

		if part != "" {
			origins = append(origins, part)
		}
	}

	return strings.Join(origins, ",")
}
341
Akronf98ba282026-02-24 11:13:30 +0100342// ApplyEnvOverrides overrides configuration fields from environment variables.
343// All environment variables are uppercase and prefixed with KORAL_MAPPER_.
344// Non-empty environment values override any previously loaded config values.
345func ApplyEnvOverrides(config *MappingConfig) {
346 envMappings := map[string]*string{
Akronf1ca8822026-05-20 15:44:00 +0200347 "KORAL_MAPPER_SERVER": &config.Server,
348 "KORAL_MAPPER_SDK": &config.SDK,
349 "KORAL_MAPPER_STYLESHEET": &config.Stylesheet,
350 "KORAL_MAPPER_SERVICE_URL": &config.ServiceURL,
351 "KORAL_MAPPER_COOKIE_NAME": &config.CookieName,
352 "KORAL_MAPPER_LOG_LEVEL": &config.LogLevel,
353 "KORAL_MAPPER_BASE_PATH": &config.BasePath,
354 "KORAL_MAPPER_ALLOW_ORIGINS": &config.AllowOrigins,
Akronf98ba282026-02-24 11:13:30 +0100355 }
356
357 for envKey, field := range envMappings {
358 if val := os.Getenv(envKey); val != "" {
359 *field = val
360 }
361 }
362
363 if val := os.Getenv("KORAL_MAPPER_PORT"); val != "" {
364 if port, err := strconv.Atoi(val); err == nil {
365 config.Port = port
366 }
367 }
Akrone6767de2026-05-20 10:06:24 +0200368
369 if val := os.Getenv("KORAL_MAPPER_RATE_LIMIT"); val != "" {
370 if rl, err := strconv.Atoi(val); err == nil {
371 config.RateLimit = rl
372 }
373 }
Akronf7bba072026-05-21 12:36:19 +0200374
375 if val := os.Getenv("KORAL_MAPPER_REWRITES"); val != "" {
376 config.Rewrites = val == "true"
377 }
Akronf98ba282026-02-24 11:13:30 +0100378}
379
Akron585f50f2025-07-03 13:55:47 +0200380// validateMappingLists validates a slice of mapping lists (without duplicate ID checking)
Akron06d21f02025-06-04 14:36:07 +0200381func validateMappingLists(lists []MappingList) error {
Akron57ee5582025-05-21 15:25:13 +0200382 for i, list := range lists {
383 if list.ID == "" {
Akron06d21f02025-06-04 14:36:07 +0200384 return fmt.Errorf("mapping list at index %d is missing an ID", i)
Akron57ee5582025-05-21 15:25:13 +0200385 }
Akrona5d88142025-05-22 14:42:09 +0200386
Akron57ee5582025-05-21 15:25:13 +0200387 if len(list.Mappings) == 0 {
Akron06d21f02025-06-04 14:36:07 +0200388 return fmt.Errorf("mapping list '%s' has no mapping rules", list.ID)
Akron57ee5582025-05-21 15:25:13 +0200389 }
390
391 // Validate each mapping rule
392 for j, rule := range list.Mappings {
393 if rule == "" {
Akron06d21f02025-06-04 14:36:07 +0200394 return fmt.Errorf("mapping list '%s' rule at index %d is empty", list.ID, j)
Akron57ee5582025-05-21 15:25:13 +0200395 }
396 }
397 }
Akron06d21f02025-06-04 14:36:07 +0200398 return nil
Akron57ee5582025-05-21 15:25:13 +0200399}
400
401// ParseMappings parses all mapping rules in a list and returns a slice of parsed rules
402func (list *MappingList) ParseMappings() ([]*parser.MappingResult, error) {
403 // Create a grammar parser with the list's default foundries and layers
404 grammarParser, err := parser.NewGrammarParser("", "")
405 if err != nil {
406 return nil, fmt.Errorf("failed to create grammar parser: %w", err)
407 }
408
409 results := make([]*parser.MappingResult, len(list.Mappings))
410 for i, rule := range list.Mappings {
Akrona5d88142025-05-22 14:42:09 +0200411 // Check for empty rules first
412 if rule == "" {
413 return nil, fmt.Errorf("empty mapping rule at index %d in list '%s'", i, list.ID)
414 }
415
Akron57ee5582025-05-21 15:25:13 +0200416 // Parse the mapping rule
417 result, err := grammarParser.ParseMapping(string(rule))
418 if err != nil {
419 return nil, fmt.Errorf("failed to parse mapping rule %d in list '%s': %w", i, list.ID, err)
420 }
421
422 // Apply default foundries and layers if not specified in the rule
423 if list.FoundryA != "" {
424 applyDefaultFoundryAndLayer(result.Upper.Wrap, list.FoundryA, list.LayerA)
425 }
426 if list.FoundryB != "" {
427 applyDefaultFoundryAndLayer(result.Lower.Wrap, list.FoundryB, list.LayerB)
428 }
429
430 results[i] = result
431 }
432
433 return results, nil
434}
435
436// applyDefaultFoundryAndLayer recursively applies default foundry and layer to terms that don't have them specified
437func applyDefaultFoundryAndLayer(node ast.Node, defaultFoundry, defaultLayer string) {
438 switch n := node.(type) {
439 case *ast.Term:
Akron585f50f2025-07-03 13:55:47 +0200440 if n.Foundry == "" && defaultFoundry != "" {
Akron57ee5582025-05-21 15:25:13 +0200441 n.Foundry = defaultFoundry
442 }
Akron585f50f2025-07-03 13:55:47 +0200443 if n.Layer == "" && defaultLayer != "" {
Akron57ee5582025-05-21 15:25:13 +0200444 n.Layer = defaultLayer
445 }
446 case *ast.TermGroup:
447 for _, op := range n.Operands {
448 applyDefaultFoundryAndLayer(op, defaultFoundry, defaultLayer)
449 }
450 }
451}