blob: d6857089bd724a34f34b1eb303bd6af4f03b75c5 [file] [log] [blame]
Akron57ee5582025-05-21 15:25:13 +02001package config
2
3import (
4 "fmt"
5 "os"
Akroned787d02026-05-20 12:31:07 +02006 "path/filepath"
Akronf98ba282026-02-24 11:13:30 +01007 "strconv"
Akroned787d02026-05-20 12:31:07 +02008 "strings"
Akron57ee5582025-05-21 15:25:13 +02009
Akron2ef703c2025-07-03 15:57:42 +020010 "github.com/KorAP/Koral-Mapper/ast"
11 "github.com/KorAP/Koral-Mapper/parser"
Akron7e8da932025-07-01 11:56:46 +020012 "github.com/rs/zerolog/log"
Akron57ee5582025-05-21 15:25:13 +020013 "gopkg.in/yaml.v3"
14)
15
// Built-in fallback values. ApplyDefaults writes them into a MappingConfig
// for every setting that is still at its zero value after the configuration
// file and the environment have been evaluated.
const (
	defaultServer     = "https://korap.ids-mannheim.de/"
	defaultSDK        = "https://korap.ids-mannheim.de/js/korap-plugin-latest.js"
	defaultStylesheet = "https://korap.ids-mannheim.de/css/kalamar-plugin-latest.css"
	defaultServiceURL = "https://korap.ids-mannheim.de/plugin/koralmapper"
	defaultCookieName = "km-config"
	defaultPort       = 5725
	defaultLogLevel   = "warn"
	defaultRateLimit  = 100
)
26
// MappingRule is the textual form of one mapping rule exactly as it appears
// in the "mappings" sequence of a configuration file. It is parsed later by
// MappingList.ParseMappings or MappingList.ParseCorpusMappings.
type MappingRule string
29
30// MappingList represents a list of mapping rules with metadata
31type MappingList struct {
Akrondab27112025-06-05 13:52:43 +020032 ID string `yaml:"id"`
Akron2f93c582026-02-19 16:49:13 +010033 Type string `yaml:"type,omitempty"` // "annotation" (default) or "corpus"
Akrondab27112025-06-05 13:52:43 +020034 Description string `yaml:"desc,omitempty"`
35 FoundryA string `yaml:"foundryA,omitempty"`
36 LayerA string `yaml:"layerA,omitempty"`
37 FoundryB string `yaml:"foundryB,omitempty"`
38 LayerB string `yaml:"layerB,omitempty"`
Akrona67de8f2026-02-23 17:54:26 +010039 FieldA string `yaml:"fieldA,omitempty"`
40 FieldB string `yaml:"fieldB,omitempty"`
Akronf7bba072026-05-21 12:36:19 +020041 Rewrites *bool `yaml:"rewrites,omitempty"`
Akrondab27112025-06-05 13:52:43 +020042 Mappings []MappingRule `yaml:"mappings"`
Akron57ee5582025-05-21 15:25:13 +020043}
44
Akron2f93c582026-02-19 16:49:13 +010045// IsCorpus returns true if the mapping list type is "corpus".
46func (list *MappingList) IsCorpus() bool {
47 return list.Type == "corpus"
48}
49
Akronf7bba072026-05-21 12:36:19 +020050// EffectiveRewrites returns the resolved rewrites setting for this list.
51// If the list has an explicit per-list override, it is used; otherwise the
52// global default is returned.
53func (list *MappingList) EffectiveRewrites(globalDefault bool) bool {
54 if list.Rewrites != nil {
55 return *list.Rewrites
56 }
57 return globalDefault
58}
59
Akron2f93c582026-02-19 16:49:13 +010060// ParseCorpusMappings parses all mapping rules as corpus rules.
Akrona67de8f2026-02-23 17:54:26 +010061// Bare values (without key=) are always allowed and receive the default
62// field name from the mapping list header (FieldA/FieldB) when set.
Akron2f93c582026-02-19 16:49:13 +010063func (list *MappingList) ParseCorpusMappings() ([]*parser.CorpusMappingResult, error) {
64 corpusParser := parser.NewCorpusParser()
Akrona67de8f2026-02-23 17:54:26 +010065 corpusParser.AllowBareValues = true
66
Akron2f93c582026-02-19 16:49:13 +010067 results := make([]*parser.CorpusMappingResult, len(list.Mappings))
68 for i, rule := range list.Mappings {
69 if rule == "" {
70 return nil, fmt.Errorf("empty corpus mapping rule at index %d in list '%s'", i, list.ID)
71 }
72 result, err := corpusParser.ParseMapping(string(rule))
73 if err != nil {
74 return nil, fmt.Errorf("failed to parse corpus mapping rule %d in list '%s': %w", i, list.ID, err)
75 }
Akrona67de8f2026-02-23 17:54:26 +010076
77 if list.FieldA != "" {
78 applyDefaultCorpusKey(result.Upper, list.FieldA)
79 }
80 if list.FieldB != "" {
81 applyDefaultCorpusKey(result.Lower, list.FieldB)
82 }
83
Akron2f93c582026-02-19 16:49:13 +010084 results[i] = result
85 }
86 return results, nil
87}
88
Akrona67de8f2026-02-23 17:54:26 +010089// applyDefaultCorpusKey recursively fills in empty keys on CorpusField nodes.
90func applyDefaultCorpusKey(node parser.CorpusNode, defaultKey string) {
91 switch n := node.(type) {
92 case *parser.CorpusField:
93 if n.Key == "" {
94 n.Key = defaultKey
95 }
96 case *parser.CorpusGroup:
97 for _, op := range n.Operands {
98 applyDefaultCorpusKey(op, defaultKey)
99 }
100 }
101}
102
Akron06d21f02025-06-04 14:36:07 +0200103// MappingConfig represents the root configuration containing multiple mapping lists
104type MappingConfig struct {
Akronf1ca8822026-05-20 15:44:00 +0200105 SDK string `yaml:"sdk,omitempty"`
106 Stylesheet string `yaml:"stylesheet,omitempty"`
107 Server string `yaml:"server,omitempty"`
108 ServiceURL string `yaml:"serviceURL,omitempty"`
109 CookieName string `yaml:"cookieName,omitempty"`
110 BasePath string `yaml:"basePath,omitempty"` // restricts config file loading to this directory tree
111 AllowOrigins string `yaml:"allowOrigins,omitempty"` // comma-separated list of allowed CORS origins
112 Port int `yaml:"port,omitempty"`
113 LogLevel string `yaml:"loglevel,omitempty"`
114 RateLimit int `yaml:"rateLimit,omitempty"` // max requests per minute per IP (0 = use default 100)
Akronf7bba072026-05-21 12:36:19 +0200115 Rewrites bool `yaml:"rewrites,omitempty"` // global default for koral:rewrite annotations
Akronf1ca8822026-05-20 15:44:00 +0200116 Lists []MappingList `yaml:"lists,omitempty"`
Akron57ee5582025-05-21 15:25:13 +0200117}
118
// AllowedBasePath confines file loading to one directory tree. While it is
// non-empty, sanitizeFilePath only accepts paths located at or below this
// directory or below the system temp directory; while it is empty, no
// confinement is applied.
//
// NOTE(review): nothing in this file assigns the variable. The previous
// comment stated that it defaults to the working directory at application
// startup, that it can be overridden through the "basePath" YAML field or the
// KORAL_MAPPER_BASE_PATH environment variable, and that in Docker
// (WORKDIR /) the resulting "/" allows all paths — confirm against the
// startup code.
var AllowedBasePath string
126
// isWithinDir reports whether absPath is located at or below dir.
//
// The comparison is purely textual, so both arguments are expected to be
// cleaned absolute paths. Children are matched against dir followed by
// exactly one path separator, which prevents prefix false positives
// (e.g. /home/username is not inside /home/user).
//
// A dir that already ends in a separator — an un-cleaned "/tmp/" or a
// Windows volume root such as `C:\` — is handled without doubling the
// separator, so its children and the directory itself still match. The
// Unix root "/" contains every path.
func isWithinDir(absPath, dir string) bool {
	if dir == "/" {
		return true
	}

	sep := string(filepath.Separator)

	// prefix is dir with exactly one trailing separator.
	prefix := dir
	if !strings.HasSuffix(prefix, sep) {
		prefix += sep
	}

	// absPath+sep == prefix covers "the directory itself" when dir was
	// given with a trailing separator but absPath without one.
	return absPath == dir || absPath+sep == prefix || strings.HasPrefix(absPath, prefix)
}
136
137// sanitizeFilePath cleans a file path, resolves it to an absolute path, and
138// (when AllowedBasePath is set) verifies it resides at or below the allowed
139// base directory or the system temp directory. This prevents path
140// traversal attacks by ensuring os.ReadFile never receives
141// unsanitized user input and cannot access files outside the application's
142// working tree.
143func sanitizeFilePath(path string) (string, error) {
144 if path == "" {
145 return "", fmt.Errorf("empty file path")
146 }
147
148 // Clean the path to remove redundant separators and resolve "." and ".."
149 cleaned := filepath.Clean(path)
150
151 // Convert to absolute path so all traversal is resolved against the CWD
152 absPath, err := filepath.Abs(cleaned)
153 if err != nil {
154 return "", fmt.Errorf("failed to resolve absolute path for '%s': %w", path, err)
155 }
156
157 // If a base path is configured, confine access to that tree or temp dir
158 if AllowedBasePath != "" {
159 base := filepath.Clean(AllowedBasePath)
160 tmpDir := filepath.Clean(os.TempDir())
161
162 if !isWithinDir(absPath, base) && !isWithinDir(absPath, tmpDir) {
163 return "", fmt.Errorf(
164 "path traversal detected: '%s' resolves to '%s' which is outside the allowed base '%s'",
165 path, absPath, base)
166 }
167 }
168
169 return absPath, nil
170}
171
Akrone1cff7c2025-06-04 18:43:32 +0200172// LoadFromSources loads configuration from multiple sources and merges them:
173// - A main configuration file (optional) containing global settings and lists
174// - Individual mapping files (optional) containing single mapping lists each
175// At least one source must be provided
176func LoadFromSources(configFile string, mappingFiles []string) (*MappingConfig, error) {
177 var allLists []MappingList
178 var globalConfig MappingConfig
Akron57ee5582025-05-21 15:25:13 +0200179
Akrone1cff7c2025-06-04 18:43:32 +0200180 // Track seen IDs across all sources to detect duplicates
181 seenIDs := make(map[string]bool)
Akrona5d88142025-05-22 14:42:09 +0200182
Akrone1cff7c2025-06-04 18:43:32 +0200183 // Load main configuration file if provided
184 if configFile != "" {
Akroned787d02026-05-20 12:31:07 +0200185 safePath, err := sanitizeFilePath(configFile)
186 if err != nil {
187 return nil, err
188 }
189 data, err := os.ReadFile(safePath) // #nosec G304 -- path sanitized above
Akrone1cff7c2025-06-04 18:43:32 +0200190 if err != nil {
191 return nil, fmt.Errorf("failed to read config file '%s': %w", configFile, err)
Akron06d21f02025-06-04 14:36:07 +0200192 }
Akrone1cff7c2025-06-04 18:43:32 +0200193
194 if len(data) == 0 {
195 return nil, fmt.Errorf("EOF: config file '%s' is empty", configFile)
196 }
197
198 // Try to unmarshal as new format first (object with optional sdk/server and lists)
Akron813780f2025-06-05 15:44:28 +0200199 if err := yaml.Unmarshal(data, &globalConfig); err == nil {
200 // Successfully parsed as new format - accept it regardless of whether it has lists
Akrone1cff7c2025-06-04 18:43:32 +0200201 for _, list := range globalConfig.Lists {
202 if seenIDs[list.ID] {
203 return nil, fmt.Errorf("duplicate mapping list ID found: %s", list.ID)
204 }
205 seenIDs[list.ID] = true
206 }
207 allLists = append(allLists, globalConfig.Lists...)
208 } else {
209 // Fall back to old format (direct list)
210 var lists []MappingList
211 if err := yaml.Unmarshal(data, &lists); err != nil {
212 return nil, fmt.Errorf("failed to parse YAML config file '%s': %w", configFile, err)
213 }
214
215 for _, list := range lists {
216 if seenIDs[list.ID] {
217 return nil, fmt.Errorf("duplicate mapping list ID found: %s", list.ID)
218 }
219 seenIDs[list.ID] = true
220 }
221 allLists = append(allLists, lists...)
222 // Clear the lists from globalConfig since we got them from the old format
223 globalConfig.Lists = nil
224 }
Akron06d21f02025-06-04 14:36:07 +0200225 }
226
Akrone1cff7c2025-06-04 18:43:32 +0200227 // Load individual mapping files
228 for _, file := range mappingFiles {
Akroned787d02026-05-20 12:31:07 +0200229 safePath, err := sanitizeFilePath(file)
230 if err != nil {
231 return nil, err
232 }
233 data, err := os.ReadFile(safePath) // #nosec G304 -- path sanitized above
Akrone1cff7c2025-06-04 18:43:32 +0200234 if err != nil {
Akron7e8da932025-07-01 11:56:46 +0200235 log.Error().Err(err).Str("file", file).Msg("Failed to read mapping file")
236 continue
Akrone1cff7c2025-06-04 18:43:32 +0200237 }
238
239 if len(data) == 0 {
Akron7e8da932025-07-01 11:56:46 +0200240 log.Error().Err(err).Str("file", file).Msg("EOF: mapping file is empty")
241 continue
Akrone1cff7c2025-06-04 18:43:32 +0200242 }
243
244 var list MappingList
245 if err := yaml.Unmarshal(data, &list); err != nil {
Akron7e8da932025-07-01 11:56:46 +0200246 log.Error().Err(err).Str("file", file).Msg("Failed to parse YAML mapping file")
247 continue
Akrone1cff7c2025-06-04 18:43:32 +0200248 }
249
250 if seenIDs[list.ID] {
Akron7e8da932025-07-01 11:56:46 +0200251 log.Error().Err(err).Str("file", file).Str("list-id", list.ID).Msg("Duplicate mapping list ID found")
252 continue
Akrone1cff7c2025-06-04 18:43:32 +0200253 }
254 seenIDs[list.ID] = true
255 allLists = append(allLists, list)
Akron57ee5582025-05-21 15:25:13 +0200256 }
257
Akrone1cff7c2025-06-04 18:43:32 +0200258 // Ensure we have at least some configuration
259 if len(allLists) == 0 {
260 return nil, fmt.Errorf("no mapping lists found: provide either a config file (-c) with lists or mapping files (-m)")
261 }
262
Akron585f50f2025-07-03 13:55:47 +0200263 // Validate all mapping lists (skip duplicate ID check since we already did it)
Akrone1cff7c2025-06-04 18:43:32 +0200264 if err := validateMappingLists(allLists); err != nil {
Akron06d21f02025-06-04 14:36:07 +0200265 return nil, err
266 }
267
Akrone1cff7c2025-06-04 18:43:32 +0200268 // Create final configuration
269 result := &MappingConfig{
Akronf1ca8822026-05-20 15:44:00 +0200270 SDK: globalConfig.SDK,
271 Stylesheet: globalConfig.Stylesheet,
272 Server: globalConfig.Server,
273 ServiceURL: globalConfig.ServiceURL,
274 BasePath: globalConfig.BasePath,
275 AllowOrigins: globalConfig.AllowOrigins,
276 Port: globalConfig.Port,
277 LogLevel: globalConfig.LogLevel,
278 RateLimit: globalConfig.RateLimit,
Akronf7bba072026-05-21 12:36:19 +0200279 Rewrites: globalConfig.Rewrites,
Akronf1ca8822026-05-20 15:44:00 +0200280 Lists: allLists,
Akrone1cff7c2025-06-04 18:43:32 +0200281 }
282
Akronf98ba282026-02-24 11:13:30 +0100283 // Apply environment variable overrides (ENV > config file)
284 ApplyEnvOverrides(result)
285
Akron06d21f02025-06-04 14:36:07 +0200286 // Apply defaults if not specified
Akron2ac2ec02025-06-05 15:26:42 +0200287 ApplyDefaults(result)
Akrone1cff7c2025-06-04 18:43:32 +0200288
289 return result, nil
290}
291
Akron585f50f2025-07-03 13:55:47 +0200292// ApplyDefaults sets default values for configuration fields if they are empty
Akron2ac2ec02025-06-05 15:26:42 +0200293func ApplyDefaults(config *MappingConfig) {
Akron585f50f2025-07-03 13:55:47 +0200294 defaults := map[*string]string{
295 &config.SDK: defaultSDK,
Akron43fb1022026-02-20 11:38:49 +0100296 &config.Stylesheet: defaultStylesheet,
Akron585f50f2025-07-03 13:55:47 +0200297 &config.Server: defaultServer,
298 &config.ServiceURL: defaultServiceURL,
Akron43fb1022026-02-20 11:38:49 +0100299 &config.CookieName: defaultCookieName,
Akron585f50f2025-07-03 13:55:47 +0200300 &config.LogLevel: defaultLogLevel,
Akron06d21f02025-06-04 14:36:07 +0200301 }
Akron585f50f2025-07-03 13:55:47 +0200302
303 for field, defaultValue := range defaults {
304 if *field == "" {
305 *field = defaultValue
306 }
Akron06d21f02025-06-04 14:36:07 +0200307 }
Akron585f50f2025-07-03 13:55:47 +0200308
Akronf1ca8822026-05-20 15:44:00 +0200309 // AllowOrigins defaults to the Server value (with trailing slash
310 // stripped to form a proper origin). This avoids duplicating the
311 // server URL string and keeps CORS in sync with the deployment.
312 if config.AllowOrigins == "" {
313 config.AllowOrigins = strings.TrimRight(config.Server, "/")
314 }
315
Akrona8a66ce2025-06-05 10:50:17 +0200316 if config.Port == 0 {
317 config.Port = defaultPort
318 }
Akrone6767de2026-05-20 10:06:24 +0200319 if config.RateLimit == 0 {
320 config.RateLimit = defaultRateLimit
321 }
Akron06d21f02025-06-04 14:36:07 +0200322}
323
Akronf98ba282026-02-24 11:13:30 +0100324// ApplyEnvOverrides overrides configuration fields from environment variables.
325// All environment variables are uppercase and prefixed with KORAL_MAPPER_.
326// Non-empty environment values override any previously loaded config values.
327func ApplyEnvOverrides(config *MappingConfig) {
328 envMappings := map[string]*string{
Akronf1ca8822026-05-20 15:44:00 +0200329 "KORAL_MAPPER_SERVER": &config.Server,
330 "KORAL_MAPPER_SDK": &config.SDK,
331 "KORAL_MAPPER_STYLESHEET": &config.Stylesheet,
332 "KORAL_MAPPER_SERVICE_URL": &config.ServiceURL,
333 "KORAL_MAPPER_COOKIE_NAME": &config.CookieName,
334 "KORAL_MAPPER_LOG_LEVEL": &config.LogLevel,
335 "KORAL_MAPPER_BASE_PATH": &config.BasePath,
336 "KORAL_MAPPER_ALLOW_ORIGINS": &config.AllowOrigins,
Akronf98ba282026-02-24 11:13:30 +0100337 }
338
339 for envKey, field := range envMappings {
340 if val := os.Getenv(envKey); val != "" {
341 *field = val
342 }
343 }
344
345 if val := os.Getenv("KORAL_MAPPER_PORT"); val != "" {
346 if port, err := strconv.Atoi(val); err == nil {
347 config.Port = port
348 }
349 }
Akrone6767de2026-05-20 10:06:24 +0200350
351 if val := os.Getenv("KORAL_MAPPER_RATE_LIMIT"); val != "" {
352 if rl, err := strconv.Atoi(val); err == nil {
353 config.RateLimit = rl
354 }
355 }
Akronf7bba072026-05-21 12:36:19 +0200356
357 if val := os.Getenv("KORAL_MAPPER_REWRITES"); val != "" {
358 config.Rewrites = val == "true"
359 }
Akronf98ba282026-02-24 11:13:30 +0100360}
361
Akron585f50f2025-07-03 13:55:47 +0200362// validateMappingLists validates a slice of mapping lists (without duplicate ID checking)
Akron06d21f02025-06-04 14:36:07 +0200363func validateMappingLists(lists []MappingList) error {
Akron57ee5582025-05-21 15:25:13 +0200364 for i, list := range lists {
365 if list.ID == "" {
Akron06d21f02025-06-04 14:36:07 +0200366 return fmt.Errorf("mapping list at index %d is missing an ID", i)
Akron57ee5582025-05-21 15:25:13 +0200367 }
Akrona5d88142025-05-22 14:42:09 +0200368
Akron57ee5582025-05-21 15:25:13 +0200369 if len(list.Mappings) == 0 {
Akron06d21f02025-06-04 14:36:07 +0200370 return fmt.Errorf("mapping list '%s' has no mapping rules", list.ID)
Akron57ee5582025-05-21 15:25:13 +0200371 }
372
373 // Validate each mapping rule
374 for j, rule := range list.Mappings {
375 if rule == "" {
Akron06d21f02025-06-04 14:36:07 +0200376 return fmt.Errorf("mapping list '%s' rule at index %d is empty", list.ID, j)
Akron57ee5582025-05-21 15:25:13 +0200377 }
378 }
379 }
Akron06d21f02025-06-04 14:36:07 +0200380 return nil
Akron57ee5582025-05-21 15:25:13 +0200381}
382
383// ParseMappings parses all mapping rules in a list and returns a slice of parsed rules
384func (list *MappingList) ParseMappings() ([]*parser.MappingResult, error) {
385 // Create a grammar parser with the list's default foundries and layers
386 grammarParser, err := parser.NewGrammarParser("", "")
387 if err != nil {
388 return nil, fmt.Errorf("failed to create grammar parser: %w", err)
389 }
390
391 results := make([]*parser.MappingResult, len(list.Mappings))
392 for i, rule := range list.Mappings {
Akrona5d88142025-05-22 14:42:09 +0200393 // Check for empty rules first
394 if rule == "" {
395 return nil, fmt.Errorf("empty mapping rule at index %d in list '%s'", i, list.ID)
396 }
397
Akron57ee5582025-05-21 15:25:13 +0200398 // Parse the mapping rule
399 result, err := grammarParser.ParseMapping(string(rule))
400 if err != nil {
401 return nil, fmt.Errorf("failed to parse mapping rule %d in list '%s': %w", i, list.ID, err)
402 }
403
404 // Apply default foundries and layers if not specified in the rule
405 if list.FoundryA != "" {
406 applyDefaultFoundryAndLayer(result.Upper.Wrap, list.FoundryA, list.LayerA)
407 }
408 if list.FoundryB != "" {
409 applyDefaultFoundryAndLayer(result.Lower.Wrap, list.FoundryB, list.LayerB)
410 }
411
412 results[i] = result
413 }
414
415 return results, nil
416}
417
418// applyDefaultFoundryAndLayer recursively applies default foundry and layer to terms that don't have them specified
419func applyDefaultFoundryAndLayer(node ast.Node, defaultFoundry, defaultLayer string) {
420 switch n := node.(type) {
421 case *ast.Term:
Akron585f50f2025-07-03 13:55:47 +0200422 if n.Foundry == "" && defaultFoundry != "" {
Akron57ee5582025-05-21 15:25:13 +0200423 n.Foundry = defaultFoundry
424 }
Akron585f50f2025-07-03 13:55:47 +0200425 if n.Layer == "" && defaultLayer != "" {
Akron57ee5582025-05-21 15:25:13 +0200426 n.Layer = defaultLayer
427 }
428 case *ast.TermGroup:
429 for _, op := range n.Operands {
430 applyDefaultFoundryAndLayer(op, defaultFoundry, defaultLayer)
431 }
432 }
433}