package validation

import (
	"fmt"
	"net/url"
	"regexp"
	"slices"
	"strings"

	"github.com/korap/korap-mcp/service"
	"github.com/rs/zerolog"
)

15// Validator provides input validation and response schema validation
16type Validator struct {
17 logger zerolog.Logger
18}
19
20// New creates a new validator instance
21func New(logger zerolog.Logger) *Validator {
22 return &Validator{
23 logger: logger.With().Str("component", "validator").Logger(),
24 }
25}
26
// SearchRequest holds the parameters of a search request that are checked by
// ValidateSearchRequest.
type SearchRequest struct {
	// Query is the search query. It is required: blank values are rejected.
	Query string `json:"query"`
	// QueryLanguage optionally names the query language. An empty value
	// skips the language check.
	QueryLanguage string `json:"query_language,omitempty"`
	// Corpus is an optional corpus identifier or collection query. An empty
	// value skips the corpus check.
	Corpus string `json:"corpus,omitempty"`
	// Count is the requested number of results. Accepted values are 0 to
	// 10000, where 0 means "use the default".
	Count int `json:"count,omitempty"`
}

// MetadataRequest holds the parameters of a metadata request that are checked
// by ValidateMetadataRequest.
type MetadataRequest struct {
	// Action selects the metadata operation. It is required and must be one
	// of the keys of validMetadataActions.
	Action string `json:"action"`
	// Corpus is an optional corpus identifier or collection query. An empty
	// value skips the corpus check.
	Corpus string `json:"corpus,omitempty"`
}

// ValidationError describes a single failed check: which field was checked,
// the offending value rendered as a string, and a human-readable reason.
type ValidationError struct {
	Field   string `json:"field"`
	Value   string `json:"value"`
	Message string `json:"message"`
}

// Error implements the error interface. The text has the form
// "validation error for field '<field>' (value: '<value>'): <message>".
func (e ValidationError) Error() string {
	return fmt.Sprintf("validation error for field '%s' (value: '%s'): %s", e.Field, e.Value, e.Message)
}

52// ValidationErrors represents multiple validation errors
53type ValidationErrors struct {
54 Errors []ValidationError `json:"errors"`
55}
56
57func (e ValidationErrors) Error() string {
58 if len(e.Errors) == 0 {
59 return "validation errors occurred"
60 }
61 var messages []string
62 for _, err := range e.Errors {
63 messages = append(messages, err.Error())
64 }
65 return strings.Join(messages, "; ")
66}
67
// Lookup tables and patterns shared by the validation methods.
var (
	// validQueryLanguages lists the accepted values of the query_language
	// request parameter. The comparison against it is case-sensitive.
	validQueryLanguages = []string{"poliqarp", "poliqarpplus", "cosmas2", "annis", "cql", "cqp", "fcsql"}

	// corpusIDRegex is the character whitelist for corpus identifiers and
	// collection queries: ASCII letters and digits, dot, underscore, hyphen,
	// whitespace, the operator characters & | ! = < >, parentheses, slash,
	// asterisk, and single and double quotes. The whole string must consist
	// of at least one such character.
	corpusIDRegex = regexp.MustCompile(`^[a-zA-Z0-9._\-\s&|!=<>()/*"']+$`)

	// validMetadataActions is the set of accepted action values for
	// metadata requests.
	validMetadataActions = map[string]bool{
		"list":       true,
		"statistics": true,
	}
)

83// ValidateSearchRequest validates a search request
84func (v *Validator) ValidateSearchRequest(req SearchRequest) error {
85 var errors []ValidationError
86
87 // Validate query - required and non-empty
88 if strings.TrimSpace(req.Query) == "" {
89 errors = append(errors, ValidationError{
90 Field: "query",
91 Value: req.Query,
92 Message: "query is required and cannot be empty",
93 })
94 } else {
95 // Basic query validation - check for potentially dangerous patterns
96 if err := v.validateQuerySafety(req.Query); err != nil {
97 errors = append(errors, ValidationError{
98 Field: "query",
99 Value: req.Query,
100 Message: err.Error(),
101 })
102 }
103 }
104
105 // Validate query language if provided
Akron708f3912025-06-17 12:26:02 +0200106 if req.QueryLanguage != "" && !slices.Contains(validQueryLanguages, req.QueryLanguage) {
Akron81f709c2025-06-12 17:30:55 +0200107 errors = append(errors, ValidationError{
108 Field: "query_language",
109 Value: req.QueryLanguage,
Akron8db31c32025-06-17 12:22:41 +0200110 Message: fmt.Sprintf("invalid query language, must be one of: %s", strings.Join(validQueryLanguages, ", ")),
Akron81f709c2025-06-12 17:30:55 +0200111 })
112 }
113
114 // Validate corpus if provided
115 if req.Corpus != "" {
116 if err := v.validateCorpusID(req.Corpus); err != nil {
117 errors = append(errors, ValidationError{
118 Field: "corpus",
119 Value: req.Corpus,
120 Message: err.Error(),
121 })
122 }
123 }
124
125 // Validate count if provided (0 means use default, so only validate non-zero values)
126 if req.Count < 0 || req.Count > 10000 {
127 errors = append(errors, ValidationError{
128 Field: "count",
129 Value: fmt.Sprintf("%d", req.Count),
130 Message: "count must be between 0 and 10000 (0 means use default)",
131 })
132 }
133
134 if len(errors) > 0 {
135 v.logger.Warn().Interface("errors", errors).Msg("Search request validation failed")
136 return ValidationErrors{Errors: errors}
137 }
138
139 v.logger.Debug().Interface("request", req).Msg("Search request validation passed")
140 return nil
141}
142
143// ValidateMetadataRequest validates a metadata request
144func (v *Validator) ValidateMetadataRequest(req MetadataRequest) error {
145 var errors []ValidationError
146
147 // Validate action - required
148 if strings.TrimSpace(req.Action) == "" {
149 errors = append(errors, ValidationError{
150 Field: "action",
151 Value: req.Action,
152 Message: "action is required and cannot be empty",
153 })
154 } else if !validMetadataActions[req.Action] {
155 var validActions []string
156 for action := range validMetadataActions {
157 validActions = append(validActions, action)
158 }
159 errors = append(errors, ValidationError{
160 Field: "action",
161 Value: req.Action,
162 Message: fmt.Sprintf("invalid action, must be one of: %s", strings.Join(validActions, ", ")),
163 })
164 }
165
166 // Validate corpus if provided
167 if req.Corpus != "" {
168 if err := v.validateCorpusID(req.Corpus); err != nil {
169 errors = append(errors, ValidationError{
170 Field: "corpus",
171 Value: req.Corpus,
172 Message: err.Error(),
173 })
174 }
175 }
176
177 if len(errors) > 0 {
178 v.logger.Warn().Interface("errors", errors).Msg("Metadata request validation failed")
179 return ValidationErrors{Errors: errors}
180 }
181
182 v.logger.Debug().Interface("request", req).Msg("Metadata request validation passed")
183 return nil
184}
185
186// ValidateSearchResponse validates a search response
187func (v *Validator) ValidateSearchResponse(resp *service.SearchResponse) error {
188 if resp == nil {
189 return fmt.Errorf("search response is nil")
190 }
191
192 var errors []ValidationError
193
194 // Validate meta structure
195 if resp.Meta.TotalResults < 0 {
196 errors = append(errors, ValidationError{
197 Field: "meta.totalResults",
198 Value: fmt.Sprintf("%d", resp.Meta.TotalResults),
199 Message: "totalResults cannot be negative",
200 })
201 }
202
203 if resp.Meta.Count < 0 {
204 errors = append(errors, ValidationError{
205 Field: "meta.count",
206 Value: fmt.Sprintf("%d", resp.Meta.Count),
207 Message: "count cannot be negative",
208 })
209 }
210
211 if resp.Meta.StartIndex < 0 {
212 errors = append(errors, ValidationError{
213 Field: "meta.startIndex",
214 Value: fmt.Sprintf("%d", resp.Meta.StartIndex),
215 Message: "startIndex cannot be negative",
216 })
217 }
218
219 if resp.Meta.ItemsPerPage < 0 {
220 errors = append(errors, ValidationError{
221 Field: "meta.itemsPerPage",
222 Value: fmt.Sprintf("%d", resp.Meta.ItemsPerPage),
223 Message: "itemsPerPage cannot be negative",
224 })
225 }
226
227 // Validate matches if present
228 if resp.Matches != nil {
229 for i, match := range resp.Matches {
230 if match.MatchID == "" {
231 errors = append(errors, ValidationError{
232 Field: fmt.Sprintf("matches[%d].matchID", i),
233 Value: "",
234 Message: "match ID is required",
235 })
236 }
237
238 if match.TextSigle == "" {
239 errors = append(errors, ValidationError{
240 Field: fmt.Sprintf("matches[%d].textSigle", i),
241 Value: "",
242 Message: "textSigle is required",
243 })
244 }
245
246 if match.Position < 0 {
247 errors = append(errors, ValidationError{
248 Field: fmt.Sprintf("matches[%d].position", i),
249 Value: fmt.Sprintf("%d", match.Position),
250 Message: "position cannot be negative",
251 })
252 }
253 }
254 }
255
256 if len(errors) > 0 {
257 v.logger.Warn().Interface("errors", errors).Msg("Search response validation failed")
258 return ValidationErrors{Errors: errors}
259 }
260
261 v.logger.Debug().Msg("Search response validation passed")
262 return nil
263}
264
265// ValidateCorpusListResponse validates a corpus list response
266func (v *Validator) ValidateCorpusListResponse(resp *service.CorpusListResponse) error {
267 if resp == nil {
268 return fmt.Errorf("corpus list response is nil")
269 }
270
271 var errors []ValidationError
272
273 // Validate corpus entries
274 if resp.Corpora != nil {
275 for i, corpus := range resp.Corpora {
276 if corpus.ID == "" {
277 errors = append(errors, ValidationError{
278 Field: fmt.Sprintf("corpora[%d].id", i),
279 Value: "",
280 Message: "corpus ID is required",
281 })
282 } else if err := v.validateCorpusID(corpus.ID); err != nil {
283 errors = append(errors, ValidationError{
284 Field: fmt.Sprintf("corpora[%d].id", i),
285 Value: corpus.ID,
286 Message: err.Error(),
287 })
288 }
289
290 if corpus.Name == "" {
291 errors = append(errors, ValidationError{
292 Field: fmt.Sprintf("corpora[%d].name", i),
293 Value: "",
294 Message: "corpus name is required",
295 })
296 }
297
298 if corpus.Documents < 0 {
299 errors = append(errors, ValidationError{
300 Field: fmt.Sprintf("corpora[%d].documents", i),
301 Value: fmt.Sprintf("%d", corpus.Documents),
302 Message: "document count cannot be negative",
303 })
304 }
305
306 if corpus.Tokens < 0 {
307 errors = append(errors, ValidationError{
308 Field: fmt.Sprintf("corpora[%d].tokens", i),
309 Value: fmt.Sprintf("%d", corpus.Tokens),
310 Message: "token count cannot be negative",
311 })
312 }
313 }
314 }
315
316 if len(errors) > 0 {
317 v.logger.Warn().Interface("errors", errors).Msg("Corpus list response validation failed")
318 return ValidationErrors{Errors: errors}
319 }
320
321 v.logger.Debug().Msg("Corpus list response validation passed")
322 return nil
323}
324
325// ValidateStatisticsResponse validates a statistics response
326func (v *Validator) ValidateStatisticsResponse(resp *service.StatisticsResponse) error {
327 if resp == nil {
328 return fmt.Errorf("statistics response is nil")
329 }
330
331 var errors []ValidationError
332
333 if resp.Documents < 0 {
334 errors = append(errors, ValidationError{
335 Field: "documents",
336 Value: fmt.Sprintf("%d", resp.Documents),
337 Message: "document count cannot be negative",
338 })
339 }
340
341 if resp.Tokens < 0 {
342 errors = append(errors, ValidationError{
343 Field: "tokens",
344 Value: fmt.Sprintf("%d", resp.Tokens),
345 Message: "token count cannot be negative",
346 })
347 }
348
349 if resp.Sentences < 0 {
350 errors = append(errors, ValidationError{
351 Field: "sentences",
352 Value: fmt.Sprintf("%d", resp.Sentences),
353 Message: "sentence count cannot be negative",
354 })
355 }
356
357 if resp.Paragraphs < 0 {
358 errors = append(errors, ValidationError{
359 Field: "paragraphs",
360 Value: fmt.Sprintf("%d", resp.Paragraphs),
361 Message: "paragraph count cannot be negative",
362 })
363 }
364
365 if len(errors) > 0 {
366 v.logger.Warn().Interface("errors", errors).Msg("Statistics response validation failed")
367 return ValidationErrors{Errors: errors}
368 }
369
370 v.logger.Debug().Msg("Statistics response validation passed")
371 return nil
372}
373
374// validateQuerySafety performs basic security validation on queries
375func (v *Validator) validateQuerySafety(query string) error {
376 // Check for extremely long queries that could cause DoS
377 if len(query) > 10000 {
378 return fmt.Errorf("query is too long (max 10000 characters)")
379 }
380
381 // Check for potentially dangerous URL patterns
382 if strings.Contains(query, "://") {
383 if _, err := url.Parse(query); err == nil {
384 return fmt.Errorf("query appears to contain a URL which is not allowed")
385 }
386 }
387
388 // Check for excessive nesting that could cause parser issues
389 openParens := strings.Count(query, "(")
390 closeParens := strings.Count(query, ")")
391 if openParens != closeParens {
392 return fmt.Errorf("unmatched parentheses in query")
393 }
394 if openParens > 100 {
395 return fmt.Errorf("query has too many nested levels (max 100)")
396 }
397
398 return nil
399}
400
Akron8db31c32025-06-17 12:22:41 +0200401// validateCorpusID validates a corpus identifier or collection query
402// This supports both simple corpus sigles (e.g., "DeReKo-2023-I") and complex
403// collection queries with metadata fields (e.g., "textClass = \"politics\" & pubDate in 2020")
Akron81f709c2025-06-12 17:30:55 +0200404func (v *Validator) validateCorpusID(corpusID string) error {
405 if len(corpusID) == 0 {
406 return fmt.Errorf("corpus ID cannot be empty")
407 }
408
409 if len(corpusID) > 100 {
410 return fmt.Errorf("corpus ID is too long (max 100 characters)")
411 }
412
413 if !corpusIDRegex.MatchString(corpusID) {
Akron8db31c32025-06-17 12:22:41 +0200414 return fmt.Errorf("collection query contains invalid characters (supports alphanumeric, dots, hyphens, underscores, spaces, quotes, operators & | ! = < > in, parentheses, and regex /pattern/)")
Akron81f709c2025-06-12 17:30:55 +0200415 }
416
417 return nil
418}
419
420// SanitizeQuery performs basic sanitization on search queries
421func (v *Validator) SanitizeQuery(query string) string {
422 // Trim whitespace
423 sanitized := strings.TrimSpace(query)
424
425 // Remove any null bytes
426 sanitized = strings.ReplaceAll(sanitized, "\x00", "")
427
428 // Normalize whitespace
429 sanitized = regexp.MustCompile(`\s+`).ReplaceAllString(sanitized, " ")
430
431 v.logger.Debug().
432 Str("original", query).
433 Str("sanitized", sanitized).
434 Msg("Query sanitized")
435
436 return sanitized
437}
438
439// SanitizeCorpusID performs basic sanitization on corpus IDs
440func (v *Validator) SanitizeCorpusID(corpusID string) string {
441 // Trim whitespace
442 sanitized := strings.TrimSpace(corpusID)
443
444 // Remove any null bytes
445 sanitized = strings.ReplaceAll(sanitized, "\x00", "")
446
447 // Convert to lowercase for consistency
448 sanitized = strings.ToLower(sanitized)
449
450 v.logger.Debug().
451 Str("original", corpusID).
452 Str("sanitized", sanitized).
453 Msg("Corpus ID sanitized")
454
455 return sanitized
456}