Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 1 | package tools |
| 2 | |
| 3 | import ( |
| 4 | "context" |
| 5 | "fmt" |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 6 | "strings" |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 7 | |
| 8 | "github.com/korap/korap-mcp/service" |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 9 | "github.com/korap/korap-mcp/validation" |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 10 | "github.com/mark3labs/mcp-go/mcp" |
| 11 | "github.com/rs/zerolog/log" |
| 12 | ) |
| 13 | |
| 14 | // SearchTool implements the Tool interface for KorAP corpus search |
| 15 | type SearchTool struct { |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 16 | client *service.Client |
| 17 | validator *validation.Validator |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 18 | } |
| 19 | |
| 20 | // NewSearchTool creates a new search tool instance |
| 21 | func NewSearchTool(client *service.Client) *SearchTool { |
| 22 | return &SearchTool{ |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 23 | client: client, |
| 24 | validator: validation.New(log.Logger), |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 25 | } |
| 26 | } |
| 27 | |
| 28 | // Name returns the tool name |
| 29 | func (s *SearchTool) Name() string { |
| 30 | return "korap_search" |
| 31 | } |
| 32 | |
| 33 | // Description returns the tool description |
| 34 | func (s *SearchTool) Description() string { |
| 35 | return "Search for words or phrases in KorAP corpora using various query languages" |
| 36 | } |
| 37 | |
| 38 | // InputSchema returns the JSON schema for tool parameters |
Akron | 708f391 | 2025-06-17 12:26:02 +0200 | [diff] [blame^] | 39 | func (s *SearchTool) InputSchema() map[string]any { |
| 40 | return map[string]any{ |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 41 | "type": "object", |
Akron | 708f391 | 2025-06-17 12:26:02 +0200 | [diff] [blame^] | 42 | "properties": map[string]any{ |
| 43 | "query": map[string]any{ |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 44 | "type": "string", |
Akron | 8db31c3 | 2025-06-17 12:22:41 +0200 | [diff] [blame] | 45 | "description": "The search query. Supports different query languages like Poliqarp, CosmasII, or Annis depending on the selected query_language parameter.", |
| 46 | "minLength": 1, |
| 47 | "maxLength": 1000, |
| 48 | "examples": []string{"Haus", "[pos=NN]", "der /w1:5 Mann"}, |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 49 | }, |
Akron | 708f391 | 2025-06-17 12:26:02 +0200 | [diff] [blame^] | 50 | "query_language": map[string]any{ |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 51 | "type": "string", |
Akron | 8db31c3 | 2025-06-17 12:22:41 +0200 | [diff] [blame] | 52 | "description": "Query language to use for parsing the search query. Supported languages: 'poliqarp' (default; extended Poliqarp QL), 'cosmas2' (corpus query syntax of COSMAS II), 'annis' (multi-layer annotation queries), 'cql' (corpus query language), 'cqp' (Corpus Query Processor syntax), 'fcsql' (Federated Content Search queries).", |
| 53 | "enum": []string{"poliqarp", "cosmas2", "annis", "cql", "cqp", "fcsql"}, |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 54 | "default": "poliqarp", |
Akron | 8db31c3 | 2025-06-17 12:22:41 +0200 | [diff] [blame] | 55 | "examples": []string{"poliqarp", "cosmas2", "annis", "cql", "cqp", "fcsql"}, |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 56 | }, |
Akron | 708f391 | 2025-06-17 12:26:02 +0200 | [diff] [blame^] | 57 | "corpus": map[string]any{ |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 58 | "type": "string", |
Akron | 8db31c3 | 2025-06-17 12:22:41 +0200 | [diff] [blame] | 59 | "description": "Virtual corpus query to filter search results based on metadata fields. Supports boolean operations (& | !), comparison operators (= != < > in), and regular expressions (/pattern/). Use metadata fields like corpusSigle, textClass, pubDate, textType, availability, etc. When not provided, searches all available data accessible to the user.", |
| 60 | "pattern": "^[a-zA-Z0-9._\\-\\s&|!=<>()/*\"']+$", |
| 61 | "examples": []string{"corpusSigle = \"GOE\"", "textClass = \"politics\" & pubDate in 2020", "textType = \"news\" | textType = \"blog\"", "availability = /CC.*/ & textClass != \"fiction\""}, |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 62 | }, |
Akron | 708f391 | 2025-06-17 12:26:02 +0200 | [diff] [blame^] | 63 | "count": map[string]any{ |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 64 | "type": "integer", |
Akron | 8db31c3 | 2025-06-17 12:22:41 +0200 | [diff] [blame] | 65 | "description": "Maximum number of search results to return. Higher values may increase response time. Use smaller values for faster responses when doing exploratory searches.", |
| 66 | "minimum": 0, |
| 67 | "maximum": 10000, |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 68 | "default": 25, |
Akron | 708f391 | 2025-06-17 12:26:02 +0200 | [diff] [blame^] | 69 | "examples": []any{10, 25, 50, 100}, |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 70 | }, |
| 71 | }, |
Akron | 8db31c3 | 2025-06-17 12:22:41 +0200 | [diff] [blame] | 72 | "required": []string{"query"}, |
| 73 | "additionalProperties": false, |
| 74 | "title": "KorAP Search Parameters", |
| 75 | "description": "Parameters for searching text corpora using KorAP's powerful query languages and filtering capabilities.", |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 76 | } |
| 77 | } |
| 78 | |
| 79 | // Execute performs the search operation |
| 80 | func (s *SearchTool) Execute(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { |
| 81 | log.Debug(). |
| 82 | Str("tool", s.Name()). |
| 83 | Msg("Executing search tool") |
| 84 | |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 85 | // Extract required query parameter |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 86 | query, err := request.RequireString("query") |
| 87 | if err != nil { |
| 88 | return nil, fmt.Errorf("query parameter is required: %w", err) |
| 89 | } |
| 90 | |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 91 | // Extract optional parameters with defaults |
| 92 | queryLang := request.GetString("query_language", "poliqarp") |
| 93 | corpus := request.GetString("corpus", "") |
| 94 | count := request.GetInt("count", 25) |
| 95 | |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 96 | // Validate the search request using the validation package |
| 97 | searchReq := validation.SearchRequest{ |
| 98 | Query: query, |
| 99 | QueryLanguage: queryLang, |
| 100 | Corpus: corpus, |
| 101 | Count: count, |
| 102 | } |
| 103 | |
| 104 | if err := s.validator.ValidateSearchRequest(searchReq); err != nil { |
| 105 | log.Warn(). |
| 106 | Err(err). |
| 107 | Interface("request", searchReq). |
| 108 | Msg("Search request validation failed") |
| 109 | return nil, fmt.Errorf("invalid search request: %w", err) |
| 110 | } |
| 111 | |
| 112 | // Sanitize inputs |
| 113 | query = s.validator.SanitizeQuery(query) |
| 114 | if corpus != "" { |
| 115 | corpus = s.validator.SanitizeCorpusID(corpus) |
| 116 | } |
| 117 | |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 118 | log.Debug(). |
| 119 | Str("query", query). |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 120 | Str("query_language", queryLang). |
| 121 | Str("corpus", corpus). |
| 122 | Int("count", count). |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 123 | Msg("Parsed and validated search parameters") |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 124 | |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 125 | // Check if client is available and authenticated |
| 126 | if s.client == nil { |
| 127 | return nil, fmt.Errorf("KorAP client not configured") |
| 128 | } |
| 129 | |
| 130 | if !s.client.IsAuthenticated() { |
| 131 | log.Warn().Msg("Client not authenticated, attempting authentication") |
| 132 | if err := s.client.AuthenticateWithClientCredentials(ctx); err != nil { |
| 133 | return nil, fmt.Errorf("authentication failed: %w", err) |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | // Prepare search request |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 138 | korapSearchReq := service.SearchRequest{ |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 139 | Query: query, |
| 140 | QueryLang: queryLang, |
| 141 | Collection: corpus, |
| 142 | Count: count, |
| 143 | } |
| 144 | |
| 145 | // Perform the search |
| 146 | var searchResp service.SearchResponse |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 147 | err = s.client.PostJSON(ctx, "search", korapSearchReq, &searchResp) |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 148 | if err != nil { |
| 149 | log.Error(). |
| 150 | Err(err). |
| 151 | Str("query", query). |
| 152 | Msg("Search request failed") |
| 153 | return nil, fmt.Errorf("search failed: %w", err) |
| 154 | } |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 155 | |
Akron | 81f709c | 2025-06-12 17:30:55 +0200 | [diff] [blame] | 156 | // Validate the response |
| 157 | if err := s.validator.ValidateSearchResponse(&searchResp); err != nil { |
| 158 | log.Warn(). |
| 159 | Err(err). |
| 160 | Msg("Search response validation failed, but continuing with potentially invalid data") |
| 161 | // Continue processing despite validation errors to be resilient |
| 162 | } |
| 163 | |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 164 | log.Info(). |
| 165 | Str("query", query). |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 166 | Int("total_results", searchResp.Meta.TotalResults). |
| 167 | Int("returned_matches", len(searchResp.Matches)). |
| 168 | Float64("search_time", searchResp.Meta.SearchTime). |
| 169 | Msg("Search completed successfully") |
| 170 | |
| 171 | // Format the response |
| 172 | result := s.formatSearchResults(&searchResp) |
Akron | b1c71e6 | 2025-06-12 16:08:54 +0200 | [diff] [blame] | 173 | |
| 174 | return mcp.NewToolResultText(result), nil |
| 175 | } |
Akron | 8138c35 | 2025-06-12 16:34:42 +0200 | [diff] [blame] | 176 | |
| 177 | // formatSearchResults formats the search response into a readable text format |
| 178 | func (s *SearchTool) formatSearchResults(response *service.SearchResponse) string { |
| 179 | var result strings.Builder |
| 180 | |
| 181 | result.WriteString("KorAP Search Results\n") |
| 182 | result.WriteString("====================\n\n") |
| 183 | |
| 184 | // Query information |
| 185 | result.WriteString(fmt.Sprintf("Query: %s\n", response.Query.Query)) |
| 186 | if response.Query.QueryLang != "" { |
| 187 | result.WriteString(fmt.Sprintf("Query Language: %s\n", response.Query.QueryLang)) |
| 188 | } |
| 189 | if response.Query.Collection != "" { |
| 190 | result.WriteString(fmt.Sprintf("Corpus: %s\n", response.Query.Collection)) |
| 191 | } |
| 192 | result.WriteString("\n") |
| 193 | |
| 194 | // Result statistics |
| 195 | result.WriteString("Results Summary:\n") |
| 196 | result.WriteString(fmt.Sprintf(" Total Results: %d\n", response.Meta.TotalResults)) |
| 197 | result.WriteString(fmt.Sprintf(" Shown: %d-%d\n", |
| 198 | response.Meta.StartIndex+1, |
| 199 | response.Meta.StartIndex+len(response.Matches))) |
| 200 | if response.Meta.SearchTime > 0 { |
| 201 | result.WriteString(fmt.Sprintf(" Search Time: %.3f seconds\n", response.Meta.SearchTime)) |
| 202 | } |
| 203 | result.WriteString("\n") |
| 204 | |
| 205 | // Individual matches |
| 206 | if len(response.Matches) > 0 { |
| 207 | result.WriteString("Matches:\n") |
| 208 | result.WriteString("--------\n") |
| 209 | |
| 210 | for i, match := range response.Matches { |
| 211 | result.WriteString(fmt.Sprintf("\n%d. Text: %s\n", i+1, match.TextSigle)) |
| 212 | if match.Snippet != "" { |
| 213 | result.WriteString(fmt.Sprintf(" Snippet: %s\n", match.Snippet)) |
| 214 | } |
| 215 | if match.PubPlace != "" { |
| 216 | result.WriteString(fmt.Sprintf(" Publication: %s\n", match.PubPlace)) |
| 217 | } |
| 218 | if match.MatchID != "" { |
| 219 | result.WriteString(fmt.Sprintf(" Match ID: %s\n", match.MatchID)) |
| 220 | } |
| 221 | result.WriteString(fmt.Sprintf(" Position: %d\n", match.Position)) |
| 222 | } |
| 223 | } else { |
| 224 | result.WriteString("No matches found.\n") |
| 225 | } |
| 226 | |
| 227 | // Additional information |
| 228 | if response.Query.CutOff { |
| 229 | result.WriteString("\nNote: Results were cut off due to limits.\n") |
| 230 | } |
| 231 | if response.Query.TimeExceeded { |
| 232 | result.WriteString("\nNote: Search time limit was exceeded.\n") |
| 233 | } |
| 234 | |
| 235 | return result.String() |
| 236 | } |