blob: 46eb2217b38fcf785133d6031e54fec5770660d7 [file] [log] [blame]
Akronbd154ea2025-06-12 17:01:58 +02001package tools
2
import (
	"context"
	"fmt"
	"net/url"
	"sort"
	"strings"

	"github.com/korap/korap-mcp/service"
	"github.com/korap/korap-mcp/validation"
	"github.com/mark3labs/mcp-go/mcp"
	"github.com/rs/zerolog/log"
)
13
14// MetadataTool implements the Tool interface for KorAP corpus metadata retrieval
15type MetadataTool struct {
Akron81f709c2025-06-12 17:30:55 +020016 client *service.Client
17 validator *validation.Validator
Akronbd154ea2025-06-12 17:01:58 +020018}
19
20// NewMetadataTool creates a new metadata tool instance
21func NewMetadataTool(client *service.Client) *MetadataTool {
22 return &MetadataTool{
Akron81f709c2025-06-12 17:30:55 +020023 client: client,
24 validator: validation.New(log.Logger),
Akronbd154ea2025-06-12 17:01:58 +020025 }
26}
27
28// Name returns the tool name
29func (m *MetadataTool) Name() string {
30 return "korap_metadata"
31}
32
33// Description returns the tool description
34func (m *MetadataTool) Description() string {
35 return "Retrieve metadata and statistics for KorAP corpora"
36}
37
38// InputSchema returns the JSON schema for tool parameters
39func (m *MetadataTool) InputSchema() map[string]interface{} {
40 return map[string]interface{}{
41 "type": "object",
42 "properties": map[string]interface{}{
43 "action": map[string]interface{}{
44 "type": "string",
Akron8db31c32025-06-17 12:22:41 +020045 "description": "Type of metadata operation to perform. 'list' retrieves all available corpora with their basic information, 'statistics' provides detailed corpus statistics.",
Akronbd154ea2025-06-12 17:01:58 +020046 "enum": []string{"list", "statistics"},
47 "default": "list",
Akron8db31c32025-06-17 12:22:41 +020048 "examples": []string{"list", "statistics"},
Akronbd154ea2025-06-12 17:01:58 +020049 },
50 "corpus": map[string]interface{}{
51 "type": "string",
Akron8db31c32025-06-17 12:22:41 +020052 "description": "Virtual corpus query to filter results based on metadata fields. For 'list' action, this parameter is ignored. For 'statistics' action, specifies which subset of data to analyze using metadata queries with boolean operations (& | !), comparison operators (= != < > in), and regular expressions (/pattern/). When not provided with 'statistics', returns statistics for all accessible data.",
53 "pattern": "^[a-zA-Z0-9._\\-\\s&|!=<>()/*\"']+$",
54 "examples": []string{"corpusSigle = \"GOE\"", "textClass = \"politics\" & pubDate in 2020", "textType = \"news\" | textType = \"blog\"", "availability = /CC.*/ & textClass != \"fiction\""},
Akronbd154ea2025-06-12 17:01:58 +020055 },
56 },
Akron8db31c32025-06-17 12:22:41 +020057 "required": []string{"action"},
58 "additionalProperties": false,
59 "title": "KorAP Metadata Parameters",
60 "description": "Parameters for retrieving corpus metadata and statistics from KorAP, including corpus lists and detailed statistical information.",
Akronbd154ea2025-06-12 17:01:58 +020061 }
62}
63
64// Execute performs the metadata retrieval operation
65func (m *MetadataTool) Execute(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
66 log.Debug().
67 Str("tool", m.Name()).
68 Msg("Executing metadata tool")
69
70 // Extract required action parameter
71 action, err := request.RequireString("action")
72 if err != nil {
73 return nil, fmt.Errorf("action parameter is required: %w", err)
74 }
75
76 // Extract optional corpus parameter
77 corpus := request.GetString("corpus", "")
78
Akron81f709c2025-06-12 17:30:55 +020079 // Validate the metadata request using the validation package
80 metadataReq := validation.MetadataRequest{
81 Action: action,
82 Corpus: corpus,
83 }
84
85 if err := m.validator.ValidateMetadataRequest(metadataReq); err != nil {
86 log.Warn().
87 Err(err).
88 Interface("request", metadataReq).
89 Msg("Metadata request validation failed")
90 return nil, fmt.Errorf("invalid metadata request: %w", err)
91 }
92
93 // Sanitize inputs
94 if corpus != "" {
95 corpus = m.validator.SanitizeCorpusID(corpus)
96 }
97
Akronbd154ea2025-06-12 17:01:58 +020098 log.Debug().
99 Str("action", action).
100 Str("corpus", corpus).
Akron81f709c2025-06-12 17:30:55 +0200101 Msg("Parsed and validated metadata parameters")
Akronbd154ea2025-06-12 17:01:58 +0200102
103 // Check if client is available and authenticated
104 if m.client == nil {
105 return nil, fmt.Errorf("KorAP client not configured")
106 }
107
108 if !m.client.IsAuthenticated() {
109 log.Warn().Msg("Client not authenticated, attempting authentication")
110 if err := m.client.AuthenticateWithClientCredentials(ctx); err != nil {
111 return nil, fmt.Errorf("authentication failed: %w", err)
112 }
113 }
114
115 // Handle different actions
116 switch action {
117 case "list":
118 return m.handleListCorpora(ctx)
119 case "statistics":
120 return m.handleCorpusStatistics(ctx, corpus)
121 default:
122 // This should never be reached due to validation above
123 return nil, fmt.Errorf("unknown action: %s", action)
124 }
125}
126
127// handleListCorpora retrieves and formats the list of available corpora
128func (m *MetadataTool) handleListCorpora(ctx context.Context) (*mcp.CallToolResult, error) {
129 log.Debug().Msg("Retrieving corpus list")
130
131 var corpusListResp service.CorpusListResponse
132 err := m.client.GetJSON(ctx, "corpus", &corpusListResp)
133 if err != nil {
134 log.Error().
135 Err(err).
136 Msg("Failed to retrieve corpus list")
137 return nil, fmt.Errorf("failed to retrieve corpus list: %w", err)
138 }
139
Akron81f709c2025-06-12 17:30:55 +0200140 // Validate the response
141 if err := m.validator.ValidateCorpusListResponse(&corpusListResp); err != nil {
142 log.Warn().
143 Err(err).
144 Msg("Corpus list response validation failed, but continuing with potentially invalid data")
145 // Continue processing despite validation errors to be resilient
146 }
147
Akronbd154ea2025-06-12 17:01:58 +0200148 log.Info().
149 Int("corpus_count", len(corpusListResp.Corpora)).
150 Msg("Corpus list retrieved successfully")
151
152 result := m.formatCorpusList(&corpusListResp)
153 return mcp.NewToolResultText(result), nil
154}
155
156// handleCorpusStatistics retrieves and formats statistics for a corpus query
157func (m *MetadataTool) handleCorpusStatistics(ctx context.Context, corpus string) (*mcp.CallToolResult, error) {
158 log.Debug().
159 Str("corpus", corpus).
160 Msg("Retrieving corpus statistics")
161
162 var statsResp service.StatisticsResponse
163 var endpoint string
164 if corpus == "" {
165 endpoint = "statistics"
166 } else {
167 endpoint = fmt.Sprintf("statistics?corpusQuery=%s", corpus)
168 }
169
170 err := m.client.GetJSON(ctx, endpoint, &statsResp)
171 if err != nil {
172 log.Error().
173 Err(err).
174 Str("corpus", corpus).
175 Msg("Failed to retrieve corpus statistics")
176 return nil, fmt.Errorf("failed to retrieve corpus statistics: %w", err)
177 }
178
Akron81f709c2025-06-12 17:30:55 +0200179 // Validate the response
180 if err := m.validator.ValidateStatisticsResponse(&statsResp); err != nil {
181 log.Warn().
182 Err(err).
183 Msg("Statistics response validation failed, but continuing with potentially invalid data")
184 // Continue processing despite validation errors to be resilient
185 }
186
Akronbd154ea2025-06-12 17:01:58 +0200187 log.Info().
188 Str("corpus", corpus).
189 Int("documents", statsResp.Documents).
190 Int("tokens", statsResp.Tokens).
191 Msg("Corpus statistics retrieved successfully")
192
193 result := m.formatCorpusStatistics(corpus, &statsResp)
194 return mcp.NewToolResultText(result), nil
195}
196
197// formatCorpusList formats the corpus list response into a readable text format
198func (m *MetadataTool) formatCorpusList(response *service.CorpusListResponse) string {
199 var result strings.Builder
200
201 result.WriteString("KorAP Available Corpora\n")
202 result.WriteString("=======================\n\n")
203
204 if len(response.Corpora) == 0 {
205 result.WriteString("No corpora available.\n")
206 return result.String()
207 }
208
209 result.WriteString(fmt.Sprintf("Total Corpora: %d\n\n", len(response.Corpora)))
210
211 for i, corpus := range response.Corpora {
212 result.WriteString(fmt.Sprintf("%d. %s\n", i+1, corpus.Name))
213 result.WriteString(fmt.Sprintf(" ID: %s\n", corpus.ID))
214
215 if corpus.Description != "" {
216 result.WriteString(fmt.Sprintf(" Description: %s\n", corpus.Description))
217 }
218
219 if corpus.Documents > 0 {
220 result.WriteString(fmt.Sprintf(" Documents: %d\n", corpus.Documents))
221 }
222
223 if corpus.Tokens > 0 {
224 result.WriteString(fmt.Sprintf(" Tokens: %d\n", corpus.Tokens))
225 }
226
227 if corpus.Sentences > 0 {
228 result.WriteString(fmt.Sprintf(" Sentences: %d\n", corpus.Sentences))
229 }
230
231 if corpus.Paragraphs > 0 {
232 result.WriteString(fmt.Sprintf(" Paragraphs: %d\n", corpus.Paragraphs))
233 }
234
235 result.WriteString("\n")
236 }
237
238 return result.String()
239}
240
241// formatCorpusStatistics formats the corpus statistics response into a readable text format
242func (m *MetadataTool) formatCorpusStatistics(corpus string, response *service.StatisticsResponse) string {
243 var result strings.Builder
244
245 result.WriteString("KorAP Corpus Statistics\n")
246 result.WriteString("=======================\n\n")
247
248 if corpus == "" {
249 result.WriteString("Corpus Query: (all available data)\n\n")
250 } else {
251 result.WriteString(fmt.Sprintf("Corpus Query: %s\n\n", corpus))
252 }
253
254 result.WriteString("Statistics:\n")
255 result.WriteString("-----------\n")
256 result.WriteString(fmt.Sprintf("Documents: %d\n", response.Documents))
257 result.WriteString(fmt.Sprintf("Tokens: %d\n", response.Tokens))
258
259 if response.Sentences > 0 {
260 result.WriteString(fmt.Sprintf("Sentences: %d\n", response.Sentences))
261 }
262
263 if response.Paragraphs > 0 {
264 result.WriteString(fmt.Sprintf("Paragraphs: %d\n", response.Paragraphs))
265 }
266
267 // Add any additional fields if present
268 if len(response.Fields) > 0 {
269 result.WriteString("\nAdditional Fields:\n")
270 result.WriteString("------------------\n")
271 for key, value := range response.Fields {
272 result.WriteString(fmt.Sprintf("%s: %v\n", key, value))
273 }
274 }
275
276 return result.String()
277}