blob: a70bd39d904848bec5e0cee60f797d3e1268a6f6 [file] [log] [blame]
package tools
import (
	"context"
	"fmt"
	"net/url"
	"sort"
	"strings"

	"github.com/korap/korap-mcp/service"
	"github.com/korap/korap-mcp/validation"
	"github.com/mark3labs/mcp-go/mcp"
	"github.com/rs/zerolog/log"
)
// MetadataTool is the Tool implementation that exposes KorAP corpus metadata:
// the list of available corpora and statistics for a virtual corpus query.
type MetadataTool struct {
	// client performs authentication and JSON requests against KorAP.
	client *service.Client
	// validator checks incoming requests and service responses, and
	// sanitizes the corpus parameter.
	validator *validation.Validator
}
// NewMetadataTool builds a MetadataTool around the given KorAP client. The
// tool's validator is constructed here with the global zerolog logger.
func NewMetadataTool(client *service.Client) *MetadataTool {
	tool := new(MetadataTool)
	tool.client = client
	tool.validator = validation.New(log.Logger)
	return tool
}
// Name reports the identifier of this tool.
func (m *MetadataTool) Name() string {
	const toolName = "korap_metadata"
	return toolName
}
// Description reports a one-line, human-readable summary of what the tool does.
func (m *MetadataTool) Description() string {
	const summary = "Retrieve metadata and statistics for KorAP corpora"
	return summary
}
// InputSchema describes the tool's parameters as a JSON-schema-shaped map.
//
// Two string properties are declared: "action" (required, one of "list" or
// "statistics") and "corpus" (optional virtual corpus query). Properties other
// than these are disallowed by the schema.
func (m *MetadataTool) InputSchema() map[string]any {
	// Which metadata operation to run.
	actionProperty := map[string]any{
		"type":        "string",
		"description": "Type of metadata operation to perform. 'list' retrieves all available corpora with their basic information, 'statistics' provides detailed corpus statistics.",
		"enum":        []string{"list", "statistics"},
		"default":     "list",
		"examples":    []string{"list", "statistics"},
	}

	// Optional virtual corpus query restricting the data that is analyzed.
	corpusProperty := map[string]any{
		"type":        "string",
		"description": "Virtual corpus query to filter results based on metadata fields. For 'list' action, this parameter is ignored. For 'statistics' action, specifies which subset of data to analyze using metadata queries with boolean operations (& | !), comparison operators (= != < > in), and regular expressions (/pattern/). When not provided with 'statistics', returns statistics for all accessible data.",
		"pattern":     "^[a-zA-Z0-9._\\-\\s&|!=<>()/*\"']+$",
		"examples": []string{
			"corpusSigle = \"GOE\"",
			"textClass = \"politics\" & pubDate in 2020",
			"textType = \"news\" | textType = \"blog\"",
			"availability = /CC.*/ & textClass != \"fiction\"",
		},
	}

	schema := map[string]any{
		"type":        "object",
		"title":       "KorAP Metadata Parameters",
		"description": "Parameters for retrieving corpus metadata and statistics from KorAP, including corpus lists and detailed statistical information.",
		"properties": map[string]any{
			"action": actionProperty,
			"corpus": corpusProperty,
		},
		"required":             []string{"action"},
		"additionalProperties": false,
	}
	return schema
}
// Execute runs one metadata operation for the given tool call.
//
// The steps are, in order: read the required "action" and optional "corpus"
// arguments, validate them, sanitize a non-empty corpus value, make sure a
// client is configured and authenticated (authenticating with client
// credentials if it is not), and finally dispatch on the action.
//
// It returns an error when "action" is missing, validation fails, no client is
// configured, authentication fails, or the action is not recognized; otherwise
// it returns whatever the selected handler returns.
func (m *MetadataTool) Execute(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
	log.Debug().Str("tool", m.Name()).Msg("Executing metadata tool")

	// "action" is mandatory; "corpus" falls back to the empty string.
	action, actionErr := request.RequireString("action")
	if actionErr != nil {
		return nil, fmt.Errorf("action parameter is required: %w", actionErr)
	}
	corpus := request.GetString("corpus", "")

	// Validate the raw (unsanitized) values as one request.
	req := validation.MetadataRequest{Action: action, Corpus: corpus}
	if validationErr := m.validator.ValidateMetadataRequest(req); validationErr != nil {
		log.Warn().
			Err(validationErr).
			Interface("request", req).
			Msg("Metadata request validation failed")
		return nil, fmt.Errorf("invalid metadata request: %w", validationErr)
	}

	// Only a supplied corpus value is sanitized; the empty default is left alone.
	if len(corpus) > 0 {
		corpus = m.validator.SanitizeCorpusID(corpus)
	}

	log.Debug().
		Str("action", action).
		Str("corpus", corpus).
		Msg("Parsed and validated metadata parameters")

	// Nothing below can work without a client.
	if m.client == nil {
		return nil, fmt.Errorf("KorAP client not configured")
	}

	// Authenticate lazily on first use.
	if authenticated := m.client.IsAuthenticated(); !authenticated {
		log.Warn().Msg("Client not authenticated, attempting authentication")
		if authErr := m.client.AuthenticateWithClientCredentials(ctx); authErr != nil {
			return nil, fmt.Errorf("authentication failed: %w", authErr)
		}
	}

	if action == "list" {
		return m.handleListCorpora(ctx)
	}
	if action == "statistics" {
		return m.handleCorpusStatistics(ctx, corpus)
	}

	// Validation above is expected to reject every other action.
	return nil, fmt.Errorf("unknown action: %s", action)
}
// handleListCorpora fetches the corpus list from the "corpus" endpoint and
// renders it as a text tool result.
//
// It returns a wrapped error when the request fails. A response that fails
// validation is only logged as a warning and is still formatted and returned.
func (m *MetadataTool) handleListCorpora(ctx context.Context) (*mcp.CallToolResult, error) {
	log.Debug().Msg("Retrieving corpus list")

	var corpora service.CorpusListResponse
	if fetchErr := m.client.GetJSON(ctx, "corpus", &corpora); fetchErr != nil {
		log.Error().
			Err(fetchErr).
			Msg("Failed to retrieve corpus list")
		return nil, fmt.Errorf("failed to retrieve corpus list: %w", fetchErr)
	}

	// Deliberately best-effort: a questionable response is better than none.
	if checkErr := m.validator.ValidateCorpusListResponse(&corpora); checkErr != nil {
		log.Warn().
			Err(checkErr).
			Msg("Corpus list response validation failed, but continuing with potentially invalid data")
	}

	log.Info().
		Int("corpus_count", len(corpora.Corpora)).
		Msg("Corpus list retrieved successfully")

	return mcp.NewToolResultText(m.formatCorpusList(&corpora)), nil
}
// handleCorpusStatistics fetches statistics from the "statistics" endpoint and
// renders them as a text tool result.
//
// An empty corpus requests the endpoint without parameters. Otherwise the
// virtual corpus query is sent as the corpusQuery URL parameter. The query is
// URL-encoded first: corpus queries routinely contain '&', '=', '|', quotes
// and spaces, which would otherwise split or corrupt the query string.
//
// It returns a wrapped error when the request fails. A response that fails
// validation is only logged as a warning and is still formatted and returned.
func (m *MetadataTool) handleCorpusStatistics(ctx context.Context, corpus string) (*mcp.CallToolResult, error) {
	log.Debug().
		Str("corpus", corpus).
		Msg("Retrieving corpus statistics")

	endpoint := "statistics"
	if corpus != "" {
		params := url.Values{}
		params.Set("corpusQuery", corpus)
		endpoint += "?" + params.Encode()
	}

	var statsResp service.StatisticsResponse
	if err := m.client.GetJSON(ctx, endpoint, &statsResp); err != nil {
		log.Error().
			Err(err).
			Str("corpus", corpus).
			Msg("Failed to retrieve corpus statistics")
		return nil, fmt.Errorf("failed to retrieve corpus statistics: %w", err)
	}

	// Deliberately best-effort: a questionable response is better than none.
	if err := m.validator.ValidateStatisticsResponse(&statsResp); err != nil {
		log.Warn().
			Err(err).
			Msg("Statistics response validation failed, but continuing with potentially invalid data")
	}

	log.Info().
		Str("corpus", corpus).
		Int("documents", statsResp.Documents).
		Int("tokens", statsResp.Tokens).
		Msg("Corpus statistics retrieved successfully")

	// The unencoded query is passed on so the report shows it as the user wrote it.
	result := m.formatCorpusStatistics(corpus, &statsResp)
	return mcp.NewToolResultText(result), nil
}
// formatCorpusList renders a corpus list response as plain text.
//
// The output starts with a fixed header. An empty list yields a single
// "No corpora available." line; otherwise the total is printed followed by one
// numbered entry per corpus. Description and the document, token, sentence and
// paragraph counts are printed only when non-empty / greater than zero.
func (m *MetadataTool) formatCorpusList(response *service.CorpusListResponse) string {
	var out strings.Builder
	out.WriteString("KorAP Available Corpora\n")
	out.WriteString("=======================\n\n")

	total := len(response.Corpora)
	if total == 0 {
		out.WriteString("No corpora available.\n")
		return out.String()
	}

	fmt.Fprintf(&out, "Total Corpora: %d\n\n", total)
	for idx, entry := range response.Corpora {
		// Entries are numbered from 1 for readability.
		fmt.Fprintf(&out, "%d. %s\n", idx+1, entry.Name)
		fmt.Fprintf(&out, " ID: %s\n", entry.ID)
		if entry.Description != "" {
			fmt.Fprintf(&out, " Description: %s\n", entry.Description)
		}
		if entry.Documents > 0 {
			fmt.Fprintf(&out, " Documents: %d\n", entry.Documents)
		}
		if entry.Tokens > 0 {
			fmt.Fprintf(&out, " Tokens: %d\n", entry.Tokens)
		}
		if entry.Sentences > 0 {
			fmt.Fprintf(&out, " Sentences: %d\n", entry.Sentences)
		}
		if entry.Paragraphs > 0 {
			fmt.Fprintf(&out, " Paragraphs: %d\n", entry.Paragraphs)
		}
		out.WriteString("\n")
	}
	return out.String()
}
// formatCorpusStatistics renders a statistics response as plain text.
//
// corpus is the virtual corpus query the statistics were requested for; an
// empty value is shown as "(all available data)". Document and token counts
// are always printed, sentence and paragraph counts only when greater than
// zero. Any entries in response.Fields follow under "Additional Fields",
// ordered by key so that the same response always produces the same text
// (ranging over a Go map directly yields an unspecified order).
func (m *MetadataTool) formatCorpusStatistics(corpus string, response *service.StatisticsResponse) string {
	var result strings.Builder
	result.WriteString("KorAP Corpus Statistics\n")
	result.WriteString("=======================\n\n")

	if corpus == "" {
		result.WriteString("Corpus Query: (all available data)\n\n")
	} else {
		fmt.Fprintf(&result, "Corpus Query: %s\n\n", corpus)
	}

	result.WriteString("Statistics:\n")
	result.WriteString("-----------\n")
	fmt.Fprintf(&result, "Documents: %d\n", response.Documents)
	fmt.Fprintf(&result, "Tokens: %d\n", response.Tokens)
	if response.Sentences > 0 {
		fmt.Fprintf(&result, "Sentences: %d\n", response.Sentences)
	}
	if response.Paragraphs > 0 {
		fmt.Fprintf(&result, "Paragraphs: %d\n", response.Paragraphs)
	}

	// Add any additional fields if present, in sorted key order.
	if len(response.Fields) > 0 {
		result.WriteString("\nAdditional Fields:\n")
		result.WriteString("------------------\n")

		keys := make([]string, 0, len(response.Fields))
		for key := range response.Fields {
			keys = append(keys, key)
		}
		sort.Strings(keys)

		for _, key := range keys {
			fmt.Fprintf(&result, "%s: %v\n", key, response.Fields[key])
		}
	}
	return result.String()
}