Support metadata retrieval

commit: bd154ea40354e09aed17447c33398ed8a5eafedd [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Jun 12 17:01:58 2025 +0200
committer: Akron <nils@diewald-online.de> Thu Jun 12 17:01:58 2025 +0200
tree: cf45b4e3428a7ec62dad4ff1b64f234b16c02e8f
parent: 8138c3538c91266aefddd5e64174b7e6cb61b38b [diff]
diff --git a/tools/metadata.go b/tools/metadata.go
new file mode 100644
index 0000000..6fd84b1
--- /dev/null
+++ b/tools/metadata.go

@@ -0,0 +1,243 @@
+package tools
+
+import (
+	"context"
+	"fmt"
+	"net/url"
+	"sort"
+	"strings"
+
+	"github.com/korap/korap-mcp/service"
+	"github.com/mark3labs/mcp-go/mcp"
+	"github.com/rs/zerolog/log"
+)
+
+// MetadataTool implements the Tool interface for KorAP corpus metadata retrieval
+//
+// It serves two kinds of requests (see Execute): listing corpora and
+// retrieving statistics for an optional virtual corpus query.
+type MetadataTool struct {
+	// client is the KorAP service client used for authentication and for
+	// the JSON requests; Execute returns an error when it is nil.
+	client *service.Client
+}
+
+// NewMetadataTool returns a MetadataTool that issues its requests through
+// the given client. The client is stored as-is and may be nil; in that case
+// Execute reports that the client is not configured.
+func NewMetadataTool(client *service.Client) *MetadataTool {
+	tool := new(MetadataTool)
+	tool.client = client
+	return tool
+}
+
+// Name returns the tool name
+func (m *MetadataTool) Name() string {
+	return "korap_metadata"
+}
+
+// Description returns the tool description
+func (m *MetadataTool) Description() string {
+	return "Retrieve metadata and statistics for KorAP corpora"
+}
+
+// InputSchema returns the JSON schema describing the tool's arguments.
+//
+// The schema is an object with two string properties: the required "action"
+// (one of "list" or "statistics", default "list") and the optional "corpus"
+// holding a virtual corpus query.
+func (m *MetadataTool) InputSchema() map[string]interface{} {
+	actionProp := map[string]interface{}{
+		"type":        "string",
+		"description": "Type of metadata to retrieve: 'list' for corpus list, 'statistics' for corpus statistics",
+		"enum":        []string{"list", "statistics"},
+		"default":     "list",
+	}
+
+	corpusProp := map[string]interface{}{
+		"type":        "string",
+		"description": "Virtual corpus query to filter results (optional, when not provided refers to all data available to the user)",
+	}
+
+	return map[string]interface{}{
+		"type": "object",
+		"properties": map[string]interface{}{
+			"action": actionProp,
+			"corpus": corpusProp,
+		},
+		"required": []string{"action"},
+	}
+}
+
+// Execute runs the metadata tool for a single MCP call.
+//
+// The "action" argument is mandatory and must be "list" or "statistics";
+// "corpus" is optional and defaults to the empty string. Arguments are
+// validated first, then the client is checked (and authenticated with client
+// credentials when it is not yet authenticated), and finally the request is
+// handed to the handler matching the action. Every failure is reported
+// through the returned error.
+func (m *MetadataTool) Execute(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
+	log.Debug().Str("tool", m.Name()).Msg("Executing metadata tool")
+
+	action, err := request.RequireString("action")
+	if err != nil {
+		return nil, fmt.Errorf("action parameter is required: %w", err)
+	}
+	corpus := request.GetString("corpus", "")
+
+	log.Debug().Str("action", action).Str("corpus", corpus).Msg("Parsed metadata parameters")
+
+	// Reject unsupported actions before touching the client so that callers
+	// get the validation error even when no client is configured.
+	if action != "list" && action != "statistics" {
+		return nil, fmt.Errorf("unknown action: %s", action)
+	}
+
+	if m.client == nil {
+		return nil, fmt.Errorf("KorAP client not configured")
+	}
+	if !m.client.IsAuthenticated() {
+		log.Warn().Msg("Client not authenticated, attempting authentication")
+		if authErr := m.client.AuthenticateWithClientCredentials(ctx); authErr != nil {
+			return nil, fmt.Errorf("authentication failed: %w", authErr)
+		}
+	}
+
+	// Only the two validated actions can reach this point.
+	if action == "list" {
+		return m.handleListCorpora(ctx)
+	}
+	return m.handleCorpusStatistics(ctx, corpus)
+}
+
+// handleListCorpora fetches the "corpus" endpoint through the client and
+// returns the decoded list as a formatted text result. A request failure is
+// logged and returned as a wrapped error.
+func (m *MetadataTool) handleListCorpora(ctx context.Context) (*mcp.CallToolResult, error) {
+	log.Debug().Msg("Retrieving corpus list")
+
+	var corpora service.CorpusListResponse
+	if err := m.client.GetJSON(ctx, "corpus", &corpora); err != nil {
+		log.Error().Err(err).Msg("Failed to retrieve corpus list")
+		return nil, fmt.Errorf("failed to retrieve corpus list: %w", err)
+	}
+
+	log.Info().Int("corpus_count", len(corpora.Corpora)).Msg("Corpus list retrieved successfully")
+
+	return mcp.NewToolResultText(m.formatCorpusList(&corpora)), nil
+}
+
+// handleCorpusStatistics retrieves and formats statistics for a corpus query.
+//
+// corpus is an optional virtual corpus query. When it is empty the plain
+// "statistics" endpoint is requested; otherwise the query is sent as the
+// "corpusQuery" URL parameter. The query is percent-encoded because virtual
+// corpus queries routinely contain spaces, '&', '=' and quotes, which would
+// otherwise corrupt or truncate the query string.
+//
+// A request failure is logged and returned as a wrapped error; on success
+// the statistics are returned as a formatted text result.
+func (m *MetadataTool) handleCorpusStatistics(ctx context.Context, corpus string) (*mcp.CallToolResult, error) {
+	log.Debug().
+		Str("corpus", corpus).
+		Msg("Retrieving corpus statistics")
+
+	endpoint := "statistics"
+	if corpus != "" {
+		endpoint += "?corpusQuery=" + url.QueryEscape(corpus)
+	}
+
+	var statsResp service.StatisticsResponse
+	if err := m.client.GetJSON(ctx, endpoint, &statsResp); err != nil {
+		log.Error().
+			Err(err).
+			Str("corpus", corpus).
+			Msg("Failed to retrieve corpus statistics")
+		return nil, fmt.Errorf("failed to retrieve corpus statistics: %w", err)
+	}
+
+	log.Info().
+		Str("corpus", corpus).
+		Int("documents", statsResp.Documents).
+		Int("tokens", statsResp.Tokens).
+		Msg("Corpus statistics retrieved successfully")
+
+	result := m.formatCorpusStatistics(corpus, &statsResp)
+	return mcp.NewToolResultText(result), nil
+}
+
+// formatCorpusList renders a corpus list response as plain text.
+//
+// The output starts with a fixed heading. An empty list yields a single
+// "No corpora available." line; otherwise the total is printed, followed by
+// one numbered entry per corpus with its name and ID. Description and the
+// document, token, sentence and paragraph counts are only printed when they
+// are non-empty or greater than zero.
+func (m *MetadataTool) formatCorpusList(response *service.CorpusListResponse) string {
+	var out strings.Builder
+
+	out.WriteString("KorAP Available Corpora\n")
+	out.WriteString("=======================\n\n")
+
+	total := len(response.Corpora)
+	if total == 0 {
+		out.WriteString("No corpora available.\n")
+		return out.String()
+	}
+
+	fmt.Fprintf(&out, "Total Corpora: %d\n\n", total)
+
+	for idx := range response.Corpora {
+		entry := &response.Corpora[idx]
+
+		fmt.Fprintf(&out, "%d. %s\n", idx+1, entry.Name)
+		fmt.Fprintf(&out, "   ID: %s\n", entry.ID)
+
+		if entry.Description != "" {
+			fmt.Fprintf(&out, "   Description: %s\n", entry.Description)
+		}
+		if entry.Documents > 0 {
+			fmt.Fprintf(&out, "   Documents: %d\n", entry.Documents)
+		}
+		if entry.Tokens > 0 {
+			fmt.Fprintf(&out, "   Tokens: %d\n", entry.Tokens)
+		}
+		if entry.Sentences > 0 {
+			fmt.Fprintf(&out, "   Sentences: %d\n", entry.Sentences)
+		}
+		if entry.Paragraphs > 0 {
+			fmt.Fprintf(&out, "   Paragraphs: %d\n", entry.Paragraphs)
+		}
+
+		out.WriteString("\n")
+	}
+
+	return out.String()
+}
+
+// formatCorpusStatistics formats the corpus statistics response into a readable text format.
+//
+// corpus is the virtual corpus query the statistics were requested for; an
+// empty string is shown as "(all available data)". Document and token counts
+// are always printed, sentence and paragraph counts only when positive.
+// Entries of response.Fields are appended under "Additional Fields" in
+// ascending key order so that the output is identical on every call (Go's
+// map iteration order is unspecified).
+func (m *MetadataTool) formatCorpusStatistics(corpus string, response *service.StatisticsResponse) string {
+	var result strings.Builder
+
+	result.WriteString("KorAP Corpus Statistics\n")
+	result.WriteString("=======================\n\n")
+
+	if corpus == "" {
+		result.WriteString("Corpus Query: (all available data)\n\n")
+	} else {
+		fmt.Fprintf(&result, "Corpus Query: %s\n\n", corpus)
+	}
+
+	result.WriteString("Statistics:\n")
+	result.WriteString("-----------\n")
+	fmt.Fprintf(&result, "Documents: %d\n", response.Documents)
+	fmt.Fprintf(&result, "Tokens: %d\n", response.Tokens)
+
+	if response.Sentences > 0 {
+		fmt.Fprintf(&result, "Sentences: %d\n", response.Sentences)
+	}
+
+	if response.Paragraphs > 0 {
+		fmt.Fprintf(&result, "Paragraphs: %d\n", response.Paragraphs)
+	}
+
+	// Add any additional fields if present, sorted by key for stable output.
+	if len(response.Fields) > 0 {
+		keys := make([]string, 0, len(response.Fields))
+		for key := range response.Fields {
+			keys = append(keys, key)
+		}
+		sort.Strings(keys)
+
+		result.WriteString("\nAdditional Fields:\n")
+		result.WriteString("------------------\n")
+		for _, key := range keys {
+			fmt.Fprintf(&result, "%s: %v\n", key, response.Fields[key])
+		}
+	}
+
+	return result.String()
+}

diff --git a/tools/metadata_test.go b/tools/metadata_test.go
new file mode 100644
index 0000000..88c7e6d
--- /dev/null
+++ b/tools/metadata_test.go

@@ -0,0 +1,285 @@
+package tools
+
+import (
+	"context"
+	"testing"
+
+	"github.com/korap/korap-mcp/service"
+	"github.com/mark3labs/mcp-go/mcp"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestMetadataTool_Name checks the identifier reported by the tool.
+func TestMetadataTool_Name(t *testing.T) {
+	tool := NewMetadataTool(&service.Client{})
+
+	got := tool.Name()
+	assert.Equal(t, "korap_metadata", got)
+}
+
+func TestMetadataTool_Description(t *testing.T) {
+	client := &service.Client{}
+	tool := NewMetadataTool(client)
+
+	expected := "Retrieve metadata and statistics for KorAP corpora"
+	assert.Equal(t, expected, tool.Description())
+}
+
+// TestMetadataTool_InputSchema checks the structure of the argument schema:
+// object type, the two properties, the details of "action" and the list of
+// required fields.
+func TestMetadataTool_InputSchema(t *testing.T) {
+	schema := NewMetadataTool(&service.Client{}).InputSchema()
+
+	// Top level must describe an object.
+	assert.Equal(t, "object", schema["type"])
+
+	// Both documented properties must be present.
+	props, isMap := schema["properties"].(map[string]interface{})
+	assert.True(t, isMap)
+	for _, key := range []string{"action", "corpus"} {
+		assert.Contains(t, props, key)
+	}
+
+	// "action" is a string enum defaulting to "list".
+	actionProp, isMap := props["action"].(map[string]interface{})
+	assert.True(t, isMap)
+	assert.Equal(t, "string", actionProp["type"])
+
+	allowed, isList := actionProp["enum"].([]string)
+	assert.True(t, isList)
+	for _, value := range []string{"list", "statistics"} {
+		assert.Contains(t, allowed, value)
+	}
+	assert.Equal(t, "list", actionProp["default"])
+
+	// "action" must be listed as required.
+	requiredFields, isList := schema["required"].([]string)
+	assert.True(t, isList)
+	assert.Contains(t, requiredFields, "action")
+}
+
+// TestNewMetadataTool checks that the constructor returns a tool holding
+// the client it was given.
+func TestNewMetadataTool(t *testing.T) {
+	svcClient := &service.Client{}
+
+	created := NewMetadataTool(svcClient)
+
+	assert.NotNil(t, created)
+	assert.Equal(t, svcClient, created.client)
+}
+
+func TestMetadataTool_Execute_MissingAction(t *testing.T) {
+	client := &service.Client{}
+	tool := NewMetadataTool(client)
+
+	// Create request without action parameter
+	request := mcp.CallToolRequest{
+		Params: mcp.CallToolParams{
+			Arguments: map[string]interface{}{},
+		},
+	}
+
+	_, err := tool.Execute(context.Background(), request)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "action parameter is required")
+}
+
+// TestMetadataTool_Execute_NilClient checks that a valid action on a tool
+// without a client fails with the "not configured" error.
+func TestMetadataTool_Execute_NilClient(t *testing.T) {
+	tool := NewMetadataTool(nil)
+
+	args := map[string]interface{}{"action": "list"}
+	req := mcp.CallToolRequest{
+		Params: mcp.CallToolParams{Arguments: args},
+	}
+
+	_, execErr := tool.Execute(context.Background(), req)
+	assert.Error(t, execErr)
+	assert.Contains(t, execErr.Error(), "KorAP client not configured")
+}
+
+// TestMetadataTool_Execute_UnknownAction checks that an unsupported action
+// value is rejected with an "unknown action" error.
+func TestMetadataTool_Execute_UnknownAction(t *testing.T) {
+	tool := NewMetadataTool(&service.Client{})
+
+	args := map[string]interface{}{"action": "unknown"}
+	req := mcp.CallToolRequest{
+		Params: mcp.CallToolParams{Arguments: args},
+	}
+
+	_, execErr := tool.Execute(context.Background(), req)
+	assert.Error(t, execErr)
+	// Action validation happens before the authentication step, so this is
+	// the error that must surface.
+	assert.Contains(t, execErr.Error(), "unknown action: unknown")
+}
+
+// TestMetadataTool_Execute_StatisticsWithoutCorpus checks that "statistics"
+// without a corpus passes argument validation and only fails at the client
+// check.
+func TestMetadataTool_Execute_StatisticsWithoutCorpus(t *testing.T) {
+	tool := NewMetadataTool(nil)
+
+	args := map[string]interface{}{"action": "statistics"}
+	req := mcp.CallToolRequest{
+		Params: mcp.CallToolParams{Arguments: args},
+	}
+
+	_, execErr := tool.Execute(context.Background(), req)
+	assert.Error(t, execErr)
+	// corpus is optional, so the first failure is the missing client.
+	assert.Contains(t, execErr.Error(), "KorAP client not configured")
+}
+
+func TestMetadataTool_Execute_ParameterExtraction(t *testing.T) {
+	// This test verifies that parameters are extracted correctly
+	// It should fail with authentication error since we don't have a mock server
+	// but we can verify the parameters were parsed correctly by checking the log messages
+
+	client := &service.Client{}
+	tool := NewMetadataTool(client)
+
+	tests := []struct {
+		name      string
+		arguments map[string]interface{}
+		expectErr bool
+	}{
+		{
+			name: "list_action",
+			arguments: map[string]interface{}{
+				"action": "list",
+			},
+			expectErr: true, // Will fail at authentication
+		},
+		{
+			name: "statistics_action",
+			arguments: map[string]interface{}{
+				"action": "statistics",
+				"corpus": "test-corpus",
+			},
+			expectErr: true, // Will fail at authentication
+		},
+		{
+			name: "statistics_with_empty_corpus",
+			arguments: map[string]interface{}{
+				"action": "statistics",
+				"corpus": "",
+			},
+			expectErr: true, // Will fail at authentication (corpus is optional)
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			request := mcp.CallToolRequest{
+				Params: mcp.CallToolParams{
+					Arguments: tt.arguments,
+				},
+			}
+
+			_, err := tool.Execute(context.Background(), request)
+			if tt.expectErr {
+				assert.Error(t, err)
+			}
+		})
+	}
+}
+
+// TestMetadataTool_formatCorpusList checks the text rendering of corpus
+// lists, first for an empty list and then for a list with two corpora, one
+// fully populated and one with only documents and tokens set.
+func TestMetadataTool_formatCorpusList(t *testing.T) {
+	tool := NewMetadataTool(&service.Client{})
+
+	// Empty list: heading plus the "nothing available" notice.
+	emptyOutput := tool.formatCorpusList(&service.CorpusListResponse{
+		Corpora: []service.CorpusInfo{},
+	})
+	assert.Contains(t, emptyOutput, "KorAP Available Corpora")
+	assert.Contains(t, emptyOutput, "No corpora available")
+
+	// Populated list.
+	full := service.CorpusInfo{
+		ID:          "corpus1",
+		Name:        "Test Corpus 1",
+		Description: "A test corpus",
+		Documents:   100,
+		Tokens:      50000,
+		Sentences:   2500,
+		Paragraphs:  500,
+	}
+	partial := service.CorpusInfo{
+		ID:        "corpus2",
+		Name:      "Test Corpus 2",
+		Documents: 200,
+		Tokens:    75000,
+	}
+	listOutput := tool.formatCorpusList(&service.CorpusListResponse{
+		Corpora: []service.CorpusInfo{full, partial},
+	})
+
+	expected := []string{
+		"KorAP Available Corpora",
+		"Total Corpora: 2",
+		"1. Test Corpus 1",
+		"ID: corpus1",
+		"Description: A test corpus",
+		"Documents: 100",
+		"Tokens: 50000",
+		"Sentences: 2500",
+		"Paragraphs: 500",
+		"2. Test Corpus 2",
+		"ID: corpus2",
+		"Documents: 200",
+		"Tokens: 75000",
+	}
+	for _, fragment := range expected {
+		assert.Contains(t, listOutput, fragment)
+	}
+}
+
+// TestMetadataTool_formatCorpusStatistics checks the text rendering of
+// statistics for a minimal response, a complete response with additional
+// fields, and an empty corpus query.
+func TestMetadataTool_formatCorpusStatistics(t *testing.T) {
+	tool := NewMetadataTool(&service.Client{})
+
+	// expectAll asserts that every fragment occurs in the rendered text.
+	expectAll := func(rendered string, fragments ...string) {
+		for _, fragment := range fragments {
+			assert.Contains(t, rendered, fragment)
+		}
+	}
+
+	// Minimal statistics: only documents and tokens are set.
+	minimal := &service.StatisticsResponse{
+		Documents: 100,
+		Tokens:    50000,
+	}
+	expectAll(tool.formatCorpusStatistics("test-corpus", minimal),
+		"KorAP Corpus Statistics",
+		"Corpus Query: test-corpus",
+		"Documents: 100",
+		"Tokens: 50000",
+	)
+
+	// Complete statistics including additional fields.
+	complete := &service.StatisticsResponse{
+		Documents:  200,
+		Tokens:     100000,
+		Sentences:  5000,
+		Paragraphs: 1000,
+		Fields: map[string]interface{}{
+			"genre":    "literature",
+			"language": "German",
+			"year":     2023,
+		},
+	}
+	expectAll(tool.formatCorpusStatistics("complete-corpus", complete),
+		"KorAP Corpus Statistics",
+		"Corpus Query: complete-corpus",
+		"Documents: 200",
+		"Tokens: 100000",
+		"Sentences: 5000",
+		"Paragraphs: 1000",
+		"Additional Fields:",
+		"genre: literature",
+		"language: German",
+		"year: 2023",
+	)
+
+	// Empty corpus query is rendered as "all available data".
+	expectAll(tool.formatCorpusStatistics("", minimal),
+		"KorAP Corpus Statistics",
+		"Corpus Query: (all available data)",
+		"Documents: 100",
+		"Tokens: 50000",
+	)
+}

diff --git a/tools/search.go b/tools/search.go
index 7a0422d..c96f407 100644
--- a/tools/search.go
+++ b/tools/search.go

@@ -49,7 +49,7 @@
 			},
 			"corpus": map[string]interface{}{
 				"type":        "string",
-				"description": "Virtual corpus to search in",
+				"description": "Virtual corpus query to filter search results (optional, when not provided searches all available data)",
 			},
 			"count": map[string]interface{}{
 				"type":        "integer",
commit	bd154ea40354e09aed17447c33398ed8a5eafedd	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Jun 12 17:01:58 2025 +0200
committer	Akron <nils@diewald-online.de>	Thu Jun 12 17:01:58 2025 +0200
tree	cf45b4e3428a7ec62dad4ff1b64f234b16c02e8f
parent	8138c3538c91266aefddd5e64174b7e6cb61b38b [diff]