Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 1 | package parser |
| 2 | |
// Package parser converts JSON query documents (nodes identified by an
// "@type" field) into AST nodes and serializes AST nodes back into JSON.
| 5 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 6 | import ( |
| 7 | "encoding/json" |
| 8 | "fmt" |
| 9 | "strings" |
| 10 | |
Akron | fa55bb2 | 2025-05-26 15:10:42 +0200 | [diff] [blame^] | 11 | "github.com/KorAP/KoralPipe-TermMapper/ast" |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 12 | ) |
| 13 | |
// rawNode mirrors the JSON shape of a single query node. The named fields
// cover the keys this package interprets; every other key found in the
// input is kept in Extra so that it survives a decode/encode round trip.
type rawNode struct {
	Type     string          `json:"@type"`
	Wrap     json.RawMessage `json:"wrap,omitempty"`
	Operands []rawNode       `json:"operands,omitempty"`
	Relation string          `json:"relation,omitempty"`
	Foundry  string          `json:"foundry,omitempty"`
	Key      string          `json:"key,omitempty"`
	Layer    string          `json:"layer,omitempty"`
	Match    string          `json:"match,omitempty"`
	Value    string          `json:"value,omitempty"`
	// Extra holds every key that is not one of the fields above. It is
	// filled by UnmarshalJSON and written back by MarshalJSON.
	Extra map[string]interface{} `json:"-"`
}

// UnmarshalJSON implements the json.Unmarshaler interface.
//
// The input is decoded twice: once into a generic map to discover all keys,
// and once into the struct fields. Keys that do not belong to a struct field
// are stored in Extra, which is always non-nil afterwards. Numbers inside
// Extra are kept as json.Number so that large integers and the original
// number formatting are not altered by a float64 conversion.
func (r *rawNode) UnmarshalJSON(data []byte) error {
	// Generic pass: collect every key. UseNumber prevents precision loss for
	// values that are only passed through.
	var all map[string]interface{}
	dec := json.NewDecoder(strings.NewReader(string(data)))
	dec.UseNumber()
	if err := dec.Decode(&all); err != nil {
		return err
	}

	// Typed pass: tempNode has the same fields as rawNode but no methods, so
	// decoding into it does not call this method again for the same value.
	type tempNode rawNode
	var temp tempNode
	if err := json.Unmarshal(data, &temp); err != nil {
		return err
	}
	*r = rawNode(temp)

	// Everything that is not a known field goes to Extra.
	r.Extra = make(map[string]interface{})
	for k, v := range all {
		switch k {
		case "@type", "wrap", "operands", "relation", "foundry", "key", "layer", "match", "value":
			// Already represented by a struct field.
		default:
			r.Extra[k] = v
		}
	}

	return nil
}

// MarshalJSON implements the json.Marshaler interface.
//
// "@type" is always written. The remaining known fields are written only
// when they are non-empty, followed by all entries of Extra. The result is
// produced from a map, so keys appear in the sorted order json.Marshal uses
// for maps.
func (r rawNode) MarshalJSON() ([]byte, error) {
	// 9 known keys plus whatever Extra carries.
	out := make(map[string]interface{}, len(r.Extra)+9)

	out["@type"] = r.Type
	// A zero-length RawMessage is not valid JSON and would make Marshal
	// fail, so it is treated like a missing wrap.
	if len(r.Wrap) > 0 {
		out["wrap"] = r.Wrap
	}
	if len(r.Operands) > 0 {
		out["operands"] = r.Operands
	}
	if r.Relation != "" {
		out["relation"] = r.Relation
	}
	if r.Foundry != "" {
		out["foundry"] = r.Foundry
	}
	if r.Key != "" {
		out["key"] = r.Key
	}
	if r.Layer != "" {
		out["layer"] = r.Layer
	}
	if r.Match != "" {
		out["match"] = r.Match
	}
	if r.Value != "" {
		out["value"] = r.Value
	}

	// Pass-through fields captured during decoding.
	for k, v := range r.Extra {
		out[k] = v
	}

	return json.Marshal(out)
}
| 98 | |
| 99 | // ParseJSON parses a JSON string into our AST representation |
| 100 | func ParseJSON(data []byte) (ast.Node, error) { |
| 101 | var raw rawNode |
| 102 | if err := json.Unmarshal(data, &raw); err != nil { |
| 103 | return nil, fmt.Errorf("failed to parse JSON: %w", err) |
| 104 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 105 | if raw.Type == "" { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 106 | return nil, fmt.Errorf("missing required field '@type' in JSON") |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 107 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 108 | return parseNode(raw) |
| 109 | } |
| 110 | |
| 111 | // parseNode converts a raw node into an AST node |
| 112 | func parseNode(raw rawNode) (ast.Node, error) { |
| 113 | switch raw.Type { |
| 114 | case "koral:token": |
| 115 | if raw.Wrap == nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 116 | return nil, fmt.Errorf("token node of type '%s' missing required 'wrap' field", raw.Type) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 117 | } |
| 118 | var wrapRaw rawNode |
| 119 | if err := json.Unmarshal(raw.Wrap, &wrapRaw); err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 120 | return nil, fmt.Errorf("failed to parse 'wrap' field in token node: %w", err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 121 | } |
| 122 | wrap, err := parseNode(wrapRaw) |
| 123 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 124 | return nil, fmt.Errorf("error parsing wrapped node: %w", err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 125 | } |
| 126 | return &ast.Token{Wrap: wrap}, nil |
| 127 | |
| 128 | case "koral:termGroup": |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 129 | if len(raw.Operands) == 0 { |
| 130 | return nil, fmt.Errorf("term group must have at least one operand") |
| 131 | } |
| 132 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 133 | operands := make([]ast.Node, len(raw.Operands)) |
| 134 | for i, op := range raw.Operands { |
| 135 | node, err := parseNode(op) |
| 136 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 137 | return nil, fmt.Errorf("error parsing operand %d: %w", i+1, err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 138 | } |
| 139 | operands[i] = node |
| 140 | } |
| 141 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 142 | if raw.Relation == "" { |
| 143 | return nil, fmt.Errorf("term group must have a 'relation' field") |
| 144 | } |
| 145 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 146 | relation := ast.AndRelation |
| 147 | if strings.HasSuffix(raw.Relation, "or") { |
| 148 | relation = ast.OrRelation |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 149 | } else if !strings.HasSuffix(raw.Relation, "and") { |
| 150 | return nil, fmt.Errorf("invalid relation type '%s', must be one of: 'relation:and', 'relation:or'", raw.Relation) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 151 | } |
| 152 | |
| 153 | return &ast.TermGroup{ |
| 154 | Operands: operands, |
| 155 | Relation: relation, |
| 156 | }, nil |
| 157 | |
| 158 | case "koral:term": |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 159 | if raw.Key == "" { |
| 160 | return nil, fmt.Errorf("term must have a 'key' field") |
| 161 | } |
| 162 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 163 | match := ast.MatchEqual |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 164 | if raw.Match != "" { |
| 165 | if strings.HasSuffix(raw.Match, "ne") { |
| 166 | match = ast.MatchNotEqual |
| 167 | } else if !strings.HasSuffix(raw.Match, "eq") { |
| 168 | return nil, fmt.Errorf("invalid match type '%s', must be one of: 'match:eq', 'match:ne'", raw.Match) |
| 169 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 170 | } |
| 171 | |
| 172 | return &ast.Term{ |
| 173 | Foundry: raw.Foundry, |
| 174 | Key: raw.Key, |
| 175 | Layer: raw.Layer, |
| 176 | Match: match, |
| 177 | Value: raw.Value, |
| 178 | }, nil |
| 179 | |
| 180 | default: |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 181 | // Store the original JSON content |
| 182 | rawContent, err := json.Marshal(raw) |
| 183 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 184 | return nil, fmt.Errorf("failed to marshal unknown node type '%s': %w", raw.Type, err) |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 185 | } |
| 186 | |
| 187 | // Create a catchall node |
| 188 | catchall := &ast.CatchallNode{ |
| 189 | NodeType: raw.Type, |
| 190 | RawContent: rawContent, |
| 191 | } |
| 192 | |
| 193 | // Parse wrap if present |
| 194 | if raw.Wrap != nil { |
| 195 | var wrapRaw rawNode |
| 196 | if err := json.Unmarshal(raw.Wrap, &wrapRaw); err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 197 | return nil, fmt.Errorf("failed to parse 'wrap' field in unknown node type '%s': %w", raw.Type, err) |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 198 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 199 | |
| 200 | // Check if the wrapped node is a known type |
| 201 | if wrapRaw.Type == "koral:term" || wrapRaw.Type == "koral:token" || wrapRaw.Type == "koral:termGroup" { |
| 202 | wrap, err := parseNode(wrapRaw) |
| 203 | if err != nil { |
| 204 | return nil, fmt.Errorf("error parsing wrapped node in unknown node type '%s': %w", raw.Type, err) |
| 205 | } |
| 206 | catchall.Wrap = wrap |
| 207 | } else { |
| 208 | // For unknown types, recursively parse |
| 209 | wrap, err := parseNode(wrapRaw) |
| 210 | if err != nil { |
| 211 | return nil, fmt.Errorf("error parsing wrapped node in unknown node type '%s': %w", raw.Type, err) |
| 212 | } |
| 213 | catchall.Wrap = wrap |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 214 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 215 | } |
| 216 | |
| 217 | // Parse operands if present |
| 218 | if len(raw.Operands) > 0 { |
| 219 | operands := make([]ast.Node, len(raw.Operands)) |
| 220 | for i, op := range raw.Operands { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 221 | // Check if the operand is a known type |
| 222 | if op.Type == "koral:term" || op.Type == "koral:token" || op.Type == "koral:termGroup" { |
| 223 | node, err := parseNode(op) |
| 224 | if err != nil { |
| 225 | return nil, fmt.Errorf("error parsing operand %d in unknown node type '%s': %w", i+1, raw.Type, err) |
| 226 | } |
| 227 | operands[i] = node |
| 228 | } else { |
| 229 | // For unknown types, recursively parse |
| 230 | node, err := parseNode(op) |
| 231 | if err != nil { |
| 232 | return nil, fmt.Errorf("error parsing operand %d in unknown node type '%s': %w", i+1, raw.Type, err) |
| 233 | } |
| 234 | operands[i] = node |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 235 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 236 | } |
| 237 | catchall.Operands = operands |
| 238 | } |
| 239 | |
| 240 | return catchall, nil |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 241 | } |
| 242 | } |
| 243 | |
| 244 | // SerializeToJSON converts an AST node back to JSON |
| 245 | func SerializeToJSON(node ast.Node) ([]byte, error) { |
| 246 | raw := nodeToRaw(node) |
| 247 | return json.MarshalIndent(raw, "", " ") |
| 248 | } |
| 249 | |
| 250 | // nodeToRaw converts an AST node to a raw node for JSON serialization |
| 251 | func nodeToRaw(node ast.Node) rawNode { |
| 252 | switch n := node.(type) { |
| 253 | case *ast.Token: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 254 | if n.Wrap == nil { |
| 255 | return rawNode{ |
| 256 | Type: "koral:token", |
| 257 | } |
| 258 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 259 | return rawNode{ |
| 260 | Type: "koral:token", |
| 261 | Wrap: json.RawMessage(nodeToRaw(n.Wrap).toJSON()), |
| 262 | } |
| 263 | |
| 264 | case *ast.TermGroup: |
| 265 | operands := make([]rawNode, len(n.Operands)) |
| 266 | for i, op := range n.Operands { |
| 267 | operands[i] = nodeToRaw(op) |
| 268 | } |
| 269 | return rawNode{ |
| 270 | Type: "koral:termGroup", |
| 271 | Operands: operands, |
| 272 | Relation: "relation:" + string(n.Relation), |
| 273 | } |
| 274 | |
| 275 | case *ast.Term: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 276 | raw := rawNode{ |
| 277 | Type: "koral:term", |
| 278 | Key: n.Key, |
| 279 | Match: "match:" + string(n.Match), |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 280 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 281 | if n.Foundry != "" { |
| 282 | raw.Foundry = n.Foundry |
| 283 | } |
| 284 | if n.Layer != "" { |
| 285 | raw.Layer = n.Layer |
| 286 | } |
| 287 | if n.Value != "" { |
| 288 | raw.Value = n.Value |
| 289 | } |
| 290 | return raw |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 291 | |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 292 | case *ast.CatchallNode: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 293 | // For catchall nodes, use the stored raw content if available |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 294 | if n.RawContent != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 295 | var raw rawNode |
| 296 | if err := json.Unmarshal(n.RawContent, &raw); err == nil { |
| 297 | // Ensure we preserve the node type |
| 298 | raw.Type = n.NodeType |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 299 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 300 | // Handle wrap and operands if present |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 301 | if n.Wrap != nil { |
| 302 | raw.Wrap = json.RawMessage(nodeToRaw(n.Wrap).toJSON()) |
| 303 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 304 | if len(n.Operands) > 0 { |
| 305 | operands := make([]rawNode, len(n.Operands)) |
| 306 | for i, op := range n.Operands { |
| 307 | operands[i] = nodeToRaw(op) |
| 308 | } |
| 309 | raw.Operands = operands |
| 310 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 311 | return raw |
| 312 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 313 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 314 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 315 | // If RawContent is nil or invalid, create a minimal raw node |
| 316 | raw := rawNode{ |
| 317 | Type: n.NodeType, |
| 318 | } |
| 319 | if n.Wrap != nil { |
| 320 | raw.Wrap = json.RawMessage(nodeToRaw(n.Wrap).toJSON()) |
| 321 | } |
| 322 | if len(n.Operands) > 0 { |
| 323 | operands := make([]rawNode, len(n.Operands)) |
| 324 | for i, op := range n.Operands { |
| 325 | operands[i] = nodeToRaw(op) |
| 326 | } |
| 327 | raw.Operands = operands |
| 328 | } |
| 329 | return raw |
| 330 | } |
| 331 | |
| 332 | // Return a minimal raw node for unknown types |
| 333 | return rawNode{ |
| 334 | Type: "koral:unknown", |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 335 | } |
| 336 | } |
| 337 | |
| 338 | // toJSON converts a raw node to JSON bytes |
| 339 | func (r rawNode) toJSON() []byte { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 340 | data, err := json.Marshal(r) |
| 341 | if err != nil { |
| 342 | // Return a minimal valid JSON object if marshaling fails |
| 343 | return []byte(`{"@type":"koral:unknown"}`) |
| 344 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 345 | return data |
| 346 | } |