Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 1 | package parser |
| 2 | |
// The parser package converts a JSON document into an AST node
// and serializes AST nodes back to JSON.
| 5 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 6 | import ( |
| 7 | "encoding/json" |
| 8 | "fmt" |
| 9 | "strings" |
| 10 | |
Akron | 87948e8 | 2025-05-26 18:19:51 +0200 | [diff] [blame] | 11 | "maps" |
| 12 | |
Akron | fa55bb2 | 2025-05-26 15:10:42 +0200 | [diff] [blame] | 13 | "github.com/KorAP/KoralPipe-TermMapper/ast" |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 14 | ) |
| 15 | |
// rawNode represents the raw JSON structure of a single node as it appears
// on the wire. It is the intermediate form between JSON bytes and ast.Node:
// parseNode turns a rawNode into an AST node and nodeToRaw does the reverse.
type rawNode struct {
	// Type is the node's "@type" discriminator (e.g. "koral:token").
	Type string `json:"@type"`
	// Wrap holds the still-encoded child object; it is decoded lazily by
	// parseNode.
	Wrap json.RawMessage `json:"wrap,omitempty"`
	// Operands are the child nodes of a group-like node.
	Operands []rawNode `json:"operands,omitempty"`
	// Relation is the raw relation string of a term group.
	Relation string `json:"relation,omitempty"`
	// Foundry, Key, Layer, Match and Value are the raw attributes of a term.
	Foundry string `json:"foundry,omitempty"`
	Key     string `json:"key,omitempty"`
	Layer   string `json:"layer,omitempty"`
	Match   string `json:"match,omitempty"`
	Value   string `json:"value,omitempty"`
	// Rewrites carries the "rewrites" entries attached to the node.
	Rewrites []ast.Rewrite `json:"rewrites,omitempty"`
	// Store any additional fields. Extra is excluded from the default
	// struct encoding; UnmarshalJSON fills it with every key that is not one
	// of the fields above and MarshalJSON merges it back into the output.
	Extra map[string]any `json:"-"`
}
| 31 | |
| 32 | // UnmarshalJSON implements the json.Unmarshaler interface |
| 33 | func (r *rawNode) UnmarshalJSON(data []byte) error { |
| 34 | // First unmarshal into a map to capture all fields |
Akron | 87948e8 | 2025-05-26 18:19:51 +0200 | [diff] [blame] | 35 | var raw map[string]any |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 36 | if err := json.Unmarshal(data, &raw); err != nil { |
| 37 | return err |
| 38 | } |
| 39 | |
| 40 | // Create a temporary struct to unmarshal known fields |
| 41 | type tempNode rawNode |
| 42 | var temp tempNode |
| 43 | if err := json.Unmarshal(data, &temp); err != nil { |
| 44 | return err |
| 45 | } |
| 46 | *r = rawNode(temp) |
| 47 | |
| 48 | // Store any fields not in the struct in Extra |
Akron | 87948e8 | 2025-05-26 18:19:51 +0200 | [diff] [blame] | 49 | r.Extra = make(map[string]any) |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 50 | for k, v := range raw { |
| 51 | switch k { |
Akron | 1a5fccd | 2025-05-27 09:54:09 +0200 | [diff] [blame^] | 52 | case "@type", "wrap", "operands", "relation", "foundry", "key", "layer", "match", "value", "rewrites": |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 53 | continue |
| 54 | default: |
| 55 | r.Extra[k] = v |
| 56 | } |
| 57 | } |
| 58 | |
| 59 | return nil |
| 60 | } |
| 61 | |
| 62 | // MarshalJSON implements the json.Marshaler interface |
| 63 | func (r rawNode) MarshalJSON() ([]byte, error) { |
| 64 | // Create a map with all fields |
Akron | 87948e8 | 2025-05-26 18:19:51 +0200 | [diff] [blame] | 65 | raw := make(map[string]any) |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 66 | |
| 67 | // Add the known fields if they're not empty |
| 68 | raw["@type"] = r.Type |
| 69 | if r.Wrap != nil { |
| 70 | raw["wrap"] = r.Wrap |
| 71 | } |
| 72 | if len(r.Operands) > 0 { |
| 73 | raw["operands"] = r.Operands |
| 74 | } |
| 75 | if r.Relation != "" { |
| 76 | raw["relation"] = r.Relation |
| 77 | } |
| 78 | if r.Foundry != "" { |
| 79 | raw["foundry"] = r.Foundry |
| 80 | } |
| 81 | if r.Key != "" { |
| 82 | raw["key"] = r.Key |
| 83 | } |
| 84 | if r.Layer != "" { |
| 85 | raw["layer"] = r.Layer |
| 86 | } |
| 87 | if r.Match != "" { |
| 88 | raw["match"] = r.Match |
| 89 | } |
| 90 | if r.Value != "" { |
| 91 | raw["value"] = r.Value |
| 92 | } |
Akron | 1a5fccd | 2025-05-27 09:54:09 +0200 | [diff] [blame^] | 93 | if len(r.Rewrites) > 0 { |
| 94 | raw["rewrites"] = r.Rewrites |
| 95 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 96 | |
| 97 | // Add any extra fields |
Akron | 87948e8 | 2025-05-26 18:19:51 +0200 | [diff] [blame] | 98 | maps.Copy(raw, r.Extra) |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 99 | |
| 100 | return json.Marshal(raw) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 101 | } |
| 102 | |
| 103 | // ParseJSON parses a JSON string into our AST representation |
| 104 | func ParseJSON(data []byte) (ast.Node, error) { |
| 105 | var raw rawNode |
| 106 | if err := json.Unmarshal(data, &raw); err != nil { |
| 107 | return nil, fmt.Errorf("failed to parse JSON: %w", err) |
| 108 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 109 | if raw.Type == "" { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 110 | return nil, fmt.Errorf("missing required field '@type' in JSON") |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 111 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 112 | return parseNode(raw) |
| 113 | } |
| 114 | |
| 115 | // parseNode converts a raw node into an AST node |
| 116 | func parseNode(raw rawNode) (ast.Node, error) { |
| 117 | switch raw.Type { |
| 118 | case "koral:token": |
| 119 | if raw.Wrap == nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 120 | return nil, fmt.Errorf("token node of type '%s' missing required 'wrap' field", raw.Type) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 121 | } |
| 122 | var wrapRaw rawNode |
| 123 | if err := json.Unmarshal(raw.Wrap, &wrapRaw); err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 124 | return nil, fmt.Errorf("failed to parse 'wrap' field in token node: %w", err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 125 | } |
| 126 | wrap, err := parseNode(wrapRaw) |
| 127 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 128 | return nil, fmt.Errorf("error parsing wrapped node: %w", err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 129 | } |
Akron | 1a5fccd | 2025-05-27 09:54:09 +0200 | [diff] [blame^] | 130 | return &ast.Token{Wrap: wrap, Rewrites: raw.Rewrites}, nil |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 131 | |
| 132 | case "koral:termGroup": |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 133 | if len(raw.Operands) == 0 { |
| 134 | return nil, fmt.Errorf("term group must have at least one operand") |
| 135 | } |
| 136 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 137 | operands := make([]ast.Node, len(raw.Operands)) |
| 138 | for i, op := range raw.Operands { |
| 139 | node, err := parseNode(op) |
| 140 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 141 | return nil, fmt.Errorf("error parsing operand %d: %w", i+1, err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 142 | } |
| 143 | operands[i] = node |
| 144 | } |
| 145 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 146 | if raw.Relation == "" { |
| 147 | return nil, fmt.Errorf("term group must have a 'relation' field") |
| 148 | } |
| 149 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 150 | relation := ast.AndRelation |
| 151 | if strings.HasSuffix(raw.Relation, "or") { |
| 152 | relation = ast.OrRelation |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 153 | } else if !strings.HasSuffix(raw.Relation, "and") { |
| 154 | return nil, fmt.Errorf("invalid relation type '%s', must be one of: 'relation:and', 'relation:or'", raw.Relation) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 155 | } |
| 156 | |
| 157 | return &ast.TermGroup{ |
| 158 | Operands: operands, |
| 159 | Relation: relation, |
Akron | 1a5fccd | 2025-05-27 09:54:09 +0200 | [diff] [blame^] | 160 | Rewrites: raw.Rewrites, |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 161 | }, nil |
| 162 | |
| 163 | case "koral:term": |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 164 | if raw.Key == "" { |
| 165 | return nil, fmt.Errorf("term must have a 'key' field") |
| 166 | } |
| 167 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 168 | match := ast.MatchEqual |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 169 | if raw.Match != "" { |
| 170 | if strings.HasSuffix(raw.Match, "ne") { |
| 171 | match = ast.MatchNotEqual |
| 172 | } else if !strings.HasSuffix(raw.Match, "eq") { |
| 173 | return nil, fmt.Errorf("invalid match type '%s', must be one of: 'match:eq', 'match:ne'", raw.Match) |
| 174 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 175 | } |
| 176 | |
| 177 | return &ast.Term{ |
Akron | 1a5fccd | 2025-05-27 09:54:09 +0200 | [diff] [blame^] | 178 | Foundry: raw.Foundry, |
| 179 | Key: raw.Key, |
| 180 | Layer: raw.Layer, |
| 181 | Match: match, |
| 182 | Value: raw.Value, |
| 183 | Rewrites: raw.Rewrites, |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 184 | }, nil |
| 185 | |
| 186 | default: |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 187 | // Store the original JSON content |
| 188 | rawContent, err := json.Marshal(raw) |
| 189 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 190 | return nil, fmt.Errorf("failed to marshal unknown node type '%s': %w", raw.Type, err) |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 191 | } |
| 192 | |
| 193 | // Create a catchall node |
| 194 | catchall := &ast.CatchallNode{ |
| 195 | NodeType: raw.Type, |
| 196 | RawContent: rawContent, |
| 197 | } |
| 198 | |
| 199 | // Parse wrap if present |
| 200 | if raw.Wrap != nil { |
| 201 | var wrapRaw rawNode |
| 202 | if err := json.Unmarshal(raw.Wrap, &wrapRaw); err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 203 | return nil, fmt.Errorf("failed to parse 'wrap' field in unknown node type '%s': %w", raw.Type, err) |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 204 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 205 | |
| 206 | // Check if the wrapped node is a known type |
| 207 | if wrapRaw.Type == "koral:term" || wrapRaw.Type == "koral:token" || wrapRaw.Type == "koral:termGroup" { |
| 208 | wrap, err := parseNode(wrapRaw) |
| 209 | if err != nil { |
| 210 | return nil, fmt.Errorf("error parsing wrapped node in unknown node type '%s': %w", raw.Type, err) |
| 211 | } |
| 212 | catchall.Wrap = wrap |
| 213 | } else { |
| 214 | // For unknown types, recursively parse |
| 215 | wrap, err := parseNode(wrapRaw) |
| 216 | if err != nil { |
| 217 | return nil, fmt.Errorf("error parsing wrapped node in unknown node type '%s': %w", raw.Type, err) |
| 218 | } |
| 219 | catchall.Wrap = wrap |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 220 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 221 | } |
| 222 | |
| 223 | // Parse operands if present |
| 224 | if len(raw.Operands) > 0 { |
| 225 | operands := make([]ast.Node, len(raw.Operands)) |
| 226 | for i, op := range raw.Operands { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 227 | // Check if the operand is a known type |
| 228 | if op.Type == "koral:term" || op.Type == "koral:token" || op.Type == "koral:termGroup" { |
| 229 | node, err := parseNode(op) |
| 230 | if err != nil { |
| 231 | return nil, fmt.Errorf("error parsing operand %d in unknown node type '%s': %w", i+1, raw.Type, err) |
| 232 | } |
| 233 | operands[i] = node |
| 234 | } else { |
| 235 | // For unknown types, recursively parse |
| 236 | node, err := parseNode(op) |
| 237 | if err != nil { |
| 238 | return nil, fmt.Errorf("error parsing operand %d in unknown node type '%s': %w", i+1, raw.Type, err) |
| 239 | } |
| 240 | operands[i] = node |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 241 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 242 | } |
| 243 | catchall.Operands = operands |
| 244 | } |
| 245 | |
| 246 | return catchall, nil |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 247 | } |
| 248 | } |
| 249 | |
| 250 | // SerializeToJSON converts an AST node back to JSON |
| 251 | func SerializeToJSON(node ast.Node) ([]byte, error) { |
Akron | 87948e8 | 2025-05-26 18:19:51 +0200 | [diff] [blame] | 252 | return json.MarshalIndent(nodeToRaw(node), "", " ") |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 253 | } |
| 254 | |
| 255 | // nodeToRaw converts an AST node to a raw node for JSON serialization |
| 256 | func nodeToRaw(node ast.Node) rawNode { |
| 257 | switch n := node.(type) { |
| 258 | case *ast.Token: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 259 | if n.Wrap == nil { |
| 260 | return rawNode{ |
| 261 | Type: "koral:token", |
| 262 | } |
| 263 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 264 | return rawNode{ |
| 265 | Type: "koral:token", |
| 266 | Wrap: json.RawMessage(nodeToRaw(n.Wrap).toJSON()), |
| 267 | } |
| 268 | |
| 269 | case *ast.TermGroup: |
| 270 | operands := make([]rawNode, len(n.Operands)) |
| 271 | for i, op := range n.Operands { |
| 272 | operands[i] = nodeToRaw(op) |
| 273 | } |
| 274 | return rawNode{ |
| 275 | Type: "koral:termGroup", |
| 276 | Operands: operands, |
| 277 | Relation: "relation:" + string(n.Relation), |
| 278 | } |
| 279 | |
| 280 | case *ast.Term: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 281 | raw := rawNode{ |
| 282 | Type: "koral:term", |
| 283 | Key: n.Key, |
| 284 | Match: "match:" + string(n.Match), |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 285 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 286 | if n.Foundry != "" { |
| 287 | raw.Foundry = n.Foundry |
| 288 | } |
| 289 | if n.Layer != "" { |
| 290 | raw.Layer = n.Layer |
| 291 | } |
| 292 | if n.Value != "" { |
| 293 | raw.Value = n.Value |
| 294 | } |
| 295 | return raw |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 296 | |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 297 | case *ast.CatchallNode: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 298 | // For catchall nodes, use the stored raw content if available |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 299 | if n.RawContent != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 300 | var raw rawNode |
| 301 | if err := json.Unmarshal(n.RawContent, &raw); err == nil { |
| 302 | // Ensure we preserve the node type |
| 303 | raw.Type = n.NodeType |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 304 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 305 | // Handle wrap and operands if present |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 306 | if n.Wrap != nil { |
| 307 | raw.Wrap = json.RawMessage(nodeToRaw(n.Wrap).toJSON()) |
| 308 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 309 | if len(n.Operands) > 0 { |
| 310 | operands := make([]rawNode, len(n.Operands)) |
| 311 | for i, op := range n.Operands { |
| 312 | operands[i] = nodeToRaw(op) |
| 313 | } |
| 314 | raw.Operands = operands |
| 315 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 316 | return raw |
| 317 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 318 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 319 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 320 | // If RawContent is nil or invalid, create a minimal raw node |
| 321 | raw := rawNode{ |
| 322 | Type: n.NodeType, |
| 323 | } |
| 324 | if n.Wrap != nil { |
| 325 | raw.Wrap = json.RawMessage(nodeToRaw(n.Wrap).toJSON()) |
| 326 | } |
| 327 | if len(n.Operands) > 0 { |
| 328 | operands := make([]rawNode, len(n.Operands)) |
| 329 | for i, op := range n.Operands { |
| 330 | operands[i] = nodeToRaw(op) |
| 331 | } |
| 332 | raw.Operands = operands |
| 333 | } |
| 334 | return raw |
| 335 | } |
| 336 | |
| 337 | // Return a minimal raw node for unknown types |
| 338 | return rawNode{ |
| 339 | Type: "koral:unknown", |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 340 | } |
| 341 | } |
| 342 | |
| 343 | // toJSON converts a raw node to JSON bytes |
| 344 | func (r rawNode) toJSON() []byte { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 345 | data, err := json.Marshal(r) |
| 346 | if err != nil { |
| 347 | // Return a minimal valid JSON object if marshaling fails |
| 348 | return []byte(`{"@type":"koral:unknown"}`) |
| 349 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 350 | return data |
| 351 | } |