Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 1 | package parser |
| 2 | |
// This file converts JSON nodes (objects tagged with an "@type" member) into
// AST nodes from package ast and serializes such AST nodes back to JSON.
| 5 | |
import (
	"bytes"
	"encoding/json"
	"fmt"
	"maps"
	"strings"

	"github.com/KorAP/KoralPipe-TermMapper/ast"
)
| 15 | |
// rawNode is the wire-level form of a single JSON node. The named fields are
// the members this package interprets; every other member of the object is
// kept in Extra so that it survives a decode/encode round trip.
type rawNode struct {
	Type     string          `json:"@type"`
	Wrap     json.RawMessage `json:"wrap,omitempty"`
	Operands []rawNode       `json:"operands,omitempty"`
	Relation string          `json:"relation,omitempty"`
	Foundry  string          `json:"foundry,omitempty"`
	Key      string          `json:"key,omitempty"`
	Layer    string          `json:"layer,omitempty"`
	Match    string          `json:"match,omitempty"`
	Value    string          `json:"value,omitempty"`
	// Extra holds all members that have no dedicated field above. Numeric
	// values are stored as json.Number (see UnmarshalJSON).
	Extra map[string]any `json:"-"`
}

// UnmarshalJSON implements the json.Unmarshaler interface.
//
// The input is decoded twice: once generically to discover every member of
// the object, and once into the named fields. Members without a named field
// end up in Extra. It returns the decoding error of whichever pass fails.
func (r *rawNode) UnmarshalJSON(data []byte) error {
	// Generic pass. UseNumber keeps numeric literals as json.Number instead
	// of float64, so pass-through values (e.g. integers above 2^53) are
	// written back digit for digit rather than being rounded.
	var members map[string]any
	dec := json.NewDecoder(bytes.NewReader(data))
	dec.UseNumber()
	if err := dec.Decode(&members); err != nil {
		return err
	}

	// Typed pass. The local alias has the same fields but none of rawNode's
	// methods, which prevents this method from calling itself recursively.
	type fieldsOnly rawNode
	var known fieldsOnly
	if err := json.Unmarshal(data, &known); err != nil {
		return err
	}
	*r = rawNode(known)

	// Everything that is not one of the named members goes into Extra.
	r.Extra = make(map[string]any)
	for name, value := range members {
		switch name {
		case "@type", "wrap", "operands", "relation", "foundry", "key", "layer", "match", "value":
			continue
		}
		r.Extra[name] = value
	}

	return nil
}

// MarshalJSON implements the json.Marshaler interface.
//
// "@type" is always written. The remaining named fields are written only
// when they are non-empty, followed by all entries of Extra. An Extra entry
// whose key equals a named member replaces that member in the output.
func (r rawNode) MarshalJSON() ([]byte, error) {
	out := make(map[string]any, 9+len(r.Extra))

	out["@type"] = r.Type
	if r.Wrap != nil {
		out["wrap"] = r.Wrap
	}
	if len(r.Operands) > 0 {
		out["operands"] = r.Operands
	}

	// Plain string members share the same "omit when empty" rule.
	for _, member := range [...]struct{ name, value string }{
		{"relation", r.Relation},
		{"foundry", r.Foundry},
		{"key", r.Key},
		{"layer", r.Layer},
		{"match", r.Match},
		{"value", r.Value},
	} {
		if member.value != "" {
			out[member.name] = member.value
		}
	}

	maps.Copy(out, r.Extra)

	return json.Marshal(out)
}
| 98 | |
| 99 | // ParseJSON parses a JSON string into our AST representation |
| 100 | func ParseJSON(data []byte) (ast.Node, error) { |
| 101 | var raw rawNode |
| 102 | if err := json.Unmarshal(data, &raw); err != nil { |
| 103 | return nil, fmt.Errorf("failed to parse JSON: %w", err) |
| 104 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 105 | if raw.Type == "" { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 106 | return nil, fmt.Errorf("missing required field '@type' in JSON") |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 107 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 108 | return parseNode(raw) |
| 109 | } |
| 110 | |
| 111 | // parseNode converts a raw node into an AST node |
| 112 | func parseNode(raw rawNode) (ast.Node, error) { |
| 113 | switch raw.Type { |
| 114 | case "koral:token": |
| 115 | if raw.Wrap == nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 116 | return nil, fmt.Errorf("token node of type '%s' missing required 'wrap' field", raw.Type) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 117 | } |
| 118 | var wrapRaw rawNode |
| 119 | if err := json.Unmarshal(raw.Wrap, &wrapRaw); err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 120 | return nil, fmt.Errorf("failed to parse 'wrap' field in token node: %w", err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 121 | } |
| 122 | wrap, err := parseNode(wrapRaw) |
| 123 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 124 | return nil, fmt.Errorf("error parsing wrapped node: %w", err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 125 | } |
| 126 | return &ast.Token{Wrap: wrap}, nil |
| 127 | |
| 128 | case "koral:termGroup": |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 129 | if len(raw.Operands) == 0 { |
| 130 | return nil, fmt.Errorf("term group must have at least one operand") |
| 131 | } |
| 132 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 133 | operands := make([]ast.Node, len(raw.Operands)) |
| 134 | for i, op := range raw.Operands { |
| 135 | node, err := parseNode(op) |
| 136 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 137 | return nil, fmt.Errorf("error parsing operand %d: %w", i+1, err) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 138 | } |
| 139 | operands[i] = node |
| 140 | } |
| 141 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 142 | if raw.Relation == "" { |
| 143 | return nil, fmt.Errorf("term group must have a 'relation' field") |
| 144 | } |
| 145 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 146 | relation := ast.AndRelation |
| 147 | if strings.HasSuffix(raw.Relation, "or") { |
| 148 | relation = ast.OrRelation |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 149 | } else if !strings.HasSuffix(raw.Relation, "and") { |
| 150 | return nil, fmt.Errorf("invalid relation type '%s', must be one of: 'relation:and', 'relation:or'", raw.Relation) |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 151 | } |
| 152 | |
| 153 | return &ast.TermGroup{ |
| 154 | Operands: operands, |
| 155 | Relation: relation, |
| 156 | }, nil |
| 157 | |
| 158 | case "koral:term": |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 159 | if raw.Key == "" { |
| 160 | return nil, fmt.Errorf("term must have a 'key' field") |
| 161 | } |
| 162 | |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 163 | match := ast.MatchEqual |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 164 | if raw.Match != "" { |
| 165 | if strings.HasSuffix(raw.Match, "ne") { |
| 166 | match = ast.MatchNotEqual |
| 167 | } else if !strings.HasSuffix(raw.Match, "eq") { |
| 168 | return nil, fmt.Errorf("invalid match type '%s', must be one of: 'match:eq', 'match:ne'", raw.Match) |
| 169 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 170 | } |
| 171 | |
| 172 | return &ast.Term{ |
| 173 | Foundry: raw.Foundry, |
| 174 | Key: raw.Key, |
| 175 | Layer: raw.Layer, |
| 176 | Match: match, |
| 177 | Value: raw.Value, |
| 178 | }, nil |
| 179 | |
| 180 | default: |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 181 | // Store the original JSON content |
| 182 | rawContent, err := json.Marshal(raw) |
| 183 | if err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 184 | return nil, fmt.Errorf("failed to marshal unknown node type '%s': %w", raw.Type, err) |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 185 | } |
| 186 | |
| 187 | // Create a catchall node |
| 188 | catchall := &ast.CatchallNode{ |
| 189 | NodeType: raw.Type, |
| 190 | RawContent: rawContent, |
| 191 | } |
| 192 | |
| 193 | // Parse wrap if present |
| 194 | if raw.Wrap != nil { |
| 195 | var wrapRaw rawNode |
| 196 | if err := json.Unmarshal(raw.Wrap, &wrapRaw); err != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 197 | return nil, fmt.Errorf("failed to parse 'wrap' field in unknown node type '%s': %w", raw.Type, err) |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 198 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 199 | |
| 200 | // Check if the wrapped node is a known type |
| 201 | if wrapRaw.Type == "koral:term" || wrapRaw.Type == "koral:token" || wrapRaw.Type == "koral:termGroup" { |
| 202 | wrap, err := parseNode(wrapRaw) |
| 203 | if err != nil { |
| 204 | return nil, fmt.Errorf("error parsing wrapped node in unknown node type '%s': %w", raw.Type, err) |
| 205 | } |
| 206 | catchall.Wrap = wrap |
| 207 | } else { |
| 208 | // For unknown types, recursively parse |
| 209 | wrap, err := parseNode(wrapRaw) |
| 210 | if err != nil { |
| 211 | return nil, fmt.Errorf("error parsing wrapped node in unknown node type '%s': %w", raw.Type, err) |
| 212 | } |
| 213 | catchall.Wrap = wrap |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 214 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 215 | } |
| 216 | |
| 217 | // Parse operands if present |
| 218 | if len(raw.Operands) > 0 { |
| 219 | operands := make([]ast.Node, len(raw.Operands)) |
| 220 | for i, op := range raw.Operands { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 221 | // Check if the operand is a known type |
| 222 | if op.Type == "koral:term" || op.Type == "koral:token" || op.Type == "koral:termGroup" { |
| 223 | node, err := parseNode(op) |
| 224 | if err != nil { |
| 225 | return nil, fmt.Errorf("error parsing operand %d in unknown node type '%s': %w", i+1, raw.Type, err) |
| 226 | } |
| 227 | operands[i] = node |
| 228 | } else { |
| 229 | // For unknown types, recursively parse |
| 230 | node, err := parseNode(op) |
| 231 | if err != nil { |
| 232 | return nil, fmt.Errorf("error parsing operand %d in unknown node type '%s': %w", i+1, raw.Type, err) |
| 233 | } |
| 234 | operands[i] = node |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 235 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 236 | } |
| 237 | catchall.Operands = operands |
| 238 | } |
| 239 | |
| 240 | return catchall, nil |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 241 | } |
| 242 | } |
| 243 | |
| 244 | // SerializeToJSON converts an AST node back to JSON |
| 245 | func SerializeToJSON(node ast.Node) ([]byte, error) { |
Akron | 87948e8 | 2025-05-26 18:19:51 +0200 | [diff] [blame] | 246 | return json.MarshalIndent(nodeToRaw(node), "", " ") |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 247 | } |
| 248 | |
| 249 | // nodeToRaw converts an AST node to a raw node for JSON serialization |
| 250 | func nodeToRaw(node ast.Node) rawNode { |
| 251 | switch n := node.(type) { |
| 252 | case *ast.Token: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 253 | if n.Wrap == nil { |
| 254 | return rawNode{ |
| 255 | Type: "koral:token", |
| 256 | } |
| 257 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 258 | return rawNode{ |
| 259 | Type: "koral:token", |
| 260 | Wrap: json.RawMessage(nodeToRaw(n.Wrap).toJSON()), |
| 261 | } |
| 262 | |
| 263 | case *ast.TermGroup: |
| 264 | operands := make([]rawNode, len(n.Operands)) |
| 265 | for i, op := range n.Operands { |
| 266 | operands[i] = nodeToRaw(op) |
| 267 | } |
| 268 | return rawNode{ |
| 269 | Type: "koral:termGroup", |
| 270 | Operands: operands, |
| 271 | Relation: "relation:" + string(n.Relation), |
| 272 | } |
| 273 | |
| 274 | case *ast.Term: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 275 | raw := rawNode{ |
| 276 | Type: "koral:term", |
| 277 | Key: n.Key, |
| 278 | Match: "match:" + string(n.Match), |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 279 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 280 | if n.Foundry != "" { |
| 281 | raw.Foundry = n.Foundry |
| 282 | } |
| 283 | if n.Layer != "" { |
| 284 | raw.Layer = n.Layer |
| 285 | } |
| 286 | if n.Value != "" { |
| 287 | raw.Value = n.Value |
| 288 | } |
| 289 | return raw |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 290 | |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 291 | case *ast.CatchallNode: |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 292 | // For catchall nodes, use the stored raw content if available |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 293 | if n.RawContent != nil { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 294 | var raw rawNode |
| 295 | if err := json.Unmarshal(n.RawContent, &raw); err == nil { |
| 296 | // Ensure we preserve the node type |
| 297 | raw.Type = n.NodeType |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 298 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 299 | // Handle wrap and operands if present |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 300 | if n.Wrap != nil { |
| 301 | raw.Wrap = json.RawMessage(nodeToRaw(n.Wrap).toJSON()) |
| 302 | } |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 303 | if len(n.Operands) > 0 { |
| 304 | operands := make([]rawNode, len(n.Operands)) |
| 305 | for i, op := range n.Operands { |
| 306 | operands[i] = nodeToRaw(op) |
| 307 | } |
| 308 | raw.Operands = operands |
| 309 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 310 | return raw |
| 311 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 312 | } |
Akron | 3295842 | 2025-05-16 16:33:05 +0200 | [diff] [blame] | 313 | |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 314 | // If RawContent is nil or invalid, create a minimal raw node |
| 315 | raw := rawNode{ |
| 316 | Type: n.NodeType, |
| 317 | } |
| 318 | if n.Wrap != nil { |
| 319 | raw.Wrap = json.RawMessage(nodeToRaw(n.Wrap).toJSON()) |
| 320 | } |
| 321 | if len(n.Operands) > 0 { |
| 322 | operands := make([]rawNode, len(n.Operands)) |
| 323 | for i, op := range n.Operands { |
| 324 | operands[i] = nodeToRaw(op) |
| 325 | } |
| 326 | raw.Operands = operands |
| 327 | } |
| 328 | return raw |
| 329 | } |
| 330 | |
| 331 | // Return a minimal raw node for unknown types |
| 332 | return rawNode{ |
| 333 | Type: "koral:unknown", |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 334 | } |
| 335 | } |
| 336 | |
| 337 | // toJSON converts a raw node to JSON bytes |
| 338 | func (r rawNode) toJSON() []byte { |
Akron | 56e09e7 | 2025-05-22 15:38:35 +0200 | [diff] [blame] | 339 | data, err := json.Marshal(r) |
| 340 | if err != nil { |
| 341 | // Return a minimal valid JSON object if marshaling fails |
| 342 | return []byte(`{"@type":"koral:unknown"}`) |
| 343 | } |
Akron | b7e1f35 | 2025-05-16 15:45:23 +0200 | [diff] [blame] | 344 | return data |
| 345 | } |