blob: 40bd9d73cbe0fe286e4b35e273537bcfbfb3e72d [file] [log] [blame]
Joachim Bingel4b405f52013-11-15 15:29:30 +00001package de.ids_mannheim.korap.query.serialize;
2
Joachim Bingelb5ada902013-11-19 14:46:04 +00003import de.ids_mannheim.korap.query.PoliqarpPlusLexer;
4import de.ids_mannheim.korap.query.PoliqarpPlusParser;
Joachim Bingel16da4e12013-12-17 09:48:12 +00005import de.ids_mannheim.korap.util.QueryException;
Michael Hanld8116e52014-04-25 20:31:29 +00006import org.antlr.v4.runtime.*;
7import org.antlr.v4.runtime.tree.ParseTree;
Michael Hanl27e50582013-12-07 18:04:13 +00008import org.slf4j.Logger;
9import org.slf4j.LoggerFactory;
10
Michael Hanld8116e52014-04-25 20:31:29 +000011import java.lang.reflect.Method;
12import java.util.*;
13
Joachim Bingel4b405f52013-11-15 15:29:30 +000014/**
15 * Map representation of Poliqarp syntax tree as returned by ANTLR
Joachim Bingel4b405f52013-11-15 15:29:30 +000016 *
Michael Hanld8116e52014-04-25 20:31:29 +000017 * @author joachim
Joachim Bingel4b405f52013-11-15 15:29:30 +000018 */
Joachim Bingelc8a28e42014-04-24 15:06:42 +000019public class PoliqarpPlusTree extends Antlr4AbstractSyntaxTree {
Michael Hanl27e50582013-12-07 18:04:13 +000020
21 Logger log = LoggerFactory.getLogger(PoliqarpPlusTree.class);
Michael Hanld8116e52014-04-25 20:31:29 +000022 /**
23 * Top-level map representing the whole request.
24 */
25 LinkedHashMap<String, Object> requestMap = new LinkedHashMap<String, Object>();
26 /**
27 * Keeps track of open node categories
28 */
29 LinkedList<String> openNodeCats = new LinkedList<String>();
30 /**
31 * Flag that indicates whether token fields or meta fields are currently being processed
32 */
33 boolean inMeta = false;
34 /**
35 * Flag that indicates whether a cq_segment is to be ignored (e.g. when it is empty, is followed directly by only a spanclass and has no other children etc...).
36 */
37 boolean ignoreCq_segment = false;
38 /**
39 * Flag that indicates whether a cq_segments element is quantified by an occ element.
40 */
41 boolean cqHasOccSibling = false;
42 /**
43 * Flag that indicates whether a cq_segments' children are quantified by an occ element.
44 */
45 boolean cqHasOccChild = false;
46 /**
47 * Flag for negation of complete field
48 */
49 boolean negField = false;
50 /**
51 * Flag that indicates whether subsequent element is to be aligned.
52 */
53 boolean alignNext = false;
54 /**
55 * Flag that indicates whether current element has been aligned.
56 */
57 boolean isAligned = false;
58 /**
59 * Indicates a sequence which has an align operator as its child. Needed for deciding
60 * when to close the align group object.
61 */
Joachim Bingela67e6a32014-01-02 18:35:24 +000062// ParseTree alignedSequence = null;
Michael Hanld8116e52014-04-25 20:31:29 +000063 /**
64 * Parser object deriving the ANTLR parse tree.
65 */
66 Parser parser;
67 /**
68 * Keeps track of all visited nodes in a tree
69 */
70 List<ParseTree> visited = new ArrayList<ParseTree>();
Joachim Bingel4b405f52013-11-15 15:29:30 +000071
Michael Hanld8116e52014-04-25 20:31:29 +000072 /**
73 * Keeps track of active fields (like 'base=foo').
74 */
75 LinkedList<ArrayList<Object>> fieldStack = new LinkedList<ArrayList<Object>>();
76 /**
77 * Keeps track of active tokens.
78 */
79 LinkedList<LinkedHashMap<String, Object>> tokenStack = new LinkedList<LinkedHashMap<String, Object>>();
80 /**
81 * Marks the currently active token in order to know where to add flags (might already have been taken away from token stack).
82 */
83 LinkedHashMap<String, Object> curToken = new LinkedHashMap<String, Object>();
84 /**
85 * Keeps track of active object.
86 */
87 LinkedList<LinkedHashMap<String, Object>> objectStack = new LinkedList<LinkedHashMap<String, Object>>();
88 /**
89 * Marks the object to which following occurrence information is to be added.
90 */
91 LinkedHashMap<String, Object> curOccGroup = new LinkedHashMap<String, Object>();
92 /**
93 * Keeps track of how many objects there are to pop after every recursion of {@link #processNode(ParseTree)}
94 */
95 LinkedList<Integer> objectsToPop = new LinkedList<Integer>();
96 /**
97 * Keeps track of how many objects there are to pop after every recursion of {@link #processNode(ParseTree)}
98 */
99 LinkedList<Integer> tokensToPop = new LinkedList<Integer>();
100 /**
101 * Keeps track of how many objects there are to pop after every recursion of {@link #processNode(ParseTree)}
102 */
103 LinkedList<Integer> fieldsToPop = new LinkedList<Integer>();
104 /**
105 * If true, print debug statements
106 */
107 public static boolean verbose = false;
108 /**
109 * Index of the current child of its parent (needed for relating occ elements to their operands).
110 */
111 int curChildIndex = 0;
112 /**
113 *
114 */
115 Integer stackedObjects = 0;
116 Integer stackedTokens = 0;
117 Integer stackedFields = 0;
118
119
120 /**
121 * Most centrally, this class maintains a set of nested maps and lists which represent the JSON tree, which is built by the JSON serialiser
122 * on basis of the {@link #requestMap} at the root of the tree.
123 * <br/>
124 * The class further maintains a set of stacks which effectively keep track of which objects to embed in which containing objects.
125 *
126 * @param query The syntax tree as returned by ANTLR
127 * @throws QueryException
128 */
129 public PoliqarpPlusTree(String query) throws QueryException {
130 try {
131 process(query);
132 } catch (NullPointerException e) {
133 if (query.contains(" ")) {
134 System.err.println("Warning: It seems like your query contains illegal whitespace characters. Trying again with whitespaces removed...");
135 query = query.replaceAll(" ", "");
136 process(query);
137 } else {
138 throw new QueryException("Error handling query.");
139 }
140 }
141 System.out.println(">>> " + requestMap.get("query") + " <<<");
Michael Hanl27e50582013-12-07 18:04:13 +0000142 log.info(">>> " + requestMap.get("query") + " <<<");
Michael Hanld8116e52014-04-25 20:31:29 +0000143 }
Joachim Bingel4b405f52013-11-15 15:29:30 +0000144
Michael Hanld8116e52014-04-25 20:31:29 +0000145 @Override
146 public Map<String, Object> getRequestMap() {
147 return requestMap;
148 }
149
150 @Override
151 public void process(String query) throws QueryException {
152 ParseTree tree = null;
153 try {
154 tree = parsePoliqarpQuery(query);
155 } catch (QueryException e) {
156 // if the second time query could not be parsed, throw exception!
157 tree = parsePoliqarpQuery(query.replaceAll(" ", ""));
158 }
159 super.parser = this.parser;
160 System.out.println("Processing PoliqarpPlus");
161 requestMap.put("@context", "http://ids-mannheim.de/ns/KorAP/json-ld/v0.1/context.jsonld");
Joachim Bingelc8a28e42014-04-24 15:06:42 +0000162// prepareContext(requestMap);
Michael Hanld8116e52014-04-25 20:31:29 +0000163 processNode(tree);
164 }
165
166 /**
167 * Recursively calls itself with the children of the currently active node, traversing the tree nodes in a top-down, depth-first fashion.
168 * A list is maintained that contains all visited nodes
169 * in case they have been directly addressed by its (grand-/grand-grand-/...) parent node, such that some processing time is saved, as these node will
170 * not be processed. This method is effectively a list of if-statements that are responsible for treating the different node types correctly and filling the
171 * respective maps/lists.
172 *
173 * @param node The currently processed node. The process(String query) method calls this method with the root.
174 * @throws QueryException
175 */
176 @SuppressWarnings("unchecked")
177 private void processNode(ParseTree node) throws QueryException {
178 // Top-down processing
179 if (visited.contains(node)) return;
180 else visited.add(node);
181
182 if (alignNext) {
183 alignNext = false;
184 isAligned = true;
185 }
186
187 String nodeCat = getNodeCat(node);
188 openNodeCats.push(nodeCat);
189
190 stackedObjects = 0;
191 stackedTokens = 0;
192 stackedFields = 0;
193
194 if (verbose) {
195 System.err.println(" " + objectStack);
Joachim Bingelef7059b2014-04-22 13:44:19 +0000196// System.err.println(" "+tokenStack);
Michael Hanld8116e52014-04-25 20:31:29 +0000197 System.out.println(openNodeCats);
198 }
199
Joachim Bingel4b405f52013-11-15 15:29:30 +0000200
201 /*
Michael Hanld8116e52014-04-25 20:31:29 +0000202 ****************************************************************
Joachim Bingel4b405f52013-11-15 15:29:30 +0000203 ****************************************************************
204 * Processing individual node categories *
205 ****************************************************************
206 ****************************************************************
207 */
Joachim Bingel4b405f52013-11-15 15:29:30 +0000208
Michael Hanld8116e52014-04-25 20:31:29 +0000209 // cq_segments/sq_segments: token group
210 if (nodeCat.equals("cq_segments") || nodeCat.equals("sq_segments")) {
211 cqHasOccSibling = false;
212 cqHasOccChild = false;
213 // disregard empty segments in simple queries (parsed by ANTLR as empty cq_segments)
214 ignoreCq_segment = (node.getChildCount() == 1 && (node.getChild(0).toStringTree(parser).equals(" ") || getNodeCat(node.getChild(0)).equals("spanclass") || getNodeCat(node.getChild(0)).equals("position")));
215 // ignore this node if it only serves as an aligned sequence container
216 if (node.getChildCount() > 1) {
217 if (getNodeCat(node.getChild(1)).equals("cq_segments") && hasChild(node.getChild(1), "alignment")) {
Joachim Bingelc8a28e42014-04-24 15:06:42 +0000218// if (getNodeCat(node.getChild(0)).equals("align")) {
Michael Hanld8116e52014-04-25 20:31:29 +0000219 ignoreCq_segment = true;
220 }
221 }
222 if (!ignoreCq_segment) {
223 LinkedHashMap<String, Object> sequence = new LinkedHashMap<String, Object>();
224 // Step 0: cq_segments has 'occ' child -> introduce group as super group to the sequence/token/group
225 // this requires creating a group and inserting it at a suitable place
226 if (node.getParent().getChildCount() > curChildIndex + 2 && getNodeCat(node.getParent().getChild(curChildIndex + 2)).equals("occ")) {
227 cqHasOccSibling = true;
228 createOccGroup(node);
229 }
230 if (getNodeCat(node.getChild(node.getChildCount() - 1)).equals("occ")) {
231 cqHasOccChild = true;
232 }
233 // Step I: decide type of element (one or more elements? -> token or sequence)
234 // take into account a possible 'occ' child with accompanying parantheses, therefore 3 extra children
235 int occExtraChildren = cqHasOccChild ? 3 : 0;
236 if (node.getChildCount() > 1 + occExtraChildren) {
237 ParseTree emptySegments = getFirstChildWithCat(node, "empty_segments");
238 if (emptySegments != null && emptySegments != node.getChild(0)) {
239 String[] minmax = parseEmptySegments(emptySegments);
240 Integer min = Integer.parseInt(minmax[0]);
241 Integer max = Integer.parseInt(minmax[1]);
242 sequence.put("@type", "korap:group");
243 sequence.put("operation", "operation:sequence");
244 sequence.put("inOrder", true);
245 ArrayList<Object> constraint = new ArrayList<Object>();
246 sequence.put("distances", constraint);
247 ArrayList<Object> sequenceOperands = new ArrayList<Object>();
248 sequence.put("operands", sequenceOperands);
249 objectStack.push(sequence);
250 stackedObjects++;
251 LinkedHashMap<String, Object> distMap = new LinkedHashMap<String, Object>();
252 constraint.add(distMap);
253 distMap.put("@type", "korap:distance");
254 distMap.put("key", "w");
255 distMap.put("min", min);
256 distMap.put("max", max);
257 } else {
258 sequence.put("@type", "korap:group");
259 sequence.put("operation", "operation:" + "sequence");
260 ArrayList<Object> sequenceOperands = new ArrayList<Object>();
261 if (emptySegments != null) {
262 String[] minmax = parseEmptySegments(emptySegments);
263 Integer min = Integer.parseInt(minmax[0]);
264 Integer max = Integer.parseInt(minmax[1]);
265 sequence.put("offset-min", min - 1);
266 sequence.put("offset-max", max - 1);
267 }
268 sequence.put("operands", sequenceOperands);
269 objectStack.push(sequence);
270 stackedObjects++;
271 }
272 } else {
273 // if only child, make the sequence a mere token...
274 // ... but only if it has a real token/element beneath it
275 if (!isContainerOnly(node)) {
276 sequence.put("@type", "korap:token");
277 tokenStack.push(sequence);
278 stackedTokens++;
279 objectStack.push(sequence);
280 stackedObjects++;
281 // else, it's a group (with shrink()/spanclass/align... as child)
282 } else {
Joachim Bingelef7059b2014-04-22 13:44:19 +0000283// sequence.put("@type", "korap:group");
Joachim Bingel2980bda2014-04-04 12:09:46 +0000284// objectStack.push(sequence);
285// stackedObjects++;
Michael Hanld8116e52014-04-25 20:31:29 +0000286 }
287 }
288 // Step II: decide where to put this element
289 // check if this is an argument for a containing occurrence group (see step 0)
290 if (cqHasOccSibling) {
291 ArrayList<Object> topGroupOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
292 topGroupOperands.add(sequence);
293 // ...if not modified by occurrence, put into appropriate super object
294 } else {
295 if (openNodeCats.get(1).equals("query")) {
296 // cq_segment is top query node
297 if (node.getParent().getChildCount() == 1) {
298 // only child
299 requestMap.put("query", sequence);
300 } else {
301 // not an only child, need to create containing sequence
302 if (node.getParent().getChild(0).equals(node)) {
303 // if first child, create containing sequence and embed there
304 LinkedHashMap<String, Object> superSequence = new LinkedHashMap<String, Object>();
305 superSequence.put("@type", "korap:group");
306 superSequence.put("operation", "operation:" + "sequence");
307 ArrayList<Object> operands = new ArrayList<Object>();
308 superSequence.put("operands", operands);
309 operands.add(sequence);
310 requestMap.put("query", superSequence);
311 objectStack.push(superSequence); // add at 2nd position to keep current cq_segment accessible
312 stackedObjects++;
313 } else {
314 // if not first child, add to previously created parent sequence
315 ArrayList<Object> topSequenceOperands;
316 try {
317 topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
318 } catch (IndexOutOfBoundsException e) {
319 // Normally, the current element has been added to the object stack, so the try-block works fine.
320 // In some cases however, the element is not added (see ultimate else-block in Step I), and we need a
321 // fallback to the first element in the object stack.
322 topSequenceOperands = (ArrayList<Object>) objectStack.get(0).get("operands");
323 }
324
325 topSequenceOperands.add(sequence);
326 }
327 }
328 } else if (!objectStack.isEmpty()) {
329 // embed in super sequence
330 ArrayList<Object> topSequenceOperands;
331 if (!isContainerOnly(node)) {
332 try {
333 topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
334 topSequenceOperands.add(sequence);
335 } catch (IndexOutOfBoundsException e) {
Joachim Bingelef7059b2014-04-22 13:44:19 +0000336// topSequenceOperands = (ArrayList<Object>) objectStack.get(0).get("operands");
Michael Hanld8116e52014-04-25 20:31:29 +0000337 }
338 }
339
340
341 }
342 }
343 }
344 }
345
346 // cq_segment
347 if (nodeCat.equals("cq_segment")) {
348 int onTopOfObjectStack = 0;
349 // Step I: determine whether to create new token or get token from the stack (if added by cq_segments)
350 LinkedHashMap<String, Object> token;
351 if (tokenStack.isEmpty()) {
352 token = new LinkedHashMap<String, Object>();
353 tokenStack.push(token);
354 stackedTokens++;
355 // do this only if token is newly created, otherwise it'll be in objectStack twice
356 objectStack.push(token);
357 onTopOfObjectStack = 1;
358 stackedObjects++;
359 } else {
360 // in case cq_segments has already added the token
361 token = tokenStack.getFirst();
362 }
363 curToken = token;
364 // Step II: start filling object and add to containing sequence
365 token.put("@type", "korap:token");
366 // add token to sequence only if it is not an only child (in that case, cq_segments has already added the info and is just waiting for the values from "field")
367 // take into account a possible 'occ' child
368 if (node.getParent().getChildCount() > 1) {
369 if (node.getText().equals("[]")) {
Joachim Bingelffd65e32014-01-22 14:22:57 +0000370// LinkedHashMap<String, Object> sequence = objectStack.get(onTopOfObjectStack);
371// String offsetStr = (String) sequence.get("offset");
372// if (offsetStr == null) {
373// sequence.put("offset", "1");
374// } else {
375// Integer offset = Integer.parseInt(offsetStr);
376// sequence.put("offset", offset+1);
377// }
378//
Michael Hanld8116e52014-04-25 20:31:29 +0000379 } else {
380 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(onTopOfObjectStack).get("operands");
381 topSequenceOperands.add(token);
382 }
383 }
384 }
Joachim Bingel4b405f52013-11-15 15:29:30 +0000385
Michael Hanld8116e52014-04-25 20:31:29 +0000386 // cq_segment modified by occurrence
387 if (nodeCat.equals("cq_seg_occ")) {
388 LinkedHashMap<String, Object> group = new LinkedHashMap<String, Object>();
389 curOccGroup = group;
390 group.put("@type", "korap:group");
391 group.put("operands", new ArrayList<Object>());
392 objectStack.push(group);
393 stackedObjects++;
394 // add group to sequence only if it is not an only child (in that case, cq_segments has already added the info and is just waiting for the values from "field")
395 // take into account a possible 'occ' child
396// if (node.getParent().getChildCount()>1) {
397 if (objectStack.size() > 1) {
398 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
399 topSequenceOperands.add(group);
400 } else {
401 requestMap.put("query", group);
402 }
403 }
404
405 // disjoint cq_segments, like ([base=foo][base=bar])|[base=foobar]
406 if (nodeCat.equals("cq_disj_segments")) {
407 LinkedHashMap<String, Object> disjunction = new LinkedHashMap<String, Object>();
408 objectStack.push(disjunction);
409 stackedObjects++;
410 ArrayList<Object> disjOperands = new ArrayList<Object>();
411 disjunction.put("@type", "korap:group");
412 disjunction.put("operation", "operation:" + "or");
413 disjunction.put("operands", disjOperands);
414 // decide where to put the disjunction
415 if (openNodeCats.get(1).equals("query")) {
416 requestMap.put("query", disjunction);
417 } else if (openNodeCats.get(1).equals("cq_segments")) {
418 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
419 topSequenceOperands.add(disjunction);
420 }
421 }
422
423 // field element (outside meta)
424 if (nodeCat.equals("field")) {
425 LinkedHashMap<String, Object> fieldMap = new LinkedHashMap<String, Object>();
426 // Step I: extract info
427 String layer = "";
428 String foundry = null;
429 String value = null;
430 String key = null;
431 ParseTree fieldNameNode = node.getChild(0);
432 if (fieldNameNode.getChildCount() == 1) {
433 layer = fieldNameNode.getChild(0).toStringTree(parser); //e.g. (field_name base) (field_op !=) (re_query "bar*")
434 } else if (fieldNameNode.getChildCount() == 3) {
435 // layer is indicated, merge layer and field name (0th and 2nd children, 1st is "/")
436 foundry = fieldNameNode.getChild(0).toStringTree(parser);
437 layer = fieldNameNode.getChild(2).toStringTree(parser);
Joachim Bingelac13a472014-02-14 21:18:52 +0000438// } else if (fieldNameNode.getChildCount() == 5) {
439// // layer and value are indicated
440// foundry = fieldNameNode.getChild(0).toStringTree(poliqarpParser);
441// layer = fieldNameNode.getChild(2).toStringTree(poliqarpParser);
442// value = fieldNameNode.getChild(4).toStringTree(poliqarpParser);
Michael Hanld8116e52014-04-25 20:31:29 +0000443 }
444 if (hasChild(node, "key")) {
445 ParseTree keyNode = getFirstChildWithCat(node, "key");
446 key = keyNode.getChild(0).toStringTree(parser);
447 }
448
449 String relation = node.getChild(1).getChild(0).toStringTree(parser);
450 if (negField) {
451 if (relation.startsWith("!")) {
452 relation = relation.substring(1);
453 } else {
454 relation = "!" + relation;
455 }
456 }
457 if (relation.equals("=")) {
458 relation = "eq";
459 } else if (relation.equals("!=")) {
460 relation = "ne";
461 }
462
463 ParseTree valNode;
464 if (hasChild(node, "key")) valNode = node.getChild(3);
465 else valNode = node.getChild(2);
466 String valType = getNodeCat(valNode);
467 fieldMap.put("@type", "korap:term");
468 if (valType.equals("simple_query")) {
469 value = valNode.getChild(0).getChild(0).toStringTree(parser); //e.g. (simple_query (sq_segment foo))
470 } else if (valType.equals("re_query")) {
471 value = valNode.getChild(0).toStringTree(parser); //e.g. (re_query "bar*")
472 fieldMap.put("type", "type:regex");
473 value = value.substring(1, value.length() - 1); //remove trailing quotes
474 }
475 if (key == null) {
476 fieldMap.put("key", value);
477 } else {
478 fieldMap.put("key", key);
479 fieldMap.put("value", value);
480 }
481
482 if (layer.equals("base")) layer = "lemma";
483 fieldMap.put("layer", layer);
484 if (foundry != null) fieldMap.put("foundry", foundry);
485
486 fieldMap.put("match", "match:" + relation);
487 // Step II: decide where to put the field map (as the only value of a token or the meta filter or as a part of a group in case of coordinated fields)
488 if (fieldStack.isEmpty()) {
489 if (!inMeta) {
490 tokenStack.getFirst().put("wrap", fieldMap);
491 } else {
492 ((HashMap<String, Object>) requestMap.get("meta")).put("key", fieldMap);
493 }
494 } else {
495 fieldStack.getFirst().add(fieldMap);
496 }
497 visited.add(node.getChild(0));
498 visited.add(node.getChild(1));
499 visited.add(node.getChild(2));
500 if (key != null) visited.add(node.getChild(3));
501 }
502
503 if (nodeCat.equals("neg_field") || nodeCat.equals("neg_field_group")) {
504 negField = !negField;
505 }
506
507 // conj_field serves for both conjunctions and disjunctions
508 if (nodeCat.equals("conj_field")) {
509 LinkedHashMap<String, Object> group = new LinkedHashMap<String, Object>();
510
511 group.put("@type", "korap:termGroup");
512
513 // Step I: get operator (& or |)
514 ParseTree operatorNode = node.getChild(1).getChild(0);
515 String operator = getNodeCat(operatorNode);
516 String relation = operator.equals("&") ? "and" : "or";
517 if (negField) {
518 relation = relation.equals("or") ? "and" : "or";
519 }
520 group.put("relation", "relation:" + relation);
521 ArrayList<Object> groupOperands = new ArrayList<Object>();
522 group.put("operands", groupOperands);
523 fieldStack.push(groupOperands);
524 stackedFields++;
525 // Step II: decide where to put the group (directly under token or in top meta filter section or embed in super group)
526 if (openNodeCats.get(1).equals("cq_segment")) {
527 tokenStack.getFirst().put("wrap", group);
528 } else if (openNodeCats.get(1).equals("meta_field_group")) {
529 ((HashMap<String, Object>) requestMap.get("meta")).put("key", group);
530 } else if (openNodeCats.get(2).equals("conj_field")) {
531 fieldStack.get(1).add(group);
532 } else {
533 tokenStack.getFirst().put("wrap", group);
534 }
535 // skip the operator
536 visited.add(node.getChild(1));
537 }
538
539
540 if (nodeCat.equals("sq_segment")) {
541 // Step I: determine whether to create new token or get token from the stack (if added by cq_segments)
542 LinkedHashMap<String, Object> token;
543 if (tokenStack.isEmpty()) {
544 token = new LinkedHashMap<String, Object>();
545 tokenStack.push(token);
546 stackedTokens++;
547 } else {
548 // in case sq_segments has already added the token
549 token = tokenStack.getFirst();
550 }
551 curToken = token;
552 objectStack.push(token);
553 stackedObjects++;
554 // Step II: fill object (token values) and put into containing sequence
555 if (node.getText().equals("[]")) {
556
557 } else {
558 token.put("@type", "korap:token");
559 String word = node.getChild(0).toStringTree(parser);
560 LinkedHashMap<String, Object> tokenValues = new LinkedHashMap<String, Object>();
561 token.put("wrap", tokenValues);
562 tokenValues.put("@type", "korap:term");
563 tokenValues.put("key", word);
564 tokenValues.put("layer", "orth");
565 tokenValues.put("match", "match:" + "eq");
566 // add token to sequence only if it is not an only child (in that case, sq_segments has already added the info and is just waiting for the values from "field")
567 if (node.getParent().getChildCount() > 1) {
568 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
569 topSequenceOperands.add(token);
570 }
571 }
572 visited.add(node.getChild(0));
573 }
574
575 if (nodeCat.equals("re_query")) {
576 LinkedHashMap<String, Object> reQuery = new LinkedHashMap<String, Object>();
577 reQuery.put("type", "type:regex");
578 String regex = node.getChild(0).toStringTree(parser);
579 reQuery.put("key", regex);
580 reQuery.put("match", "match:" + "eq");
581
582 // if in field, regex was already added there
583 if (!openNodeCats.get(1).equals("field")) {
584 LinkedHashMap<String, Object> token = new LinkedHashMap<String, Object>();
585 token.put("@type", "korap:token");
586 token.put("wrap", reQuery);
587 reQuery.put("@type", "korap:term");
588
589 if (openNodeCats.get(1).equals("query")) {
590 requestMap.put("query", token);
591 } else {
592 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
593 topSequenceOperands.add(token);
594 }
595 }
596 }
597
598 if (nodeCat.equals("alignment")) {
599 alignNext = true;
600 LinkedHashMap<String, Object> alignGroup = new LinkedHashMap<String, Object>();
601 // push but don't increase the stackedObjects counter in order to keep this
602 // group open until the mother cq_segments node will be closed, since the
603 // operands are siblings of this align node rather than children, i.e. the group
604 // would be removed from the stack before seeing its operands.
605 objectStack.push(alignGroup);
606 stackedObjects++;
607 // Step I: get info
608 // fill group
609 alignGroup.put("@type", "korap:group");
610 alignGroup.put("alignment", "left");
611 alignGroup.put("operands", new ArrayList<Object>());
612 // Step II: decide where to put the group
613 // add group to sequence only if it is not an only child (in that case, sq_segments has already added the info and is just waiting for the relevant info)
614 if (node.getParent().getChildCount() > 1) {
615 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
616 topSequenceOperands.add(alignGroup);
617 } else if (openNodeCats.get(2).equals("query")) {
618 requestMap.put("query", alignGroup);
619 } else {
620 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
621 topSequenceOperands.add(alignGroup);
622 }
623 visited.add(node.getChild(0));
624 }
625
626 if (nodeCat.equals("element")) {
627 // Step I: determine whether to create new token or get token from the stack (if added by cq_segments)
628 LinkedHashMap<String, Object> elem;
629 if (tokenStack.isEmpty()) {
630 elem = new LinkedHashMap<String, Object>();
631 } else {
632 // in case sq_segments has already added the token
633 elem = tokenStack.getFirst();
634 }
635 curToken = elem;
636 objectStack.push(elem);
637 stackedObjects++;
638 // Step II: fill object (token values) and put into containing sequence
639 elem.put("@type", "korap:span");
640 int valChildIdx = node.getChildCount() - 2; // closing '>' is last child
641 String value = node.getChild(valChildIdx).toStringTree(parser);
642 ParseTree foundryNode = getFirstChildWithCat(node, "foundry");
643 ParseTree layerNode = getFirstChildWithCat(node, "layer");
644 if (foundryNode != null) {
645 elem.put("foundry", foundryNode.getChild(0).toStringTree(parser));
646 }
647 if (layerNode != null) {
648 elem.put("layer", layerNode.getChild(0).toStringTree(parser));
649 }
650 elem.put("key", value);
651 // add token to sequence only if it is not an only child (in that case, cq_segments has already added the info and is just waiting for the values from "field")
652 if (node.getParent().getChildCount() > 1) {
653 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
654 topSequenceOperands.add(elem);
655 }
656 visited.add(node.getChild(0));
657 visited.add(node.getChild(1));
658 visited.add(node.getChild(2));
659 }
660
661 if (nodeCat.equals("spanclass")) {
662 LinkedHashMap<String, Object> span = new LinkedHashMap<String, Object>();
663 span.put("@type", "korap:group");
664 span.put("operation", "operation:" + "class");
665 objectStack.push(span);
666 stackedObjects++;
667 ArrayList<Object> spanOperands = new ArrayList<Object>();
668 // Step I: get info
669 int classId = 0;
670 if (getNodeCat(node.getChild(1)).equals("spanclass_id")) {
671 String ref = node.getChild(1).getChild(0).toStringTree(parser);
672 try {
673 classId = Integer.parseInt(ref);
674 } catch (NumberFormatException e) {
675 throw new QueryException("The specified class reference in the shrink/split-Operator is not a number: " + ref);
676 }
677 // only allow class id up to 255
678 if (classId > 255) {
679 classId = 0;
680 }
681 }
682 span.put("class", classId);
683 span.put("operands", spanOperands);
684 // Step II: decide where to put the span
685 // add span to sequence only if it is not an only child (in that case, cq_segments has already added the info and is just waiting for the relevant info)
686 if (openNodeCats.get(2).equals("query") && node.getParent().getChildCount() == 1) {
687 requestMap.put("query", span);
688 } else if (objectStack.size() > 1) {
689 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
690 topSequenceOperands.add(span);
691 }
692 // ignore leading and trailing braces
693 visited.add(node.getChild(0));
694 visited.add(node.getChild(node.getChildCount() - 1));
695 if (getNodeCat(node.getChild(1)).equals("spanclass_id")) {
696 visited.add(node.getChild(1));
697 }
698 }
699
700 if (nodeCat.equals("position")) {
701 LinkedHashMap<String, Object> positionGroup = new LinkedHashMap<String, Object>();
702 objectStack.push(positionGroup);
703 stackedObjects++;
704 ArrayList<Object> posOperands = new ArrayList<Object>();
705 // Step I: get info
706 String relation = getNodeCat(node.getChild(0));
707 positionGroup.put("@type", "korap:group");
708 positionGroup.put("operation", "operation:" + "position");
709 positionGroup.put("frame", "frame:" + relation.toLowerCase());
Joachim Bingelffd65e32014-01-22 14:22:57 +0000710// positionGroup.put("@subtype", "incl");
Michael Hanld8116e52014-04-25 20:31:29 +0000711 positionGroup.put("operands", posOperands);
712 // Step II: decide where to put the group
713 // add group to sequence only if it is not an only child (in that case, sq_segments has already added the info and is just waiting for the relevant info)
714 if (node.getParent().getChildCount() > 1) {
715 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
716 topSequenceOperands.add(positionGroup);
717 } else if (openNodeCats.get(2).equals("query")) {
718 requestMap.put("query", positionGroup);
719 } else {
720 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
721 topSequenceOperands.add(positionGroup);
722 }
723 }
724
725 if (nodeCat.equals("shrink")) {
726 LinkedHashMap<String, Object> shrinkGroup = new LinkedHashMap<String, Object>();
727 objectStack.push(shrinkGroup);
728 stackedObjects++;
729 ArrayList<Object> shrinkOperands = new ArrayList<Object>();
730 // Step I: get info
731 ArrayList<Integer> classRefs = new ArrayList<Integer>();
732 String classRefOp = null;
733 if (getNodeCat(node.getChild(2)).equals("spanclass_id")) {
734 ParseTree spanNode = node.getChild(2);
735 for (int i = 0; i < spanNode.getChildCount() - 1; i++) {
736 String ref = spanNode.getChild(i).getText();
737 if (ref.equals("|") || ref.equals("&")) {
738 classRefOp = ref.equals("|") ? "intersection" : "union";
739 } else {
740 try {
741 int classRef = Integer.parseInt(ref);
742 // only allow class id up to 255
743 if (classRef > 255) {
744 classRef = 0;
745 }
746 classRefs.add(classRef);
747 } catch (NumberFormatException e) {
748 throw new QueryException("The specified class reference in the shrink/split-Operator is not a number.");
749 }
750 }
751 }
752 } else {
753 classRefs.add(0);
754 }
755 shrinkGroup.put("@type", "korap:group");
756 String type = node.getChild(0).toStringTree(parser);
757 String operation = type.equals("shrink") ? "submatch" : "split";
758 shrinkGroup.put("operation", "operation:" + operation);
759 shrinkGroup.put("classRef", classRefs);
760 if (classRefOp != null) {
761 shrinkGroup.put("classRefOp", "classRefOp:" + classRefOp);
762 }
763 shrinkGroup.put("operands", shrinkOperands);
764 int i = 1;
765 // Step II: decide where to put the group
766 // add group to sequence only if it is not an only child (in that case, sq_segments has already added the info and is just waiting for the relevant info)
767 if (node.getParent().getChildCount() > 1) {
768 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(i).get("operands"); // this shrinkGroup is on top
769 topSequenceOperands.add(shrinkGroup);
770 } else if (openNodeCats.get(2).equals("query")) {
771 requestMap.put("query", shrinkGroup);
772 } else if (objectStack.size() > 1) {
773 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(i).get("operands");
774 topSequenceOperands.add(shrinkGroup);
775 }
776 visited.add(node.getChild(0));
777 }
778
779 // repetition of token group
780 if (nodeCat.equals("occ")) {
781 ParseTree occChild = node.getChild(0);
782 String repetition = occChild.toStringTree(parser);
783 int[] minmax = parseRepetition(repetition);
784 curOccGroup.put("operation", "operation:" + "repetition");
785 curOccGroup.put("min", minmax[0]);
786 curOccGroup.put("max", minmax[1]);
787 visited.add(occChild);
788 }
789
790 // flags for case sensitivity and whole-word-matching
791 if (nodeCat.equals("flag")) {
792 String flag = getNodeCat(node.getChild(0)).substring(1); //substring removes leading slash '/'
793 // add to current token's value
794 if (flag.contains("i")) ((HashMap<String, Object>) curToken.get("wrap")).put("caseInsensitive", true);
795 else if (flag.contains("I")) ((HashMap<String, Object>) curToken.get("wrap")).put("caseInsensitive", false);
796 else ((HashMap<String, Object>) curToken.get("wrap")).put("flag", flag);
797 }
798
799 if (nodeCat.equals("meta")) {
800 inMeta = true;
801 LinkedHashMap<String, Object> metaFilter = new LinkedHashMap<String, Object>();
802 requestMap.put("meta", metaFilter);
803 metaFilter.put("@type", "korap:meta");
804 }
805
806 if (nodeCat.equals("within") && !getNodeCat(node.getParent()).equals("position")) {
807 ParseTree domainNode = node.getChild(2);
808 String domain = getNodeCat(domainNode);
809 LinkedHashMap<String, Object> curObject = (LinkedHashMap<String, Object>) objectStack.getFirst();
810 curObject.put("within", domain);
811 visited.add(node.getChild(0));
812 visited.add(node.getChild(1));
813 visited.add(domainNode);
814 }
815
816 objectsToPop.push(stackedObjects);
817 tokensToPop.push(stackedTokens);
818 fieldsToPop.push(stackedFields);
Joachim Bingel1417e192013-12-04 16:33:07 +0000819
Joachim Bingel4b405f52013-11-15 15:29:30 +0000820 /*
821 ****************************************************************
822 ****************************************************************
823 * recursion until 'request' node (root of tree) is processed *
Joachim Bingel7fd4b1b2013-12-04 09:04:40 +0000824 ****************************************************************
Joachim Bingel4b405f52013-11-15 15:29:30 +0000825 ****************************************************************
826 */
Michael Hanld8116e52014-04-25 20:31:29 +0000827 for (int i = 0; i < node.getChildCount(); i++) {
828 ParseTree child = node.getChild(i);
829 curChildIndex = i;
830 processNode(child);
831 }
832
833 // set negField back
834 if (nodeCat.equals("neg_field") || nodeCat.equals("neg_field_group")) {
835 negField = !negField;
836 }
837
838 // pop the align group that was introduced by previous 'align' but never closed
Joachim Bingel84e33df2014-01-31 14:02:46 +0000839// if (isAligned) {
840// isAligned=false;
841// objectStack.pop();
842// }
Joachim Bingel4b405f52013-11-15 15:29:30 +0000843
Michael Hanld8116e52014-04-25 20:31:29 +0000844 // Stuff that happens when leaving a node (taking items off the stacks)
845 for (int i = 0; i < objectsToPop.get(0); i++) {
846 objectStack.pop();
847 }
848 objectsToPop.pop();
849 for (int i = 0; i < tokensToPop.get(0); i++) {
850 tokenStack.pop();
851 }
852 tokensToPop.pop();
853 for (int i = 0; i < fieldsToPop.get(0); i++) {
854 fieldStack.pop();
855 }
856 fieldsToPop.pop();
857 openNodeCats.pop();
858 }
Joachim Bingelba9a0ab2014-01-29 10:12:25 +0000859
Michael Hanld8116e52014-04-25 20:31:29 +0000860 private int[] parseRepetition(String repetition) {
861 if (repetition.equals("*")) {
862 return new int[]{0, 100};
863 } else if (repetition.equals("+")) {
864 return new int[]{1, 100};
865 } else if (repetition.equals("?")) {
866 return new int[]{0, 1};
867 } else {
868 repetition = repetition.substring(1, repetition.length() - 1); // remove braces
869 String[] splitted = repetition.split(",");
870 if (splitted.length == 2) {
871 return new int[]{Integer.parseInt(splitted[0]), Integer.parseInt(splitted[1])};
872 } else {
873 return new int[]{Integer.parseInt(splitted[0]), Integer.parseInt(splitted[0])};
874 }
Joachim Bingelffd65e32014-01-22 14:22:57 +0000875
Michael Hanld8116e52014-04-25 20:31:29 +0000876 }
877 }
Joachim Bingel94a1ccd2013-12-10 10:37:29 +0000878
Michael Hanld8116e52014-04-25 20:31:29 +0000879 private String[] parseEmptySegments(ParseTree emptySegments) {
880 String[] minmax = new String[2];
881 Integer min = 1;
882 Integer max = 1;
883 ParseTree child;
884 for (int i = 0; i < emptySegments.getChildCount() - 1; i++) {
885 child = emptySegments.getChild(i);
886 ParseTree nextSibling = emptySegments.getChild(i + 1);
887 String nextSiblingString = nextSibling.toStringTree();
888 if (child.toStringTree().equals("[]")) {
889 if (nextSiblingString.equals("?")) {
890 max++;
891 } else if (nextSiblingString.startsWith("{")) {
892 String occ = nextSiblingString.substring(1, nextSiblingString.length() - 1);
893 System.out.println(occ);
894 if (occ.contains(",")) {
895 String[] minmaxOcc = occ.split(",");
896 min += Integer.parseInt(minmaxOcc[0]);
897 max += Integer.parseInt(minmaxOcc[1]);
898 } else {
899 min += Integer.parseInt(occ);
900 max += Integer.parseInt(occ);
901 }
902 } else {
903 min++;
904 max++;
905 }
906 }
907 }
908 child = emptySegments.getChild(emptySegments.getChildCount() - 1);
909 if (child.toStringTree().equals("[]")) {
910 min++;
911 max++;
912 }
913 minmax[0] = min.toString();
914 minmax[1] = max.toString();
915 return minmax;
916 }
Joachim Bingelb5f7bf02014-01-07 16:36:54 +0000917
Michael Hanld8116e52014-04-25 20:31:29 +0000918 @SuppressWarnings("unchecked")
919 private void createOccGroup(ParseTree node) {
920 LinkedHashMap<String, Object> occGroup = new LinkedHashMap<String, Object>();
921 occGroup.put("@type", "korap:group");
922 ArrayList<Object> groupOperands = new ArrayList<Object>();
923 occGroup.put("operands", groupOperands);
924 curOccGroup = occGroup;
925 objectStack.push(occGroup);
926 stackedObjects++;
927 // if only this group is on the object stack, add as top query element
928 if (objectStack.size() == 1) {
929 requestMap.put("query", occGroup);
930 // embed in super sequence
931 } else {
932 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) objectStack.get(1).get("operands");
933 topSequenceOperands.add(occGroup);
934 }
935 }
Joachim Bingel4b405f52013-11-15 15:29:30 +0000936
Joachim Bingel4b405f52013-11-15 15:29:30 +0000937
Michael Hanld8116e52014-04-25 20:31:29 +0000938 private ParserRuleContext parsePoliqarpQuery(String p) throws QueryException {
939 checkUnbalancedPars(p);
Joachim Bingel4b405f52013-11-15 15:29:30 +0000940
Michael Hanld8116e52014-04-25 20:31:29 +0000941 Lexer poliqarpLexer = new PoliqarpPlusLexer((CharStream) null);
942 ParserRuleContext tree = null;
943 // Like p. 111
944 try {
Joachim Bingel4b405f52013-11-15 15:29:30 +0000945
Michael Hanld8116e52014-04-25 20:31:29 +0000946 // Tokenize input data
947 ANTLRInputStream input = new ANTLRInputStream(p);
948 poliqarpLexer.setInputStream(input);
949 CommonTokenStream tokens = new CommonTokenStream(poliqarpLexer);
950 parser = new PoliqarpPlusParser(tokens);
951
952 // Don't throw out erroneous stuff
953 parser.setErrorHandler(new BailErrorStrategy());
954 parser.removeErrorListeners();
955
956 // Get starting rule from parser
957 Method startRule = PoliqarpPlusParser.class.getMethod("request");
958 tree = (ParserRuleContext) startRule.invoke(parser, (Object[]) null);
959 }
960
961 // Some things went wrong ...
962 catch (Exception e) {
963 System.err.println(e.getMessage());
964 }
965
966 if (tree == null) throw new QueryException(
967 "The query you specified could not be processed. Please make sure it is well-formed.");
968
969 // Return the generated tree
970 return tree;
971 }
972
973 public static void main(String[] args) {
Joachim Bingel4b405f52013-11-15 15:29:30 +0000974 /*
975 * For testing
976 */
Michael Hanld8116e52014-04-25 20:31:29 +0000977 String[] queries = new String[]{
978 "shrink(1|2:{1:[base=der]}{2:[base=Mann]})",
979 "{[base=Mann]}",
980 "shrink(1:[orth=Der]{1:[orth=Mann][orth=geht]})",
981 "[base=Mann/i]",
982 "[cnx/base=pos:n]",
983 "<cnx/c=np>",
984 "contains(<cnx/c=np>, [mate/pos=NE])",
985 "matches(<A>,[pos=N]*)",
986 "[base=Auto]matches(<A>,[][pos=N]{4})",
987 "[base=der][][base=Mann]"
988 };
Joachim Bingelc8a28e42014-04-24 15:06:42 +0000989// PoliqarpPlusTree.verbose=true;
Michael Hanld8116e52014-04-25 20:31:29 +0000990 for (String q : queries) {
991 try {
992 System.out.println(q);
Joachim Bingelc8a28e42014-04-24 15:06:42 +0000993// System.out.println(PoliqarpPlusTree.parsePoliqarpQuery(q).toStringTree(PoliqarpPlusTree.parser));
Michael Hanld8116e52014-04-25 20:31:29 +0000994 @SuppressWarnings("unused")
995 PoliqarpPlusTree pt = new PoliqarpPlusTree(q);
996 System.out.println(q);
997 System.out.println();
998
999 } catch (Exception npe) {
1000 npe.printStackTrace();
1001 System.out.println("null\n");
1002 }
1003 }
1004 }
Nils Diewald414a1682014-02-26 19:34:05 +00001005}