blob: aba18bb03e7fef3ef0940bec650a9569b83e8e80 [file] [log] [blame]
Joachim Bingel4b405f52013-11-15 15:29:30 +00001package de.ids_mannheim.korap.query.serialize;
2
3import java.lang.reflect.Method;
4import java.util.ArrayList;
5import java.util.HashMap;
6import java.util.LinkedHashMap;
7import java.util.LinkedList;
8import java.util.List;
9import java.util.Map;
10import java.util.regex.Matcher;
11import java.util.regex.Pattern;
12
13import org.antlr.v4.runtime.ANTLRInputStream;
14import org.antlr.v4.runtime.BailErrorStrategy;
15import org.antlr.v4.runtime.CharStream;
16import org.antlr.v4.runtime.CommonTokenStream;
17import org.antlr.v4.runtime.Lexer;
18import org.antlr.v4.runtime.Parser;
19import org.antlr.v4.runtime.ParserRuleContext;
20import org.antlr.v4.runtime.tree.ParseTree;
Joachim Bingel76b498d2014-06-06 12:06:46 +000021import org.slf4j.LoggerFactory;
Joachim Bingel4b405f52013-11-15 15:29:30 +000022
Joachim Bingelb5ada902013-11-19 14:46:04 +000023//import de.ids_mannheim.korap.query.poliqarp.PoliqarpLexer;
24//import de.ids_mannheim.korap.query.poliqarp.PoliqarpParser;
25import de.ids_mannheim.korap.query.PoliqarpLexer;
26import de.ids_mannheim.korap.query.PoliqarpParser;
27import de.ids_mannheim.korap.query.serialize.AbstractSyntaxTree;
Joachim Bingel4b405f52013-11-15 15:29:30 +000028
29/**
30 * Map representation of Poliqarp syntax tree as returned by ANTLR
31 * @author joachim
32 *
33 */
Joachim Bingelc8a28e42014-04-24 15:06:42 +000034public class PoliqarpTree extends Antlr4AbstractSyntaxTree {
Joachim Bingel76b498d2014-06-06 12:06:46 +000035 private static org.slf4j.Logger log = LoggerFactory
36 .getLogger(PoliqarpTree.class);
Joachim Bingel4b405f52013-11-15 15:29:30 +000037 /**
38 * Top-level map representing the whole request.
39 */
40 LinkedHashMap<String,Object> requestMap = new LinkedHashMap<String,Object>();
41 /**
42 * Keeps track of open node categories
43 */
44 LinkedList<String> openNodeCats = new LinkedList<String>();
45 /**
46 * Flag that indicates whether token fields or meta fields are currently being processed
47 */
48 boolean inMeta = false;
49 /**
50 * Parser object deriving the ANTLR parse tree.
51 */
52 static Parser poliqarpParser;
53 /**
54 * Keeps track of all visited nodes in a tree
55 */
56 List<ParseTree> visited = new ArrayList<ParseTree>();
57
58 /**
59 * Keeps track of active fields (like 'base=foo').
60 */
61 LinkedList<ArrayList<Object>> fieldStack = new LinkedList<ArrayList<Object>>();
62 /**
63 * Keeps track of active sequences.
64 */
65 LinkedList<LinkedHashMap<String,Object>> sequenceStack = new LinkedList<LinkedHashMap<String,Object>>();
66 /**
67 * Keeps track of active tokens.
68 */
69 LinkedList<LinkedHashMap<String,Object>> tokenStack = new LinkedList<LinkedHashMap<String,Object>>();
70 /**
71 * Keeps track of sequence/token/field groups.
72 */
73 LinkedList<ArrayList<Object>> groupStack = new LinkedList<ArrayList<Object>>();
74 /**
75 * Marks the currently active object (sequence/token/group...) in order to know where to add stuff like occurrence info etc.
76 */
77 LinkedHashMap<String,Object> curObject = new LinkedHashMap<String,Object>();
78 /**
79 * Marks the currently active token in order to know where to add flags (might already have been taken away from token stack).
80 */
81 LinkedHashMap<String,Object> curToken = new LinkedHashMap<String,Object>();
82
83 /**
84 *
85 * @param tree The syntax tree as returned by ANTLR
86 * @param parser The ANTLR parser instance that generated the parse tree
87 */
88 public PoliqarpTree(String query) {
Joachim Bingel4b405f52013-11-15 15:29:30 +000089 prepareContext();
90 process(query);
91 System.out.println(">>> "+requestMap+" <<<");
92 }
93
94 private void prepareContext() {
95 LinkedHashMap<String,Object> context = new LinkedHashMap<String,Object>();
96 LinkedHashMap<String,Object> operands = new LinkedHashMap<String,Object>();
97 LinkedHashMap<String,Object> relation = new LinkedHashMap<String,Object>();
98 LinkedHashMap<String,Object> classMap = new LinkedHashMap<String,Object>();
99
100 operands.put("@id", "korap:operands");
101 operands.put("@container", "@list");
102
103 relation.put("@id", "korap:relation");
104 relation.put("@type", "korap:relation#types");
105
106 classMap.put("@id", "korap:class");
107 classMap.put("@type", "xsd:integer");
108
109 context.put("korap", "http://korap.ids-mannheim.de/ns/query");
110 context.put("@language", "de");
111 context.put("operands", operands);
112 context.put("relation", relation);
113 context.put("class", classMap);
114 context.put("query", "korap:query");
115 context.put("filter", "korap:filter");
116 context.put("meta", "korap:meta");
117
118 requestMap.put("@context", context);
119 }
120
121 @Override
122 public Map<String, Object> getRequestMap() {
Joachim Bingel593964f2013-11-29 16:45:47 +0000123 return requestMap;
Joachim Bingel4b405f52013-11-15 15:29:30 +0000124 }
125
126 @Override
127 public void process(String query) {
128 ParseTree tree = parsePoliqarpQuery(query);
Joachim Bingel76b498d2014-06-06 12:06:46 +0000129 log.info("Processing Poliqarp query.");
Joachim Bingel4b405f52013-11-15 15:29:30 +0000130 System.out.println("Processing Poliqarp");
131 processNode(tree);
Joachim Bingel76b498d2014-06-06 12:06:46 +0000132 log.info(requestMap.toString());
Joachim Bingel4b405f52013-11-15 15:29:30 +0000133 }
134
135 @SuppressWarnings("unchecked")
136 private void processNode(ParseTree node) {
137 // Top-down processing
138 if (visited.contains(node)) return;
139 else visited.add(node);
140
141 String nodeCat = getNodeCat(node);
142 openNodeCats.push(nodeCat);
143
144// System.out.println(openNodeCats);
145
146 /*
147 ****************************************************************
148 ****************************************************************
149 * Processing individual node categories *
150 ****************************************************************
151 ****************************************************************
152 */
153 if (nodeCat.equals("query")) {
154 }
155
156 // cq_segments/sq_segments: token group
157 if (nodeCat.equals("cq_segments") || nodeCat.equals("sq_segments")) {
158 // disregard empty segments in simple queries (parsed by ANTLR as empty cq_segments)
159 if (node.getChildCount() > 0 && !node.getChild(0).toStringTree(poliqarpParser).equals(" ")) {
160 LinkedHashMap<String,Object> sequence = new LinkedHashMap<String,Object>();
161 curObject = sequence;
162 // Step I: decide type of element (one or more elements? -> token or sequence)
163 if (node.getChildCount()>1) {
164 sequence.put("@type", "korap:sequence");
165 ArrayList<Object> sequenceOperands = new ArrayList<Object>();
166 sequence.put("operands", sequenceOperands);
167 } else {
168 // if only child, make the sequence a mere korap:token
169 sequence.put("@type", "korap:token");
170 tokenStack.push(sequence);
171 }
172 // Step II: decide where to put this element (top query node or embedded in super sequence?)
173 if (openNodeCats.get(1).equals("query")) {
174 requestMap.put("query", sequence);
175 } else if (!groupStack.isEmpty()) {
176 groupStack.getFirst().add(sequence);
177 } else {
178 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
179 topSequenceOperands.add(sequence);
180 }
181 sequenceStack.push(sequence);
182 }
183 }
184
185 // cq_segment
186 if (nodeCat.equals("cq_segment")) {
187 // Step I: determine whether to create new token or get token from the stack (if added by cq_segments)
188 LinkedHashMap<String, Object> token;
189 if (tokenStack.isEmpty()) {
190 token = new LinkedHashMap<String, Object>();
191 tokenStack.push(token);
192 } else {
193 // in case cq_segments has already added the token
194 token = tokenStack.getFirst();
195 }
196 curObject = token;
197 curToken = token;
198
199 // Step II: start filling object and add to containing sequence
200 token.put("@type", "korap:token");
201 // add token to sequence only if it is not an only child (in that case, cq_segments has already added the info and is just waiting for the values from "field")
202 if (node.getParent().getChildCount()>1) {
203 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
204 topSequenceOperands.add(token);
205 }
206 }
207
208 // disjoint cq_segments, like ([base=foo][base=bar])|[base=foobar]
209 if (nodeCat.equals("cq_disj_segments")) {
210 LinkedHashMap<String,Object> disjunction = new LinkedHashMap<String,Object>();
211 curObject = disjunction;
212 ArrayList<Object> disjOperands = new ArrayList<Object>();
213 disjunction.put("@type", "korap:group");
214 disjunction.put("relation", "or");
215 disjunction.put("operands", disjOperands);
216 groupStack.push(disjOperands);
217
218 // decide where to put the disjunction
219 if (openNodeCats.get(1).equals("query")) {
220 requestMap.put("query", disjunction);
221 } else if (openNodeCats.get(1).equals("cq_segments")) {
222 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
223 topSequenceOperands.add(disjunction);
224 }
225 }
226
227 // field element (outside meta)
228 if (nodeCat.equals("field")) {
229 LinkedHashMap<String,Object> fieldMap = new LinkedHashMap<String,Object>();
230
231 // Step I: extract info
232 String featureName = node.getChild(0).getChild(0).toStringTree(poliqarpParser); //e.g. (field_name base) (field_op !=) (re_query "bar*")
233 String relation = node.getChild(1).getChild(0).toStringTree(poliqarpParser);
234 String value = "";
235 ParseTree valNode = node.getChild(2);
236 String valType = getNodeCat(valNode);
237 fieldMap.put("@type", "korap:term");
238 if (valType.equals("simple_query")) {
239 value = valNode.getChild(0).getChild(0).toStringTree(poliqarpParser); //e.g. (simple_query (sq_segment foo))
240 } else if (valType.equals("re_query")) {
241 value = valNode.getChild(0).toStringTree(poliqarpParser); //e.g. (re_query "bar*")
242 fieldMap.put("@subtype", "korap:value#regex");
243 }
244 fieldMap.put("@value", featureName+":"+value);
245 fieldMap.put("relation", relation);
246
247 // Step II: decide where to put the field map (as the only value of a token or the meta filter or as a part of a group in case of coordinated fields)
248 if (fieldStack.isEmpty()) {
249 if (!inMeta) {
250 tokenStack.getFirst().put("@value", fieldMap);
251 } else {
252 ((HashMap<String, Object>) requestMap.get("meta")).put("@value", fieldMap);
253 }
254 } else {
255 fieldStack.getFirst().add(fieldMap);
256 }
257 visited.add(node.getChild(0));
258 visited.add(node.getChild(1));
259 visited.add(node.getChild(2));
260 }
261
262 // conj_field serves for both conjunctions and disjunctions
263 if (nodeCat.equals("conj_field")) {
264 LinkedHashMap<String,Object> group = new LinkedHashMap<String,Object>();
265 ArrayList<Object> groupOperands = new ArrayList<Object>();
266
267 group.put("@type", "korap:group");
268 group.put("operands", groupOperands);
269 fieldStack.push(groupOperands);
270
271 // Step I: get operator (& or |)
272 ParseTree operatorNode = node.getChild(1).getChild(0);
273 String operator = getNodeCat(operatorNode);
274 if (operator.equals("|")) {
275 group.put("relation", "or");
276 } else if (operator.equals("&")) {
277 group.put("relation", "and");
278 }
279
280 // Step II: decide where to put the group (directly under token or in top meta filter section or embed in super group)
281 if (openNodeCats.get(1).equals("cq_segment")) {
282 tokenStack.getFirst().put("@value", group);
283 } else if (openNodeCats.get(1).equals("meta_field_group")) {
284 ((HashMap<String, Object>) requestMap.get("meta")).put("@value", group);
285 } else {
286 fieldStack.get(1).add(group);
287 }
288 // skip the operator
289 visited.add(node.getChild(1));
290 }
291
292
293 if (nodeCat.equals("sq_segment")) {
294 // Step I: determine whether to create new token or get token from the stack (if added by cq_segments)
295 LinkedHashMap<String, Object> token;
296 if (tokenStack.isEmpty()) {
297 token = new LinkedHashMap<String, Object>();
298 tokenStack.push(token);
299 } else {
300 // in case sq_segments has already added the token
301 token = tokenStack.getFirst();
302 }
303 curObject = token;
304 curToken = token;
305 // Step II: fill object (token values) and put into containing sequence
306 token.put("@type", "korap:token");
307 String word = node.getChild(0).toStringTree(poliqarpParser);
308 LinkedHashMap<String,Object> tokenValues = new LinkedHashMap<String,Object>();
309 token.put("@value", tokenValues);
310 tokenValues.put("orth", word);
311 tokenValues.put("relation", "=");
312
313 // add token to sequence only if it is not an only child (in that case, sq_segments has already added the info and is just waiting for the values from "field")
314 if (node.getParent().getChildCount()>1) {
315 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
316 topSequenceOperands.add(token);
317 }
318 }
319
320 // repetition of token group
321 if (nodeCat.equals("occ")) {
322 ParseTree occChild = node.getChild(0);
323 String repetition = occChild.toStringTree(poliqarpParser);
324 curObject.put("repetition", repetition);
325 visited.add(occChild);
326 }
327
328 // flags for case sensitivity and whole-word-matching
329 if (nodeCat.equals("flag")) {
330 String flag = getNodeCat(node.getChild(0)).substring(1); //substring removes leading slash '/'
331 // add to current token's value
332 ((HashMap<String, Object>) curToken.get("@value")).put("flag", flag);
333 }
334
335 if (nodeCat.equals("meta")) {
336 inMeta=true;
337 LinkedHashMap<String,Object> metaFilter = new LinkedHashMap<String,Object>();
338 requestMap.put("meta", metaFilter);
339 metaFilter.put("@type", "korap:meta");
340 }
341
342
343
344 if (nodeCat.equals("within")) {
345 ParseTree domainNode = node.getChild(2);
346 String domain = getNodeCat(domainNode);
347// queryOperands.add("within:"+domain);
348 curObject.put("within", domain);
349 visited.add(node.getChild(0));
350 visited.add(node.getChild(1));
351 visited.add(domainNode);
352 }
353
354 /*
355 ****************************************************************
356 ****************************************************************
357 * recursion until 'request' node (root of tree) is processed *
358 * **************************************************************
359 ****************************************************************
360 */
361 for (int i=0; i<node.getChildCount(); i++) {
362 ParseTree child = node.getChild(i);
363 processNode(child);
364 }
365
366 // Stuff that happens when leaving a node (taking it off the stack)
367 if (nodeCat.equals("cq_segments") || nodeCat.equals("sq_segments")) {
368 // exclude whitespaces analysed as empty cq_segments
369 if (node.getChildCount() > 0 && !getNodeCat(node.getChild(0)).equals(" ")) {
370 sequenceStack.pop();
371 }
372 }
373
374 if (nodeCat.equals("cq_disj_segments")) {
375 groupStack.pop();
376 }
377
378 if (nodeCat.equals("cq_segment") || nodeCat.equals("sq_segment")){
379 tokenStack.pop();
380 }
381
382 if (nodeCat.equals("conj_field")) {
383 fieldStack.pop();
384 }
385
386 openNodeCats.pop();
387
388 }
389
Joachim Bingelc8a28e42014-04-24 15:06:42 +0000390// /**
391// * Returns the category (or 'label') of the root of a ParseTree.
392// * @param node
393// * @return
394// */
395// public String getNodeCat(ParseTree node) {
396// String nodeCat = node.toStringTree(poliqarpParser);
397// Pattern p = Pattern.compile("\\((.*?)\\s"); // from opening parenthesis to 1st whitespace
398// Matcher m = p.matcher(node.toStringTree(poliqarpParser));
399// if (m.find()) {
400// nodeCat = m.group(1);
401// }
402// return nodeCat;
403// }
Joachim Bingel4b405f52013-11-15 15:29:30 +0000404
405 private static ParserRuleContext parsePoliqarpQuery (String p) {
406 Lexer poliqarpLexer = new PoliqarpLexer((CharStream)null);
407 ParserRuleContext tree = null;
408 // Like p. 111
409 try {
410
411 // Tokenize input data
412 ANTLRInputStream input = new ANTLRInputStream(p);
413 poliqarpLexer.setInputStream(input);
414 CommonTokenStream tokens = new CommonTokenStream(poliqarpLexer);
415 poliqarpParser = new PoliqarpParser(tokens);
416
417 // Don't throw out erroneous stuff
418 poliqarpParser.setErrorHandler(new BailErrorStrategy());
419 poliqarpParser.removeErrorListeners();
420
421 // Get starting rule from parser
422 Method startRule = PoliqarpParser.class.getMethod("request");
423 tree = (ParserRuleContext) startRule.invoke(poliqarpParser, (Object[])null);
424 }
425
426 // Some things went wrong ...
427 catch (Exception e) {
Joachim Bingel76b498d2014-06-06 12:06:46 +0000428 log.error(e.getMessage());
429 System.err.println( e.getMessage() );
Joachim Bingel4b405f52013-11-15 15:29:30 +0000430 }
431
432 // Return the generated tree
433 return tree;
434 }
435
436 public static void main(String[] args) {
437 /*
438 * For testing
439 */
440 String[] queries = new String[] {
441// "[base=foo]|([base=foo][base=bar])*",
442// "([base=foo]|[base=bar])[base=foobar]",
443// "[base=foo]([base=bar]|[base=foobar/i])",
444// "[base=bar|base=foo]",
445// "[base=bar]",
446// "[base=foo][base=bar]",
447// "[(base=bar|base=foo)&orth=wee]",
448// "[base=foo/i][base=bar]{2,4}",
449// "foo bar/i"
450 "[base=foo] meta author=Goethe&year=1885",
Joachim Bingela14e13a2013-12-04 15:59:07 +0000451 "[base=foo]|([base=foo][base=bar])* meta author=Goethe&year=1815",
Joachim Bingelcc1dc242014-01-15 09:32:38 +0000452 "[base=foo]*",
Joachim Bingel4b405f52013-11-15 15:29:30 +0000453 };
454 for (String q : queries) {
455 try {
456 System.out.println(q);
457 System.out.println(PoliqarpTree.parsePoliqarpQuery(q).toStringTree(PoliqarpTree.poliqarpParser));
458 @SuppressWarnings("unused")
459 PoliqarpTree pt = new PoliqarpTree(q);
460 System.out.println(PoliqarpTree.parsePoliqarpQuery(q).toStringTree(PoliqarpTree.poliqarpParser));
461 System.out.println();
462
463 } catch (NullPointerException npe) {
464 npe.printStackTrace();
465 System.out.println("null\n");
466 }
467 }
468 }
469
470}