blob: 852eb54b8717dd9498ca0e03a8ea45e121b61c07 [file] [log] [blame]
Joachim Bingel4b405f52013-11-15 15:29:30 +00001package de.ids_mannheim.korap.query.serialize;
2
3import java.lang.reflect.Method;
4import java.util.ArrayList;
5import java.util.HashMap;
6import java.util.LinkedHashMap;
7import java.util.LinkedList;
8import java.util.List;
9import java.util.Map;
10import java.util.regex.Matcher;
11import java.util.regex.Pattern;
12
13import org.antlr.v4.runtime.ANTLRInputStream;
14import org.antlr.v4.runtime.BailErrorStrategy;
15import org.antlr.v4.runtime.CharStream;
16import org.antlr.v4.runtime.CommonTokenStream;
17import org.antlr.v4.runtime.Lexer;
18import org.antlr.v4.runtime.Parser;
19import org.antlr.v4.runtime.ParserRuleContext;
20import org.antlr.v4.runtime.tree.ParseTree;
21
Joachim Bingelb5ada902013-11-19 14:46:04 +000022//import de.ids_mannheim.korap.query.poliqarp.PoliqarpLexer;
23//import de.ids_mannheim.korap.query.poliqarp.PoliqarpParser;
24import de.ids_mannheim.korap.query.PoliqarpLexer;
25import de.ids_mannheim.korap.query.PoliqarpParser;
26import de.ids_mannheim.korap.query.serialize.AbstractSyntaxTree;
Joachim Bingel4b405f52013-11-15 15:29:30 +000027
28/**
29 * Map representation of Poliqarp syntax tree as returned by ANTLR
30 * @author joachim
31 *
32 */
33public class PoliqarpTree extends AbstractSyntaxTree {
34
35 /**
36 * Top-level map representing the whole request.
37 */
38 LinkedHashMap<String,Object> requestMap = new LinkedHashMap<String,Object>();
39 /**
40 * Keeps track of open node categories
41 */
42 LinkedList<String> openNodeCats = new LinkedList<String>();
43 /**
44 * Flag that indicates whether token fields or meta fields are currently being processed
45 */
46 boolean inMeta = false;
47 /**
48 * Parser object deriving the ANTLR parse tree.
49 */
50 static Parser poliqarpParser;
51 /**
52 * Keeps track of all visited nodes in a tree
53 */
54 List<ParseTree> visited = new ArrayList<ParseTree>();
55
56 /**
57 * Keeps track of active fields (like 'base=foo').
58 */
59 LinkedList<ArrayList<Object>> fieldStack = new LinkedList<ArrayList<Object>>();
60 /**
61 * Keeps track of active sequences.
62 */
63 LinkedList<LinkedHashMap<String,Object>> sequenceStack = new LinkedList<LinkedHashMap<String,Object>>();
64 /**
65 * Keeps track of active tokens.
66 */
67 LinkedList<LinkedHashMap<String,Object>> tokenStack = new LinkedList<LinkedHashMap<String,Object>>();
68 /**
69 * Keeps track of sequence/token/field groups.
70 */
71 LinkedList<ArrayList<Object>> groupStack = new LinkedList<ArrayList<Object>>();
72 /**
73 * Marks the currently active object (sequence/token/group...) in order to know where to add stuff like occurrence info etc.
74 */
75 LinkedHashMap<String,Object> curObject = new LinkedHashMap<String,Object>();
76 /**
77 * Marks the currently active token in order to know where to add flags (might already have been taken away from token stack).
78 */
79 LinkedHashMap<String,Object> curToken = new LinkedHashMap<String,Object>();
80
81 /**
82 *
83 * @param tree The syntax tree as returned by ANTLR
84 * @param parser The ANTLR parser instance that generated the parse tree
85 */
86 public PoliqarpTree(String query) {
Joachim Bingel4b405f52013-11-15 15:29:30 +000087 prepareContext();
88 process(query);
89 System.out.println(">>> "+requestMap+" <<<");
90 }
91
92 private void prepareContext() {
93 LinkedHashMap<String,Object> context = new LinkedHashMap<String,Object>();
94 LinkedHashMap<String,Object> operands = new LinkedHashMap<String,Object>();
95 LinkedHashMap<String,Object> relation = new LinkedHashMap<String,Object>();
96 LinkedHashMap<String,Object> classMap = new LinkedHashMap<String,Object>();
97
98 operands.put("@id", "korap:operands");
99 operands.put("@container", "@list");
100
101 relation.put("@id", "korap:relation");
102 relation.put("@type", "korap:relation#types");
103
104 classMap.put("@id", "korap:class");
105 classMap.put("@type", "xsd:integer");
106
107 context.put("korap", "http://korap.ids-mannheim.de/ns/query");
108 context.put("@language", "de");
109 context.put("operands", operands);
110 context.put("relation", relation);
111 context.put("class", classMap);
112 context.put("query", "korap:query");
113 context.put("filter", "korap:filter");
114 context.put("meta", "korap:meta");
115
116 requestMap.put("@context", context);
117 }
118
119 @Override
120 public Map<String, Object> getRequestMap() {
Joachim Bingel593964f2013-11-29 16:45:47 +0000121 return requestMap;
Joachim Bingel4b405f52013-11-15 15:29:30 +0000122 }
123
124 @Override
125 public void process(String query) {
126 ParseTree tree = parsePoliqarpQuery(query);
127 System.out.println("Processing Poliqarp");
128 processNode(tree);
129 }
130
131 @SuppressWarnings("unchecked")
132 private void processNode(ParseTree node) {
133 // Top-down processing
134 if (visited.contains(node)) return;
135 else visited.add(node);
136
137 String nodeCat = getNodeCat(node);
138 openNodeCats.push(nodeCat);
139
140// System.out.println(openNodeCats);
141
142 /*
143 ****************************************************************
144 ****************************************************************
145 * Processing individual node categories *
146 ****************************************************************
147 ****************************************************************
148 */
149 if (nodeCat.equals("query")) {
150 }
151
152 // cq_segments/sq_segments: token group
153 if (nodeCat.equals("cq_segments") || nodeCat.equals("sq_segments")) {
154 // disregard empty segments in simple queries (parsed by ANTLR as empty cq_segments)
155 if (node.getChildCount() > 0 && !node.getChild(0).toStringTree(poliqarpParser).equals(" ")) {
156 LinkedHashMap<String,Object> sequence = new LinkedHashMap<String,Object>();
157 curObject = sequence;
158 // Step I: decide type of element (one or more elements? -> token or sequence)
159 if (node.getChildCount()>1) {
160 sequence.put("@type", "korap:sequence");
161 ArrayList<Object> sequenceOperands = new ArrayList<Object>();
162 sequence.put("operands", sequenceOperands);
163 } else {
164 // if only child, make the sequence a mere korap:token
165 sequence.put("@type", "korap:token");
166 tokenStack.push(sequence);
167 }
168 // Step II: decide where to put this element (top query node or embedded in super sequence?)
169 if (openNodeCats.get(1).equals("query")) {
170 requestMap.put("query", sequence);
171 } else if (!groupStack.isEmpty()) {
172 groupStack.getFirst().add(sequence);
173 } else {
174 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
175 topSequenceOperands.add(sequence);
176 }
177 sequenceStack.push(sequence);
178 }
179 }
180
181 // cq_segment
182 if (nodeCat.equals("cq_segment")) {
183 // Step I: determine whether to create new token or get token from the stack (if added by cq_segments)
184 LinkedHashMap<String, Object> token;
185 if (tokenStack.isEmpty()) {
186 token = new LinkedHashMap<String, Object>();
187 tokenStack.push(token);
188 } else {
189 // in case cq_segments has already added the token
190 token = tokenStack.getFirst();
191 }
192 curObject = token;
193 curToken = token;
194
195 // Step II: start filling object and add to containing sequence
196 token.put("@type", "korap:token");
197 // add token to sequence only if it is not an only child (in that case, cq_segments has already added the info and is just waiting for the values from "field")
198 if (node.getParent().getChildCount()>1) {
199 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
200 topSequenceOperands.add(token);
201 }
202 }
203
204 // disjoint cq_segments, like ([base=foo][base=bar])|[base=foobar]
205 if (nodeCat.equals("cq_disj_segments")) {
206 LinkedHashMap<String,Object> disjunction = new LinkedHashMap<String,Object>();
207 curObject = disjunction;
208 ArrayList<Object> disjOperands = new ArrayList<Object>();
209 disjunction.put("@type", "korap:group");
210 disjunction.put("relation", "or");
211 disjunction.put("operands", disjOperands);
212 groupStack.push(disjOperands);
213
214 // decide where to put the disjunction
215 if (openNodeCats.get(1).equals("query")) {
216 requestMap.put("query", disjunction);
217 } else if (openNodeCats.get(1).equals("cq_segments")) {
218 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
219 topSequenceOperands.add(disjunction);
220 }
221 }
222
223 // field element (outside meta)
224 if (nodeCat.equals("field")) {
225 LinkedHashMap<String,Object> fieldMap = new LinkedHashMap<String,Object>();
226
227 // Step I: extract info
228 String featureName = node.getChild(0).getChild(0).toStringTree(poliqarpParser); //e.g. (field_name base) (field_op !=) (re_query "bar*")
229 String relation = node.getChild(1).getChild(0).toStringTree(poliqarpParser);
230 String value = "";
231 ParseTree valNode = node.getChild(2);
232 String valType = getNodeCat(valNode);
233 fieldMap.put("@type", "korap:term");
234 if (valType.equals("simple_query")) {
235 value = valNode.getChild(0).getChild(0).toStringTree(poliqarpParser); //e.g. (simple_query (sq_segment foo))
236 } else if (valType.equals("re_query")) {
237 value = valNode.getChild(0).toStringTree(poliqarpParser); //e.g. (re_query "bar*")
238 fieldMap.put("@subtype", "korap:value#regex");
239 }
240 fieldMap.put("@value", featureName+":"+value);
241 fieldMap.put("relation", relation);
242
243 // Step II: decide where to put the field map (as the only value of a token or the meta filter or as a part of a group in case of coordinated fields)
244 if (fieldStack.isEmpty()) {
245 if (!inMeta) {
246 tokenStack.getFirst().put("@value", fieldMap);
247 } else {
248 ((HashMap<String, Object>) requestMap.get("meta")).put("@value", fieldMap);
249 }
250 } else {
251 fieldStack.getFirst().add(fieldMap);
252 }
253 visited.add(node.getChild(0));
254 visited.add(node.getChild(1));
255 visited.add(node.getChild(2));
256 }
257
258 // conj_field serves for both conjunctions and disjunctions
259 if (nodeCat.equals("conj_field")) {
260 LinkedHashMap<String,Object> group = new LinkedHashMap<String,Object>();
261 ArrayList<Object> groupOperands = new ArrayList<Object>();
262
263 group.put("@type", "korap:group");
264 group.put("operands", groupOperands);
265 fieldStack.push(groupOperands);
266
267 // Step I: get operator (& or |)
268 ParseTree operatorNode = node.getChild(1).getChild(0);
269 String operator = getNodeCat(operatorNode);
270 if (operator.equals("|")) {
271 group.put("relation", "or");
272 } else if (operator.equals("&")) {
273 group.put("relation", "and");
274 }
275
276 // Step II: decide where to put the group (directly under token or in top meta filter section or embed in super group)
277 if (openNodeCats.get(1).equals("cq_segment")) {
278 tokenStack.getFirst().put("@value", group);
279 } else if (openNodeCats.get(1).equals("meta_field_group")) {
280 ((HashMap<String, Object>) requestMap.get("meta")).put("@value", group);
281 } else {
282 fieldStack.get(1).add(group);
283 }
284 // skip the operator
285 visited.add(node.getChild(1));
286 }
287
288
289 if (nodeCat.equals("sq_segment")) {
290 // Step I: determine whether to create new token or get token from the stack (if added by cq_segments)
291 LinkedHashMap<String, Object> token;
292 if (tokenStack.isEmpty()) {
293 token = new LinkedHashMap<String, Object>();
294 tokenStack.push(token);
295 } else {
296 // in case sq_segments has already added the token
297 token = tokenStack.getFirst();
298 }
299 curObject = token;
300 curToken = token;
301 // Step II: fill object (token values) and put into containing sequence
302 token.put("@type", "korap:token");
303 String word = node.getChild(0).toStringTree(poliqarpParser);
304 LinkedHashMap<String,Object> tokenValues = new LinkedHashMap<String,Object>();
305 token.put("@value", tokenValues);
306 tokenValues.put("orth", word);
307 tokenValues.put("relation", "=");
308
309 // add token to sequence only if it is not an only child (in that case, sq_segments has already added the info and is just waiting for the values from "field")
310 if (node.getParent().getChildCount()>1) {
311 ArrayList<Object> topSequenceOperands = (ArrayList<Object>) sequenceStack.getFirst().get("operands");
312 topSequenceOperands.add(token);
313 }
314 }
315
316 // repetition of token group
317 if (nodeCat.equals("occ")) {
318 ParseTree occChild = node.getChild(0);
319 String repetition = occChild.toStringTree(poliqarpParser);
320 curObject.put("repetition", repetition);
321 visited.add(occChild);
322 }
323
324 // flags for case sensitivity and whole-word-matching
325 if (nodeCat.equals("flag")) {
326 String flag = getNodeCat(node.getChild(0)).substring(1); //substring removes leading slash '/'
327 // add to current token's value
328 ((HashMap<String, Object>) curToken.get("@value")).put("flag", flag);
329 }
330
331 if (nodeCat.equals("meta")) {
332 inMeta=true;
333 LinkedHashMap<String,Object> metaFilter = new LinkedHashMap<String,Object>();
334 requestMap.put("meta", metaFilter);
335 metaFilter.put("@type", "korap:meta");
336 }
337
338
339
340 if (nodeCat.equals("within")) {
341 ParseTree domainNode = node.getChild(2);
342 String domain = getNodeCat(domainNode);
343// queryOperands.add("within:"+domain);
344 curObject.put("within", domain);
345 visited.add(node.getChild(0));
346 visited.add(node.getChild(1));
347 visited.add(domainNode);
348 }
349
350 /*
351 ****************************************************************
352 ****************************************************************
353 * recursion until 'request' node (root of tree) is processed *
354 * **************************************************************
355 ****************************************************************
356 */
357 for (int i=0; i<node.getChildCount(); i++) {
358 ParseTree child = node.getChild(i);
359 processNode(child);
360 }
361
362 // Stuff that happens when leaving a node (taking it off the stack)
363 if (nodeCat.equals("cq_segments") || nodeCat.equals("sq_segments")) {
364 // exclude whitespaces analysed as empty cq_segments
365 if (node.getChildCount() > 0 && !getNodeCat(node.getChild(0)).equals(" ")) {
366 sequenceStack.pop();
367 }
368 }
369
370 if (nodeCat.equals("cq_disj_segments")) {
371 groupStack.pop();
372 }
373
374 if (nodeCat.equals("cq_segment") || nodeCat.equals("sq_segment")){
375 tokenStack.pop();
376 }
377
378 if (nodeCat.equals("conj_field")) {
379 fieldStack.pop();
380 }
381
382 openNodeCats.pop();
383
384 }
385
386 /**
387 * Returns the category (or 'label') of the root of a ParseTree.
388 * @param node
389 * @return
390 */
391 public String getNodeCat(ParseTree node) {
392 String nodeCat = node.toStringTree(poliqarpParser);
393 Pattern p = Pattern.compile("\\((.*?)\\s"); // from opening parenthesis to 1st whitespace
394 Matcher m = p.matcher(node.toStringTree(poliqarpParser));
395 if (m.find()) {
396 nodeCat = m.group(1);
397 }
398 return nodeCat;
399 }
400
401 private static ParserRuleContext parsePoliqarpQuery (String p) {
402 Lexer poliqarpLexer = new PoliqarpLexer((CharStream)null);
403 ParserRuleContext tree = null;
404 // Like p. 111
405 try {
406
407 // Tokenize input data
408 ANTLRInputStream input = new ANTLRInputStream(p);
409 poliqarpLexer.setInputStream(input);
410 CommonTokenStream tokens = new CommonTokenStream(poliqarpLexer);
411 poliqarpParser = new PoliqarpParser(tokens);
412
413 // Don't throw out erroneous stuff
414 poliqarpParser.setErrorHandler(new BailErrorStrategy());
415 poliqarpParser.removeErrorListeners();
416
417 // Get starting rule from parser
418 Method startRule = PoliqarpParser.class.getMethod("request");
419 tree = (ParserRuleContext) startRule.invoke(poliqarpParser, (Object[])null);
420 }
421
422 // Some things went wrong ...
423 catch (Exception e) {
424 System.err.println( e.getMessage() );
425 }
426
427 // Return the generated tree
428 return tree;
429 }
430
431 public static void main(String[] args) {
432 /*
433 * For testing
434 */
435 String[] queries = new String[] {
436// "[base=foo]|([base=foo][base=bar])*",
437// "([base=foo]|[base=bar])[base=foobar]",
438// "[base=foo]([base=bar]|[base=foobar/i])",
439// "[base=bar|base=foo]",
440// "[base=bar]",
441// "[base=foo][base=bar]",
442// "[(base=bar|base=foo)&orth=wee]",
443// "[base=foo/i][base=bar]{2,4}",
444// "foo bar/i"
445 "[base=foo] meta author=Goethe&year=1885",
Joachim Bingela14e13a2013-12-04 15:59:07 +0000446 "[base=foo]|([base=foo][base=bar])* meta author=Goethe&year=1815",
447 "[base=foo]*"
Joachim Bingel4b405f52013-11-15 15:29:30 +0000448 };
449 for (String q : queries) {
450 try {
451 System.out.println(q);
452 System.out.println(PoliqarpTree.parsePoliqarpQuery(q).toStringTree(PoliqarpTree.poliqarpParser));
453 @SuppressWarnings("unused")
454 PoliqarpTree pt = new PoliqarpTree(q);
455 System.out.println(PoliqarpTree.parsePoliqarpQuery(q).toStringTree(PoliqarpTree.poliqarpParser));
456 System.out.println();
457
458 } catch (NullPointerException npe) {
459 npe.printStackTrace();
460 System.out.println("null\n");
461 }
462 }
463 }
464
465}