collection queries: regex, groups
diff --git a/src/main/antlr/CollectionQuery.g4 b/src/main/antlr/CollectionQuery.g4
index 0bd7fe5..7aad659 100644
--- a/src/main/antlr/CollectionQuery.g4
+++ b/src/main/antlr/CollectionQuery.g4
@@ -20,9 +20,9 @@
NE : '!=';
AND : '&' | 'AND' | 'and' | 'UND' | 'und' ;
OR : '|' | 'OR' | 'or' | 'ODER' | 'oder' ;
-
+QMARK : '?';
+SLASH : '/';
WS : ( ' ' | '\t' | '\r' | '\n' )+ -> skip ;
-fragment FOCC : '{' WS* ( [0-9]* WS* ',' WS* [0-9]+ | [0-9]+ WS* ','? ) WS* '}';
fragment NO_RE : ~[ \t\/];
fragment ALPHABET : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
| '(' | ')' | '|' | '"' | ',' | ':' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>' );
@@ -34,19 +34,20 @@
WORD : ALPHABET+;
/*
- * Regular expressions (delimited by slashes in Annis)
+ * Regular expressions
*/
+fragment FOCC : '{' WS* ( [0-9]* WS* ',' WS* [0-9]+ | [0-9]+ WS* ','? ) WS* '}';
fragment RE_char : ~('*' | '?' | '+' | '{' | '}' | '[' | ']' | '/'
| '(' | ')' | '|' | '"' | ':' | '\'' | '\\');
fragment RE_alter : ((RE_char | ('(' REGEX ')') | RE_chgroup) '|' REGEX )+;
fragment RE_chgroup : '[' RE_char+ ']';
+fragment RE_quant : (RE_star | RE_plus | RE_occ) QMARK?;
fragment RE_opt : (RE_char | RE_chgroup | ( '(' REGEX ')')) '?';
fragment RE_star : (RE_char | RE_chgroup | ( '(' REGEX ')')) '*';
fragment RE_plus : (RE_char | RE_chgroup | ( '(' REGEX ')')) '+';
fragment RE_occ : (RE_char | RE_chgroup | ( '(' REGEX ')')) FOCC;
fragment RE_group : '(' REGEX ')';
-SLASH : '/';
-REGEX : SLASH ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_star | RE_plus | RE_occ | RE_group)* SLASH;
+REGEX : SLASH ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_quant | RE_group)* SLASH;
/*
* PARSER SECTION
@@ -67,12 +68,17 @@
;
field
-: WORD;
+: WORD
+;
value
-: WORD | NUMBER | '"' (WORD ws*)+'"'
-| regex;
-
+: WORD
+| NUMBER
+| '"' (WORD ws*)+'"'
+| regex
+;
+
+/*
andGroup
: (((LRB exprGroup RRB)|expr) AND)+ ((LRB exprGroup RRB)|expr)
;
@@ -80,13 +86,18 @@
orGroup
: (((LRB exprGroup RRB)|expr) OR)+ ((LRB exprGroup RRB)|expr)
;
+*/
+
+relation
+: (expr|exprGroup) conj (expr|exprGroup|relation)
+;
exprGroup
-: andGroup
-| orGroup
+: LRB (expr | exprGroup | relation) RRB
;
start
-: expr
-| exprGroup EOF
+: ( expr // EOF is hoisted out of the alternatives so that it applies to every one of them
+ | exprGroup
+ | relation ) EOF // reject trailing input after any alternative, including a bare expr
;
\ No newline at end of file
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryTree.java b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryTree.java
index 588f1a6..e47c7e5 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryTree.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryTree.java
@@ -10,7 +10,7 @@
import java.util.*;
/**
- * @author hanl
+ * @author hanl, bingel
* @date 06/12/2013
*/
public class CollectionQueryTree extends Antlr4AbstractSyntaxTree {
@@ -44,7 +44,7 @@
} else {
throw new NullPointerException("Parser has not been instantiated!");
}
-
+ requestMap.put("@type", "korap:filter");
System.out.println("Processing collection query");
if (verbose) System.out.println(tree.toStringTree(parser));
processNode(tree);
@@ -70,19 +70,20 @@
****************************************************************
*/
- if (nodeCat.equals("andGroup")) {
- LinkedHashMap<String, Object> exprGroup = makeTermGroup("and");
- objectStack.push(exprGroup);
+ if (nodeCat.equals("relation")) { // relation : (expr|exprGroup) conj (...), so child 1 is the conj node
+ String operator = Arrays.asList("&", "AND", "and", "UND", "und").contains(node.getChild(1).getChild(0).toStringTree(parser)) ? "and" : "or"; // accept every spelling of the AND lexer token, not only "&"
+ LinkedHashMap<String, Object> relationGroup = makeDocGroup(operator);
+ putIntoSuperObject(relationGroup);
+ objectStack.push(relationGroup);
stackedObjects++;
- putIntoSuperObject(exprGroup, 1);
}
- if (nodeCat.equals("orGroup")) {
- LinkedHashMap<String, Object> exprGroup = makeTermGroup("or");
- objectStack.push(exprGroup);
- stackedObjects++;
- putIntoSuperObject(exprGroup, 1);
- }
+// if (nodeCat.equals("orGroup")) {
+// LinkedHashMap<String, Object> exprGroup = makeDocGroup("or");
+// putIntoSuperObject(exprGroup);
+// objectStack.push(exprGroup);
+// stackedObjects++;
+// }
if (nodeCat.equals("expr")) {
ParseTree fieldNode = getFirstChildWithCat(node, "field");
@@ -91,26 +92,26 @@
List<ParseTree> valueNodes = getChildrenWithCat(node, "value");
if (valueNodes.size() == 1) {
- LinkedHashMap<String, Object> term = makeTerm();
- term.put("attribute", "korap:field#" + field);
- term.put("key", valueNodes.get(0).getChild(0).toStringTree(parser));
+ LinkedHashMap<String, Object> term = makeDoc();
+ term.put("key", field);
+ term.putAll(parseValue(valueNodes.get(0)));
String match = operatorNodes.get(0).getChild(0).toStringTree(parser);
term.put("match", "match:" + interpretMatch(match));
putIntoSuperObject(term);
} else { // (valueNodes.size()==2)
- LinkedHashMap<String, Object> termGroup = makeTermGroup("and");
+ LinkedHashMap<String, Object> termGroup = makeDocGroup("and");
ArrayList<Object> termGroupOperands = (ArrayList<Object>) termGroup.get("operands");
- LinkedHashMap<String, Object> term1 = makeTerm();
- term1.put("attribute", "korap:field#" + field);
- term1.put("key", valueNodes.get(0).getChild(0).toStringTree(parser));
+ LinkedHashMap<String, Object> term1 = makeDoc();
+ term1.put("key", field);
+ term1.putAll(parseValue(valueNodes.get(0)));
String match1 = operatorNodes.get(0).getChild(0).toStringTree(parser);
term1.put("match", "match:" + invertInequation(interpretMatch(match1)));
termGroupOperands.add(term1);
- LinkedHashMap<String, Object> term2 = makeTerm();
- term2.put("attribute", "korap:field#" + field);
- term2.put("key", valueNodes.get(1).getChild(0).toStringTree(parser));
+ LinkedHashMap<String, Object> term2 = makeDoc();
+ term2.put("key", field);
+ term2.putAll(parseValue(valueNodes.get(1)));
String match2 = operatorNodes.get(1).getChild(0).toStringTree(parser);
term2.put("match", "match:" + interpretMatch(match2));
termGroupOperands.add(term2);
@@ -139,7 +140,8 @@
**************************************************************
*/
if (!objectsToPop.isEmpty()) {
- for (int i = 0; i < objectsToPop.pop(); i++) {
+ int toPop = objectsToPop.pop();
+ for (int i = 0; i < toPop; i++) {
objectStack.pop();
}
}
@@ -149,7 +151,19 @@
}
- private String interpretMatch(String match) {
+ /**
+ * Builds the value-related entries of a korap:doc object from a "value" parse node.
+ *
+ * @param valueNode a node produced by the grammar's value rule
+ * (WORD, NUMBER, a double-quoted word sequence, or a regex)
+ * @return a map holding "value" and, for regular expressions only, "type" = "type:regex"
+ */
+ private LinkedHashMap<String, Object> parseValue(ParseTree valueNode) {
+ LinkedHashMap<String, Object> map = new LinkedHashMap<String, Object>();
+ ParseTree first = valueNode.getChild(0);
+ if (getNodeCat(first).equals("regex")) {
+ String regex = first.getChild(0).toStringTree(parser);
+ // drop the first and last character, i.e. the slashes delimiting the REGEX token
+ map.put("value", regex.substring(1, regex.length()-1));
+ map.put("type", "type:regex");
+ } else if (first.getText().equals("\"")) {
+ // quoted value: child 0 is only the opening quote, so join the words between the quotes
+ StringBuilder words = new StringBuilder();
+ for (int i = 1; i < valueNode.getChildCount() - 1; i++) {
+ ParseTree child = valueNode.getChild(i);
+ // assumes ws rule nodes always have children while WORD tokens are leaves - TODO confirm
+ if (child.getChildCount() > 0) continue;
+ if (words.length() > 0) words.append(' ');
+ words.append(child.getText());
+ }
+ map.put("value", words.toString());
+ } else {
+ map.put("value", first.toStringTree(parser));
+ }
+ return map;
+ }
+
+ private String interpretMatch(String match) {
String out = null;
switch (match) {
case "<":
@@ -201,10 +215,10 @@
private void putIntoSuperObject(LinkedHashMap<String, Object> object, int objStackPosition) {
if (objectStack.size() > objStackPosition) {
ArrayList<Object> topObjectOperands = (ArrayList<Object>) objectStack.get(objStackPosition).get("operands");
- topObjectOperands.add(0, object);
+ topObjectOperands.add(object);
} else {
// I want the raw object, not a wrapped
- requestMap.put("query", object);
+ requestMap.put("filter", object);
}
}
@@ -226,6 +240,7 @@
// Get starting rule from parser
Method startRule = CollectionQueryParser.class.getMethod("start");
tree = (ParserRuleContext) startRule.invoke(parser, (Object[]) null);
+ System.out.println(tree.toStringTree(parser));
}
// Some things went wrong ...
@@ -244,8 +259,10 @@
query = "(1990<year<2010&genre=Sport)|textClass=politk";
query = "(textClass=wissenschaft & textClass=politik) | textClass=ausland";
query = "1990<year<2010 & genre=Sport";
+ query = "(textClass=Sport | textClass=ausland) & corpusID=WPD";
+ query = "textClass=Sport";
CollectionQueryTree filter = new CollectionQueryTree();
-// filter.verbose = true;
+ filter.verbose = true;
try {
filter.process(query);
} catch (QueryException e) {
diff --git a/src/test/java/CollectionQueryTreeTest.java b/src/test/java/CollectionQueryTreeTest.java
index 324a0f8..a367c26 100644
--- a/src/test/java/CollectionQueryTreeTest.java
+++ b/src/test/java/CollectionQueryTreeTest.java
@@ -1,3 +1,4 @@
+import static org.junit.Assert.*;
import de.ids_mannheim.korap.query.serialize.CollectionQueryBuilder;
import de.ids_mannheim.korap.query.serialize.CollectionQueryBuilder2;
import de.ids_mannheim.korap.query.serialize.CollectionQueryTree;
@@ -9,77 +10,225 @@
public class CollectionQueryTreeTest {
- CollectionQueryTree ef;
- String map;
- private String query;
+ CollectionQueryTree cqt;
+ String map;
+ private String query;
+ private String expected;
- private boolean equalsQueryContent(String res, String query) throws QueryException {
- res = res.replaceAll(" ", "");
- ef = new CollectionQueryTree();
- ef.process(query);
- String queryMap = ef.getRequestMap().get("query").toString().replaceAll(" ", "");
- return res.equals(queryMap);
- }
+ private boolean equalsQueryContent(String res, String query) throws QueryException { // true if res equals the serialized filter built from query, ignoring spaces
+ res = res.replaceAll(" ", "");
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ String queryMap = cqt.getRequestMap().get("filter").toString().replaceAll(" ", ""); // the tree is stored under "filter"; the old "query" key is no longer written
+ return res.equals(queryMap);
+ }
- @Test
- public void testSimple() throws QueryException {
- query = "textClass=Sport";
- String regex1 = "{@type=korap:filter, filter={@type=korap:term, attribute=textClass, key=Sport, match=match:eq}}";
- ef = new CollectionQueryTree();
- ef.process(query);
- map = JsonUtils.toJSON(ef.getRequestMap());
-// assertEquals(regex1.replaceAll(" ", ""), map.replaceAll(" ", ""));
- System.out.println("THE QUERY: " + map);
- }
+ @Test
+ public void testSimple() throws QueryException {
+ query = "textClass=Sport";
+ // String regex1 = "{@type=korap:filter, filter={@type=korap:doc, attribute=textClass, key=Sport, match=match:eq}}";
+ expected = "{@type=korap:filter, filter={@type=korap:doc, key=textClass, value=Sport, match=match:eq}}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+
+ query = "textClass!=Sport";
+ // String regex1 = "{@type=korap:filter, filter={@type=korap:doc, attribute=textClass, key=Sport, match=match:eq}}";
+ expected = "{@type=korap:filter, filter={@type=korap:doc, key=textClass, value=Sport, match=match:ne}}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
+
+ @Test
+ public void testTwoConjuncts() throws QueryException {
+ query = "textClass=Sport & year=2014";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:doc, key=year, value=2014, match=match:eq}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
+
+ @Test
+ public void testThreeConjuncts() throws QueryException {
+ query = "textClass=Sport & year=2014 & corpusID=WPD";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=year, value=2014, match=match:eq}," +
+ "{@type=korap:doc, key=corpusID, value=WPD, match=match:eq}" +
+ "]}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
+
+
+ @Test
+ public void testTwoDisjuncts() throws QueryException {
+ query = "textClass=Sport | year=2014";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:or, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:doc, key=year, value=2014, match=match:eq}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
+
+ @Test
+ public void testThreeDisjuncts() throws QueryException {
+ query = "textClass=Sport | year=2014 | corpusID=WPD";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:or, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:docGroup, relation=relation:or, operands=[" +
+ "{@type=korap:doc, key=year, value=2014, match=match:eq}," +
+ "{@type=korap:doc, key=corpusID, value=WPD, match=match:eq}" +
+ "]}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
- @Test
- public void testComplex() throws QueryException {
- query = "(textClass=Sport | textClass=ausland) & corpusID=WPD";
- String regex1 = "{@type=korap:filter, filter={@type=korap:term, attribute=textClass, key=Sport, match=match:eq}}";
- ef = new CollectionQueryTree();
- ef.process(query);
- map = JsonUtils.toJSON(ef.getRequestMap());
-// assertEquals(regex1.replaceAll(" ", ""), map.replaceAll(" ", ""));
- System.out.println("THE QUERY 1: " + map);
- }
+ @Test
+ public void testMixed() throws QueryException {
+ query = "(textClass=Sport | textClass=ausland) & corpusID=WPD";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:docGroup, relation=relation:or, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:doc, key=textClass, value=ausland, match=match:eq}" +
+ "]}," +
+ "{@type=korap:doc, key=corpusID, value=WPD, match=match:eq}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+
+ query = "(textClass=Sport & textClass=ausland) & corpusID=WPD";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:doc, key=textClass, value=ausland, match=match:eq}" +
+ "]}," +
+ "{@type=korap:doc, key=corpusID, value=WPD, match=match:eq}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+
+ query = "(textClass=Sport & textClass=ausland) | (corpusID=WPD & author=White)";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:or, operands=[" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:doc, key=textClass, value=ausland, match=match:eq}" +
+ "]}," +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=corpusID, value=WPD, match=match:eq}," +
+ "{@type=korap:doc, key=author, value=White, match=match:eq}" +
+ "]}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+
+ query = "(textClass=Sport & textClass=ausland) | (corpusID=WPD & author=White & year=2010)";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:or, operands=[" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=textClass, value=Sport, match=match:eq}," +
+ "{@type=korap:doc, key=textClass, value=ausland, match=match:eq}" +
+ "]}," +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=corpusID, value=WPD, match=match:eq}," +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=author, value=White, match=match:eq}," +
+ "{@type=korap:doc, key=year, value=2010, match=match:eq}" +
+ "]}" +
+ "]}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
- @Test
- public void testBuilder() throws QueryException {
- CollectionQueryBuilder2 builder = new CollectionQueryBuilder2();
- builder.setQuery("(textClass=Sport | textClass=ausland) & corpusID=WPD");
- System.out.println("BUILDER RESULT: " + builder.toJSON());
- }
+ @Test
+ public void testDate() throws QueryException {
+ // search for pubDate between 1990 and 2010!
+ query = "1990<pubDate<2010";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:docGroup, relation=relation:and, operands=[" +
+ "{@type=korap:doc, key=pubDate, value=1990, match=match:gt}," +
+ "{@type=korap:doc, key=pubDate, value=2010, match=match:lt}" +
+ "]}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+
+ query = "pubDate>=1990";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:doc, key=pubDate, value=1990, match=match:geq}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
-// @Test
- public void testSimpleBuilder() {
- CollectionQueryBuilder b = new CollectionQueryBuilder();
- b.addMetaFilter("corpusID", "WPD");
- b.addMetaFilter("textClass", "wissenschaft");
- b.setFilterAttributeRelation(Relation.AND);
- System.out.println("SIMPLE BUILDER RESULT: " + b.toCollections());
- }
-
- // old builder pubDate query
-// @Test
- public void testDateQuery() {
- CollectionQueryBuilder b = new CollectionQueryBuilder();
- String query = "pubDate=>" + TimeUtils.getNow().getMillis();
- query = query + " AND pubDate=<" + TimeUtils.getNow().getMillis();
- b.addMetaFilterQuery(query);
- b.setFilterAttributeRelation(Relation.AND);
- System.out.println("FINAL RESOURCE: " + b.toCollections());
- }
-
- @Test
- public void testDateNewQuery() throws QueryException {
- // search for pubDate between 1990 and 2010!
- String query = "1990<pubDate<2010 & genre=Sport";
- CollectionQueryBuilder2 q = new CollectionQueryBuilder2();
- q.setQuery(query);
- System.out.println("DATE QUERY RESULT: " + q.toJSON());
- }
-
+ @Test
+ public void testRegex() throws QueryException {
+ query = "author=/Go.*he/";
+ expected =
+ "{@type=korap:filter, filter=" +
+ "{@type=korap:doc, key=author, value=Go.*he, type=type:regex, match=match:eq}" +
+ "}";
+ cqt = new CollectionQueryTree();
+ cqt.process(query);
+ map = cqt.getRequestMap().toString();
+ assertEquals(expected.replaceAll(" ", ""), map.replaceAll(" ", ""));
+ }
}