Fixed regex in collection query & added flags to field values.
Change-Id: I917afb5190aa71a408a1567fe8bb47d9f60c7c3f
diff --git a/src/main/antlr/collection/CollectionQuery.g4 b/src/main/antlr/collection/CollectionQuery.g4
deleted file mode 100644
index 3116d25..0000000
--- a/src/main/antlr/collection/CollectionQuery.g4
+++ /dev/null
@@ -1,186 +0,0 @@
-grammar CollectionQuery;
-
-@header {package de.ids_mannheim.korap.query.parse.collection;}
-
-/*
- -- author: jbingel
- -- date: 2014-05-11
-*/
-
-/*
- * LEXER SECTION
- */
-/*
- Regular expression
- /x allows submatches like /^.*?RE.*?$/
- /X forces full matches
- /i means case insensitivity
- /I forces case sensitivity
-*/
-FLAG_xi : '/' ( ('x'|'X') ('i'|'I')? );
-FLAG_ix : '/' ( ('i'|'I') ('x'|'X')? );
-
-
-LRB : '(';
-RRB : ')';
-LB : '[';
-RB : ']';
-LT : '<';
-GT : '>';
-LEQ : '<=';
-GEQ : '>=';
-EQ : '=';
-AND : '&' | 'AND' | 'and' | 'UND' | 'und' ;
-OR : '|' | 'OR' | 'or' | 'ODER' | 'oder' ;
-NEG : '!';
-QMARK : '?';
-SLASH : '/';
-COLON : ':';
-DASH : '-';
-TILDE : '~';
-NEGTILDE : '!~';
-SINCE : 'since';
-UNTIL : 'until';
-IN : 'in';
-ON : 'on';
-WS : ( ' ' | '\t' | '\r' | '\n' )+ -> channel(HIDDEN);
-fragment NO_RE : ~[ \t\/];
-fragment ALPHABET : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
- | '(' | ')' | '|' | '"' | ',' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>' );
-// EM: allow ':' in ALPHABET
-fragment ALPHA : [a-zA-Z];
-
-
-DIGIT : [0-9];
-DATE
-: DIGIT DIGIT DIGIT DIGIT (DASH DIGIT DIGIT (DASH DIGIT DIGIT)?)?
-;
-
-NL : [\r\n] -> skip;
-ws : WS+;
-
-WORD : ALPHABET+;
-//WORD : ALPHABET* ALPHA ALPHABET*; // needs to have at least one alphabetical letter (non-numeric)
-
-
-/*
- * Regular expressions
- */
-fragment FOCC : '{' WS* ( [0-9]* WS* ',' WS* [0-9]+ | [0-9]+ WS* ','? ) WS* '}';
-fragment RE_char : ~('*' | '?' | '+' | '{' | '}' | '[' | ']' | '/'
- | '(' | ')' | '|' | '"' | ':' | '\'' | '\\');
-fragment RE_alter : ((RE_char | ('(' REGEX ')') | RE_chgroup) '|' REGEX )+;
-fragment RE_chgroup : '[' RE_char+ ']';
-fragment RE_quant : (RE_star | RE_plus | RE_occ) QMARK?;
-fragment RE_opt : (RE_char | RE_chgroup | ( '(' REGEX ')')) '?';
-fragment RE_star : (RE_char | RE_chgroup | ( '(' REGEX ')')) '*';
-fragment RE_plus : (RE_char | RE_chgroup | ( '(' REGEX ')')) '+';
-fragment RE_occ : (RE_char | RE_chgroup | ( '(' REGEX ')')) FOCC;
-fragment RE_group : '(' REGEX ')';
-REGEX : SLASH ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_quant | RE_group)* SLASH;
-
-/*
- * PARSER SECTION
- */
-
-regex
-: REGEX
-;
-
-date
-: DATE
-;
-
-dateOp
-: SINCE
-| UNTIL
-| IN
-| ON
-;
-
-operator
-: (NEG? EQ) | LT | GT | LEQ | GEQ | TILDE | NEGTILDE;
-
-expr
-: constraint
-| dateconstraint
-| token
-;
-
-dateconstraint
-: field dateOp date
-//| date dateOp field dateOp date
-;
-
-constraint
-: field operator value
-;
-
-token
-: LB (term|termGroup) RB
-;
-
-term
-: NEG* (foundry SLASH)? layer termOp key (COLON value)? flag?
-;
-
-termOp
-: (NEG? EQ? EQ | NEG? TILDE? TILDE)
-;
-
-termGroup
-: (term | LRB termGroup RRB) booleanOp (term | LRB termGroup RRB | termGroup)
-;
-
-key
-: WORD
-| regex
-;
-
-foundry
-: WORD
-;
-
-layer
-: WORD
-;
-
-booleanOp
-: AND
-| OR
-;
-
-flag
-: FLAG_xi
-| FLAG_ix
-;
-
-field
-: WORD
-;
-
-value
-: WORD
-| DIGIT+
-| DATE
-| multiword
-| regex
-;
-
-multiword
-: '"' ~'"'* '"'
-;
-
-relation
-: (expr|exprGroup) booleanOp (expr|exprGroup|relation)
-;
-
-exprGroup
-: LRB (expr | exprGroup | relation) RRB
-;
-
-start
-: expr EOF
-| exprGroup EOF
-| relation EOF
-;
diff --git a/src/main/antlr/collection/CollectionQueryLexer.g4 b/src/main/antlr/collection/CollectionQueryLexer.g4
new file mode 100644
index 0000000..418e87c
--- /dev/null
+++ b/src/main/antlr/collection/CollectionQueryLexer.g4
@@ -0,0 +1,69 @@
+lexer grammar CollectionQueryLexer;
+
+@header {package de.ids_mannheim.korap.query.parse.collection;}
+
+/*
+ -- author: jbingel
+ -- date: 2014-05-11
+*/
+
+/*
+ * LEXER SECTION
+ */
+
+/*
+ Regular expression
+ /x allows submatches like /^.*?RE.*?$/
+ /X forces full matches
+ /i means case insensitivity
+ /I forces case sensitivity
+*/
+FLAG_xi : '/' (('x'|'X') ('i'|'I')? );
+FLAG_ix : '/' (('i'|'I') ('x'|'X')? );
+
+QUOTE : '"';
+LRB : '(';
+RRB : ')';
+LB : '[';
+RB : ']';
+LT : '<';
+GT : '>';
+LEQ : '<=';
+GEQ : '>=';
+EQ : '=';
+AND : '&' | 'AND' | 'and' | 'UND' | 'und' ;
+OR : '|' | 'OR' | 'or' | 'ODER' | 'oder' ;
+NEG : '!';
+QMARK : '?';
+SLASH : '/';
+COLON : ':';
+DASH : '-';
+TILDE : '~';
+NEGTILDE : '!~';
+SINCE : 'since';
+UNTIL : 'until';
+IN : 'in';
+ON : 'on';
+WS : ( ' ' | '\t' | '\r' | '\n' )+ -> channel(HIDDEN);
+fragment NO_RE : ~[ \t\/];
+fragment ALPHABET : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
+ | '(' | ')' | '|' | '"' | ',' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>' );
+// EM: allow ':' in ALPHABET
+fragment ALPHA : [a-zA-Z];
+
+
+DIGIT : [0-9];
+DATE
+: DIGIT DIGIT DIGIT DIGIT (DASH DIGIT DIGIT (DASH DIGIT DIGIT)?)?
+;
+
+NL : [\r\n] -> skip;
+//ws : WS+;
+
+WORD : ALPHABET+;
+//WORD : ALPHABET* ALPHA ALPHABET*; // needs to have at least one alphabetical letter (non-numeric)
+
+REGEX : SLASH .*? SLASH; // non-greedy: end at the first closing '/', so "a=/x/ & b=/y/" yields two REGEX tokens instead of one
+
+
+
diff --git a/src/main/antlr/collection/CollectionQueryParser.g4 b/src/main/antlr/collection/CollectionQueryParser.g4
new file mode 100644
index 0000000..949cbcb
--- /dev/null
+++ b/src/main/antlr/collection/CollectionQueryParser.g4
@@ -0,0 +1,116 @@
+parser grammar CollectionQueryParser;
+
+@header {package de.ids_mannheim.korap.query.parse.collection;}
+
+options
+{
+language=Java;
+tokenVocab=CollectionQueryLexer;
+}
+
+/*
+ * PARSER SECTION
+ */
+
+regex
+: REGEX
+;
+
+date
+: DATE
+;
+
+dateOp
+: SINCE
+| UNTIL
+| IN
+| ON
+;
+
+operator
+: (NEG? EQ) | LT | GT | LEQ | GEQ | TILDE | NEGTILDE;
+
+expr
+: constraint
+| dateConstraint
+| token
+;
+
+dateConstraint
+: field dateOp date
+//| date dateOp field dateOp date
+;
+
+constraint
+: field operator value flag?
+;
+
+token
+: LB (term|termGroup) RB
+;
+
+term
+: NEG* (foundry SLASH)? layer termOp key (COLON value)? flag?
+;
+
+termOp
+: (NEG? EQ? EQ | NEG? TILDE? TILDE)
+;
+
+termGroup
+: (term | LRB termGroup RRB) booleanOp (term | LRB termGroup RRB | termGroup)
+;
+
+key
+: WORD
+| regex
+;
+
+foundry
+: WORD
+;
+
+layer
+: WORD
+;
+
+booleanOp
+: AND
+| OR
+;
+
+flag
+: FLAG_xi
+| FLAG_ix
+;
+
+field
+: WORD
+;
+
+value
+: WORD
+| DIGIT+
+| DATE
+| multiword
+| regex
+;
+
+multiword
+//: '"' ~'"'* '"'
+: QUOTE ~QUOTE* QUOTE
+;
+
+relation
+: (expr|exprGroup) booleanOp (expr|exprGroup|relation)
+;
+
+exprGroup
+: LRB (expr | exprGroup | relation) RRB
+;
+
+start
+: expr EOF
+| exprGroup EOF
+| relation EOF
+;
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
index dd8d9e0..b87d4a0 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
@@ -135,7 +135,7 @@
putIntoSuperObject(term);
}
- if (nodeCat.equals("dateconstraint")) {
+ if (nodeCat.equals("dateConstraint")) {
ParseTree fieldNode = getFirstChildWithCat(node, "field");
String field = fieldNode.getChild(0).toStringTree(parser);
ParseTree dateOpNode = getFirstChildWithCat(node, "dateOp");
@@ -554,6 +554,12 @@
// Tokenize input data
ANTLRInputStream input = new ANTLRInputStream(query);
lexer.setInputStream(input);
+
+// List<? extends Token> allTokens = lexer.getAllTokens();
+// for (Token token : allTokens){
+// System.out.println(token);
+// }
+
CommonTokenStream tokens = new CommonTokenStream(lexer);
parser = new CollectionQueryParser(tokens);
diff --git a/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java b/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
index 52bd4d4..563603e 100644
--- a/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
+++ b/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
@@ -112,7 +112,20 @@
assertEquals("match:contains", res.at("/collection/match").asText());
}
-
+ @Test
+ public void testFlag () throws JsonProcessingException, IOException {
+ collection = "textClass=politik/i";
+ qs.setQuery(query, ql);
+ qs.setCollection(collection);
+ res = mapper.readTree(qs.toJSON());
+ System.out.println(res);
+ assertEquals("koral:doc", res.at("/collection/@type").asText());
+ assertEquals("textClass", res.at("/collection/key").asText());
+ assertEquals("politik", res.at("/collection/value").asText());
+ assertEquals("match:eq", res.at("/collection/match").asText());
+
+ }
+
@Test
public void testRegex () throws JsonProcessingException, IOException {
collection = "textClass=/politik/";