Fixed regex in collection query & added flags to field values. Change-Id: I917afb5190aa71a408a1567fe8bb47d9f60c7c3f

commit: de23f88d82b1cce7919a8368ca335153da273400 [log] [tgz]
author: margaretha <margaretha@ids-mannheim.de> Mon Jan 08 18:32:24 2018 +0100
committer: margaretha <margaretha@ids-mannheim.de> Mon Jan 08 18:32:24 2018 +0100
tree: 53a5dfdcb412bdd350c1cd5efab1e59f3f6aa5ad
parent: 9de87c2ae3818dccb98249de0c8439350a386d4f [diff]
diff --git a/src/main/antlr/collection/CollectionQuery.g4 b/src/main/antlr/collection/CollectionQuery.g4
deleted file mode 100644
index 3116d25..0000000
--- a/src/main/antlr/collection/CollectionQuery.g4
+++ /dev/null

@@ -1,186 +0,0 @@
-grammar CollectionQuery;
-
-@header {package de.ids_mannheim.korap.query.parse.collection;}
-
-/*
- -- author: jbingel
- -- date: 2014-05-11
-*/
-
-/*
- * LEXER SECTION
- */
-/*
- Regular expression
- /x allows submatches like /^.*?RE.*?$/
- /X forces full matches
- /i means case insensitivity
- /I forces case sensitivity
-*/
-FLAG_xi      : '/' ( ('x'|'X') ('i'|'I')? );
-FLAG_ix      : '/' ( ('i'|'I') ('x'|'X')? );
- 
- 
-LRB					: '(';
-RRB					: ')';
-LB					: '[';
-RB					: ']';
-LT					: '<';
-GT					: '>';
-LEQ					: '<=';
-GEQ					: '>=';
-EQ					: '=';
-AND					: '&' | 'AND' | 'and' | 'UND' | 'und' ;
-OR					: '|' | 'OR' | 'or' | 'ODER' | 'oder' ;
-NEG					: '!';
-QMARK				: '?';
-SLASH				: '/';
-COLON				: ':';
-DASH				: '-';
-TILDE				: '~';
-NEGTILDE			: '!~';
-SINCE				: 'since';
-UNTIL				: 'until';
-IN					: 'in';
-ON					: 'on';
-WS 					: ( ' ' | '\t' | '\r' | '\n' )+ -> channel(HIDDEN);
-fragment NO_RE      : ~[ \t\/];
-fragment ALPHABET   : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
-                    | '(' | ')' | '|' | '"' | ',' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>' );
-// EM: allow ':' in ALPHABET 
-fragment ALPHA		: [a-zA-Z];
-
-
-DIGIT		: [0-9];
-DATE
-: DIGIT DIGIT DIGIT DIGIT (DASH DIGIT DIGIT (DASH DIGIT DIGIT)?)?
-;
-
-NL                  : [\r\n] -> skip;
-ws                  : WS+;
-
-WORD				: ALPHABET+;
-//WORD                : ALPHABET* ALPHA ALPHABET*;  // needs to have at least one alphabetical letter (non-numeric)
-
-
-/*
- * Regular expressions
- */
-fragment FOCC	     : '{' WS* ( [0-9]* WS* ',' WS* [0-9]+ | [0-9]+ WS* ','? ) WS* '}';
-fragment RE_char     : ~('*' | '?' | '+' | '{' | '}' | '[' | ']' | '/'
-         	            | '(' | ')' | '|' | '"' | ':' | '\'' | '\\');
-fragment RE_alter    : ((RE_char | ('(' REGEX ')') | RE_chgroup) '|' REGEX )+;
-fragment RE_chgroup  : '[' RE_char+ ']';
-fragment RE_quant	 : (RE_star | RE_plus | RE_occ) QMARK?;
-fragment RE_opt      : (RE_char | RE_chgroup | ( '(' REGEX ')')) '?';
-fragment RE_star     : (RE_char | RE_chgroup | ( '(' REGEX ')')) '*';
-fragment RE_plus     : (RE_char | RE_chgroup | ( '(' REGEX ')')) '+';
-fragment RE_occ      : (RE_char | RE_chgroup | ( '(' REGEX ')')) FOCC;
-fragment RE_group    : '(' REGEX ')';
-REGEX     		     : SLASH ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_quant | RE_group)* SLASH;
-
-/*
- * PARSER SECTION
- */
-
-regex
-: REGEX
-;
-
-date
-: DATE
-;
-
-dateOp
-: SINCE
-| UNTIL
-| IN
-| ON
-;
-
-operator
-:	(NEG? EQ) | LT | GT | LEQ | GEQ | TILDE | NEGTILDE;
-
-expr
-: constraint
-| dateconstraint
-| token
-;
-
-dateconstraint
-: field dateOp date
-//| date dateOp field dateOp date
-;
-
-constraint
-: field operator value
-;
-
-token
-: LB (term|termGroup) RB
-;
-
-term       
-: NEG* (foundry SLASH)? layer termOp key (COLON value)? flag? 
-;
-
-termOp
-: (NEG? EQ? EQ | NEG? TILDE? TILDE)
-;
-
-termGroup
-: (term | LRB termGroup RRB) booleanOp (term | LRB termGroup RRB | termGroup)
-;
-
-key
-: WORD
-| regex
-;
-
-foundry
-: WORD
-;
-
-layer
-: WORD
-;
-
-booleanOp
-: AND 
-| OR 
-;
-
-flag
-: FLAG_xi 
-| FLAG_ix
-;
-	
-field
-: WORD
-;
-	
-value
-: WORD
-| DIGIT+
-| DATE
-| multiword
-| regex
-;
-
-multiword
-: '"' ~'"'* '"'
-;
-
-relation
-:	(expr|exprGroup) booleanOp (expr|exprGroup|relation)
-; 
-
-exprGroup
-:	LRB (expr | exprGroup | relation) RRB
-;
-
-start
-: expr EOF
-| exprGroup EOF 
-| relation EOF
-;

diff --git a/src/main/antlr/collection/CollectionQueryLexer.g4 b/src/main/antlr/collection/CollectionQueryLexer.g4
new file mode 100644
index 0000000..418e87c
--- /dev/null
+++ b/src/main/antlr/collection/CollectionQueryLexer.g4

@@ -0,0 +1,69 @@
+lexer grammar CollectionQueryLexer;
+
+@header {package de.ids_mannheim.korap.query.parse.collection;}
+
+/*
+ -- author: jbingel
+ -- date: 2014-05-11
+*/
+
+/*
+ * LEXER SECTION
+ */
+ 
+/*
+ Regular expression flags (suffixes matched by FLAG_xi / FLAG_ix):
+ /x allows submatches like /^.*?RE.*?$/
+ /X forces full matches
+ /i means case insensitivity
+ /I forces case sensitivity
+*/
+FLAG_xi      : '/' (('x'|'X') ('i'|'I')? );
+FLAG_ix      : '/' (('i'|'I') ('x'|'X')? );
+ 
+QUOTE				: '"'; 
+LRB					: '(';
+RRB					: ')';
+LB					: '[';
+RB					: ']';
+LT					: '<';
+GT					: '>';
+LEQ					: '<=';
+GEQ					: '>=';
+EQ					: '=';
+AND					: '&' | 'AND' | 'and' | 'UND' | 'und' ;
+OR					: '|' | 'OR' | 'or' | 'ODER' | 'oder' ;
+NEG					: '!';
+QMARK				: '?';
+SLASH				: '/';
+COLON				: ':';
+DASH				: '-';
+TILDE				: '~';
+NEGTILDE			: '!~';
+SINCE				: 'since';
+UNTIL				: 'until';
+IN					: 'in';
+ON					: 'on';
+WS 					: ( ' ' | '\t' | '\r' | '\n' )+ -> channel(HIDDEN);
+fragment NO_RE      : ~[ \t\/];
+fragment ALPHABET   : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
+                    | '(' | ')' | '|' | '"' | ',' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>' );
+// EM: allow ':' in ALPHABET 
+fragment ALPHA		: [a-zA-Z];
+
+
+DIGIT		: [0-9];
+DATE
+: DIGIT DIGIT DIGIT DIGIT (DASH DIGIT DIGIT (DASH DIGIT DIGIT)?)?
+;
+
+NL                  : [\r\n] -> skip;
+//ws                  : WS+;
+
+WORD				: ALPHABET+;
+//WORD                : ALPHABET* ALPHA ALPHABET*;  // needs to have at least one alphabetical letter (non-numeric)
+
+REGEX     		     : SLASH .*? SLASH; // non-greedy: end at the first closing '/', so "a=/x/ & b=/y/" yields two REGEX tokens instead of one spanning both
+
+
+

diff --git a/src/main/antlr/collection/CollectionQueryParser.g4 b/src/main/antlr/collection/CollectionQueryParser.g4
new file mode 100644
index 0000000..949cbcb
--- /dev/null
+++ b/src/main/antlr/collection/CollectionQueryParser.g4

@@ -0,0 +1,116 @@
+parser grammar CollectionQueryParser;
+
+@header {package de.ids_mannheim.korap.query.parse.collection;}
+
+options
+{
+language=Java;
+tokenVocab=CollectionQueryLexer;
+}
+
+/*
+ * PARSER SECTION
+ */
+
+regex
+: REGEX
+;
+
+date
+: DATE
+;
+
+dateOp
+: SINCE
+| UNTIL
+| IN
+| ON
+;
+
+operator
+:	(NEG? EQ) | LT | GT | LEQ | GEQ | TILDE | NEGTILDE;
+
+expr
+: constraint
+| dateConstraint
+| token
+;
+
+dateConstraint
+: field dateOp date
+//| date dateOp field dateOp date
+;
+
+constraint
+: field operator value flag?
+;
+
+token
+: LB (term|termGroup) RB
+;
+
+term       
+: NEG* (foundry SLASH)? layer termOp key (COLON value)? flag? 
+;
+
+termOp
+: (NEG? EQ? EQ | NEG? TILDE? TILDE)
+;
+
+termGroup
+: (term | LRB termGroup RRB) booleanOp (term | LRB termGroup RRB | termGroup)
+;
+
+key
+: WORD
+| regex
+;
+
+foundry
+: WORD
+;
+
+layer
+: WORD
+;
+
+booleanOp
+: AND 
+| OR 
+;
+
+flag
+: FLAG_xi 
+| FLAG_ix
+;
+	
+field
+: WORD
+;
+	
+value
+: WORD
+| DIGIT+
+| DATE
+| multiword
+| regex
+;
+
+multiword
+//: '"' ~'"'* '"'
+: QUOTE ~QUOTE* QUOTE
+;
+
+relation
+:	(expr|exprGroup) booleanOp (expr|exprGroup|relation)
+; 
+
+exprGroup
+:	LRB (expr | exprGroup | relation) RRB
+;
+
+start
+: expr EOF
+| exprGroup EOF 
+| relation EOF
+;

diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
index dd8d9e0..b87d4a0 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java

@@ -135,7 +135,7 @@
             putIntoSuperObject(term);
         }
 
-        if (nodeCat.equals("dateconstraint")) {
+        if (nodeCat.equals("dateConstraint")) {
             ParseTree fieldNode = getFirstChildWithCat(node, "field");
             String field = fieldNode.getChild(0).toStringTree(parser);
             ParseTree dateOpNode = getFirstChildWithCat(node, "dateOp");
@@ -554,6 +554,12 @@
             // Tokenize input data
             ANTLRInputStream input = new ANTLRInputStream(query);
             lexer.setInputStream(input);
+            
+//            List<? extends Token> allTokens = lexer.getAllTokens();
+//            for (Token token : allTokens){
+//                System.out.println(token);
+//            }
+            
             CommonTokenStream tokens = new CommonTokenStream(lexer);
             parser = new CollectionQueryParser(tokens);
 

diff --git a/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java b/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
index 52bd4d4..563603e 100644
--- a/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
+++ b/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java

@@ -112,7 +112,20 @@
         assertEquals("match:contains", res.at("/collection/match").asText());
     }
 
-
+    @Test
+    public void testFlag () throws JsonProcessingException, IOException {
+        collection = "textClass=politik/i";
+        qs.setQuery(query, ql);
+        qs.setCollection(collection);
+        res = mapper.readTree(qs.toJSON());
+        System.out.println(res);
+        assertEquals("koral:doc", res.at("/collection/@type").asText());
+        assertEquals("textClass", res.at("/collection/key").asText());
+        assertEquals("politik", res.at("/collection/value").asText());
+        assertEquals("match:eq", res.at("/collection/match").asText());
+        
+    }
+    
 	@Test
     public void testRegex () throws JsonProcessingException, IOException {
         collection = "textClass=/politik/";
commit	de23f88d82b1cce7919a8368ca335153da273400	[log] [tgz]
author	margaretha <margaretha@ids-mannheim.de>	Mon Jan 08 18:32:24 2018 +0100
committer	margaretha <margaretha@ids-mannheim.de>	Mon Jan 08 18:32:24 2018 +0100
tree	53a5dfdcb412bdd350c1cd5efab1e59f3f6aa5ad
parent	9de87c2ae3818dccb98249de0c8439350a386d4f [diff]