Fix #42 - Support verbatim string values in Poliqarp tokens
Change-Id: I4c0dd763f37f4f8f3eb454553e007858b34947e2
diff --git a/Changes b/Changes
index 4521dbb..e1a8869 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
-0.29 2018-07-05
+0.29 2018-07-21
- Added check for errors on QuerySerializer object (diewald)
+ - Support verbatim string values in Poliqarp
+ (fixes #42; diewald)
0.28 2018-01-10
- Added some enums for koral:operation (margaretha)
diff --git a/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4 b/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4
index b3bedb1..d911511 100644
--- a/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4
+++ b/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4
@@ -51,7 +51,7 @@
/** Simple strings and Simple queries */
-WS : [ \t] -> skip ;
+WS : [ \t] -> channel(HIDDEN);
fragment FOCC : '{' WS* ( [0-9]* WS* ',' WS* [0-9]+ | [0-9]+ WS* ','? ) WS* '}';
fragment NO_RE : ~[ \t\/];
fragment ALPHABET : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
@@ -62,6 +62,7 @@
WORD : ALPHABET+;
+
/* Complex queries */
LPAREN : '[';
RPAREN : ']';
@@ -84,6 +85,8 @@
STAR : '*';
PLUS : '+';
EMPTYREL : '@';
+BACKSLASH : '\\';
+SQUOTE : '\'';
/* Regular expressions and Regex queries */
fragment RE_symbol : ~('*' | '?' | '+' | '{' | '}' | '[' | ']'
@@ -101,7 +104,9 @@
fragment RE_occ : (RE_char | RE_chgroup | ( '(' RE_expr ')')) FOCC;
fragment RE_group : '(' RE_expr ')';
fragment RE_expr : ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_quant | RE_group)+;
-fragment RE_dquote : '"' (RE_expr | '\'' | ':' )* '"';
-fragment RE_squote : '\'' (RE_expr | '\"' | ':' )* '\'';
+fragment RE_dquote : '"' (RE_expr | '\'' | ':' )* '"';
+// fragment RE_squote : '\'' (RE_expr | '\"' | ':' )* '\'';
-REGEX : ( RE_dquote | RE_squote );
+REGEX : RE_dquote;
+
+ESC_SQUOTE : BACKSLASH SQUOTE;
diff --git a/src/main/antlr/poliqarpplus/PoliqarpPlusParser.g4 b/src/main/antlr/poliqarpplus/PoliqarpPlusParser.g4
index 9a597ca..1236956 100644
--- a/src/main/antlr/poliqarpplus/PoliqarpPlusParser.g4
+++ b/src/main/antlr/poliqarpplus/PoliqarpPlusParser.g4
@@ -44,9 +44,14 @@
: REGEX
;
+verbatim
+: SQUOTE (~SQUOTE | ESC_SQUOTE)* SQUOTE;
+
+
key
: (WORD
| regex
+| verbatim
| NUMBER)
;
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
index 9f449c9..835c2df 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
@@ -276,9 +276,9 @@
TokenStream stream = parser.getTokenStream();
String stm = stream.getText(valueNode.getChild(0).getSourceInterval());
- // todo: is this correct?
+
if (stm.startsWith("\"") && stm.endsWith("\""))
- stm = stm.replaceAll("\"", "");
+ stm = stm.substring(1, stm.length()-1);
if ("regex".equals(node_cat)) {
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/PoliqarpPlusQueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/PoliqarpPlusQueryProcessor.java
index 3a2a594..7727011 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/PoliqarpPlusQueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/PoliqarpPlusQueryProcessor.java
@@ -799,8 +799,22 @@
// process foundry
if (foundryNode != null)
term.put("foundry", foundryNode.getText());
- // process key: 'normal' or regex?
- key = keyNode.getText();
+
+ // process key: 'normal', 'verbatim' or regex?
+ if (getNodeCat(keyNode.getChild(0)).equals("verbatim")) {
+
+ // Read the raw text from the token stream, so whitespace tokens on the hidden channel are kept
+ TokenStream stream = parser.getTokenStream();
+ key = stream.getText(keyNode.getChild(0).getSourceInterval());
+
+ if (key.startsWith("'") && key.endsWith("'"))
+ key = key.substring(1, key.length()-1);
+
+ }
+ else {
+ key = keyNode.getText();
+ };
+
if (getNodeCat(keyNode.getChild(0)).equals("regex")) {
isRegex = true;
term.put("type", "type:regex");
@@ -830,8 +844,10 @@
}
}
// process value
- if (valueNode != null)
+ if (valueNode != null) {
term.put("value", valueNode.getText());
+ };
+
// process operator ("match" property)
if (termOpNode != null) {
String termOp = termOpNode.getText();
diff --git a/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java b/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java
index f964f8b..9e8771e 100644
--- a/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java
+++ b/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java
@@ -97,12 +97,6 @@
res = mapper.readTree(qs.toJSON());
assertEquals(302, res.at("/errors/0/0").asInt());
assertEquals(302, res.at("/errors/1/0").asInt());
- /*
- assertEquals("koral:token", res.at("/query/@type").asText());
- assertEquals("Mann", res.at("/query/wrap/key").asText());
- assertEquals("lemma", res.at("/query/wrap/layer").asText());
- assertEquals("match:eq", res.at("/query/wrap/match").asText());
- */
}
@Test
@@ -132,6 +126,26 @@
assertEquals("match:eq", res.at("/query/wrap/match").asText());
}
+ @Test
+ public void testVerbatimKeys () throws JsonProcessingException, IOException {
+ query = "[mate/b='Der + Mann']"; // single-quoted (verbatim) key containing spaces and a '+'
+ qs.setQuery(query, "poliqarpplus");
+ assertFalse(qs.hasErrors());
+ res = mapper.readTree(qs.toJSON());
+ assertEquals("koral:token", res.at("/query/@type").asText());
+ assertEquals("koral:term", res.at("/query/wrap/@type").asText());
+ assertEquals("Der + Mann", res.at("/query/wrap/key").asText()); // expected key is the quoted text without the surrounding quotes
+ assertEquals("b", res.at("/query/wrap/layer").asText());
+ assertEquals("mate", res.at("/query/wrap/foundry").asText());
+ assertEquals("match:eq", res.at("/query/wrap/match").asText());
+
+ query = "[mate/b='D\\'Ma nn']"; // verbatim key containing a backslash-escaped single quote
+ qs.setQuery(query, "poliqarpplus");
+ assertFalse(qs.hasErrors());
+ res = mapper.readTree(qs.toJSON());
+ assertEquals("D\\'Ma nn", res.at("/query/wrap/key").asText()); // expected key still contains the backslash before the quote
+ }
+
// todo:
@Test