Support special verbatim strings in virtual corpus constraints, fixes #57
Change-Id: I3d874b8f4635abdfa9f5671ee82a39b2a938400d
diff --git a/Changes b/Changes
index f7cd0cf..e3acb5a 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.32 2018-12-13
+ - [bugfix] Support verbatim string queries (#57; diewald).
+
0.31 2018-10-31
- [bugfix] Security upgrade of Jackson for CVE-2017-17485 and
CVE-2018-7489 (diewald)
diff --git a/pom.xml b/pom.xml
index 8540867..3e5b271 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
<groupId>de.ids_mannheim.korap</groupId>
<artifactId>Koral</artifactId>
- <version>0.31</version>
+ <version>0.32</version>
<packaging>jar</packaging>
<name>Koral</name>
<url>http://maven.apache.org</url>
diff --git a/src/main/antlr/collection/CollectionQueryLexer.g4 b/src/main/antlr/collection/CollectionQueryLexer.g4
index b286288..80dc91c 100644
--- a/src/main/antlr/collection/CollectionQueryLexer.g4
+++ b/src/main/antlr/collection/CollectionQueryLexer.g4
@@ -48,14 +48,13 @@
REF : 'referTo';
WS : ( ' ' | '\t' | '\r' | '\n' )+ -> channel(HIDDEN);
fragment NO_RE : ~[ \t\/];
-fragment ALPHABET : ~('\t' | ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
- | '(' | ')' | '|' | '"' | ',' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>' );
+fragment ALPHABET : ~('\t' |
+ ' ' | '/' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
+ | '(' | ')' | '|' | ',' | '\'' | '\\' | '!' | '=' | '~' | '&' | '^' | '<' | '>'
+ | '"' );
// EM: allow ':' in ALPHABET
fragment ALPHA : [a-zA-Z];
-ESC_QUOTE : BACKSLASH QUOTE;
-
-
DIGIT : [0-9];
DATE
: DIGIT DIGIT DIGIT DIGIT (DASH DIGIT DIGIT (DASH DIGIT DIGIT)?)?
@@ -67,4 +66,5 @@
WORD : ALPHABET+;
//WORD : ALPHABET* ALPHA ALPHABET*; // needs to have at least one alphabetical letter (non-numeric)
-REGEX : SLASH .*? SLASH;
+REGEX : SLASH ~[/]*? (BACKSLASH SLASH ~[/]*?)* SLASH; // slash-delimited regex, \/ escapes a slash; quotes inside [...] are literal set members, so only / is excluded
+MULTIWORD : QUOTE ~["]*? (BACKSLASH QUOTE ~["]*?)* QUOTE; // double-quoted verbatim string, \" escapes a quote; apostrophes are allowed inside
diff --git a/src/main/antlr/collection/CollectionQueryParser.g4 b/src/main/antlr/collection/CollectionQueryParser.g4
index 3ed74d2..bbe4d32 100644
--- a/src/main/antlr/collection/CollectionQueryParser.g4
+++ b/src/main/antlr/collection/CollectionQueryParser.g4
@@ -111,8 +111,7 @@
;
multiword
-//: '"' ~'"'* '"'
-: QUOTE (~QUOTE | ESC_QUOTE)* QUOTE
+: MULTIWORD
;
relation
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
index ccb1e6e..3848f5b 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/CollectionQueryProcessor.java
@@ -318,7 +318,8 @@
// remove leading and trailing slashes
if (stm.startsWith("/") && stm.endsWith("/")) {
- stm = stm.substring(1, stm.length() - 1);
+ stm = stm.substring(1, stm.length() - 1)
+ .replaceAll("\\\\\\\\","\\\\").replaceAll("\\\\/", "/");
};
map.put("value", stm);
}
diff --git a/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java b/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
index 991f923..3c37b34 100644
--- a/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
+++ b/src/test/java/de/ids_mannheim/korap/query/test/collection/CollectionQueryProcessorTest.java
@@ -7,6 +7,7 @@
import java.io.IOException;
import org.junit.Test;
+import org.junit.Ignore;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
@@ -118,7 +119,7 @@
assertEquals("match:ne", res.at("/collection/match").asText());
}
-
+
@Test
public void testSpecialCharacters () throws JsonProcessingException,
IOException {
@@ -179,6 +180,35 @@
}
@Test
+ public void testVerbatimSpecial () throws JsonProcessingException, IOException {
+ collection = "corpusAuthor=\"Goethe, Johann Wolfgang von\""; // double-quoted value containing a comma and spaces
+ qs.setQuery(query, ql);
+ qs.setCollection(collection);
+ res = mapper.readTree(qs.toJSON());
+ assertEquals("koral:doc", res.at("/collection/@type").asText());
+ assertEquals("corpusAuthor", res.at("/collection/key").asText());
+ assertEquals("Goethe, Johann Wolfgang von", res.at("/collection/value").asText()); // expected value carries no surrounding quotes
+ assertEquals("match:eq", res.at("/collection/match").asText());
+ }
+
+ @Test
+ public void testVerbatimNonGreedy () throws JsonProcessingException, IOException {
+ collection = "foundries=\"corenlp/constituency\" & foundries=\"corenlp/morpho\""; // two double-quoted values joined by '&'
+ qs.setQuery(query, ql);
+ qs.setCollection(collection);
+ res = mapper.readTree(qs.toJSON());
+ assertEquals("koral:docGroup", res.at("/collection/@type").asText());
+ assertEquals("operation:and", res.at("/collection/operation").asText());
+ assertEquals("koral:doc", res.at("/collection/operands/0/@type").asText());
+ assertEquals("foundries", res.at("/collection/operands/0/key").asText());
+ assertEquals("corenlp/constituency", res.at("/collection/operands/0/value").asText()); // first value must end at its own closing quote
+ assertEquals("koral:doc", res.at("/collection/operands/1/@type").asText());
+ assertEquals("foundries", res.at("/collection/operands/1/key").asText());
+ assertEquals("corenlp/morpho", res.at("/collection/operands/1/value").asText());
+ }
+
+
+ @Test
public void testFlag () throws JsonProcessingException, IOException {
collection = "textClass=politik/i";
qs.setQuery(query, ql);
@@ -236,6 +266,18 @@
assertEquals("match:eq", res.at("/collection/match").asText());
}
+ @Test
+ public void testRegexEscape () throws JsonProcessingException, IOException {
+ collection = "textClass=/po\\/litik/"; // query text is textClass=/po\/litik/ (a backslash-escaped slash inside the regex)
+ qs.setQuery(query, ql);
+ qs.setCollection(collection);
+ res = mapper.readTree(qs.toJSON());
+ assertEquals("koral:doc", res.at("/collection/@type").asText());
+ assertEquals("textClass", res.at("/collection/key").asText());
+ assertEquals("po/litik", res.at("/collection/value").asText()); // expected value has the slash without the escaping backslash
+ assertEquals("type:regex", res.at("/collection/type").asText());
+ assertEquals("match:eq", res.at("/collection/match").asText());
+ }
@Test