Improve regex treatment of negated character classes
Change-Id: I4917fa2faae1b8a046ec24826a854b89a297d1f7
Reviewed-on: https://korap.ids-mannheim.de/gerrit/c/KorAP/Koral/+/6018
Reviewed-by: Nils Diewald <nils@diewald-online.de>
diff --git a/Changes b/Changes
index 213e3e0..dc38dac 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.38.2 2022-08-31
+ - [feature] Improve regex treatment of negated
+ character classes (diewald)
+
0.38.1 2022-10-05
- [security] Updated log4j (diewald)
- Fixed Annis OR group (resolved #96)
diff --git a/pom.xml b/pom.xml
index 10c9173..dceb9ca 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
<groupId>de.ids_mannheim.korap</groupId>
<artifactId>Koral</artifactId>
- <version>0.38.1</version>
+ <version>0.38.2</version>
<packaging>jar</packaging>
<name>Koral</name>
<url>https://korap.ids-mannheim.de</url>
diff --git a/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4 b/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4
index 420856e..2189657 100644
--- a/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4
+++ b/src/main/antlr/poliqarpplus/PoliqarpPlusLexer.g4
@@ -94,20 +94,20 @@
/* Regular expressions and Regex queries */
fragment RE_symbol : ~('*' | '?' | '+' | '{' | '}' | '[' | ']'
- | '(' | ')' | '|' | '\\' | '"' | ':' | '\'');
+ | '(' | ')' | '|' | '\\' | '"' | ':' | '\'' | '^' );
fragment RE_esc : '\\' ('.' | '*' | '?' | '+' | '{' | '}' | '[' | ']'
- | '(' | ')' | '|' | '\\' | '"' | ':' | '\'');
+ | '(' | ')' | '|' | '\\' | '"' | ':' | '\'' | '^' );
fragment RE_char : (RE_symbol | RE_esc );
fragment RE_alter : ((RE_char | ('(' RE_expr ')') | RE_chgroup) '|' RE_expr )+;
-fragment RE_chgroup : '[' RE_char+ ']';
+fragment RE_chgroup : '[' '^'? RE_char+ ']';
fragment RE_quant : (RE_star | RE_plus | RE_occ) QMARK?;
fragment RE_opt : (RE_char | RE_chgroup | ( '(' RE_expr ')')) '?';
fragment RE_star : (RE_char | RE_chgroup | ( '(' RE_expr ')')) '*';
fragment RE_plus : (RE_char | RE_chgroup | ( '(' RE_expr ')')) '+';
fragment RE_occ : (RE_char | RE_chgroup | ( '(' RE_expr ')')) FOCC;
fragment RE_group : '(' RE_expr ')';
-fragment RE_expr : ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_quant | RE_group)+;
+fragment RE_expr : ('.' | RE_char | RE_alter | RE_chgroup | RE_opt | RE_quant | RE_group | '^' )+;
fragment RE_dquote : ('"'|'„'|'“') (RE_expr | '\'' | ':' )* ('"'|'“'|'”');
REGEX : RE_dquote;
diff --git a/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java b/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java
index dd4daad..675debc 100644
--- a/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java
+++ b/src/test/java/de/ids_mannheim/korap/query/test/poliqarpplus/PoliqarpPlusQueryProcessorTest.java
@@ -258,6 +258,21 @@
assertEquals("type:regex", res.at("/query/wrap/type").asText());
assertEquals("orth", res.at("/query/wrap/layer").asText());
assertEquals("match:eq", res.at("/query/wrap/match").asText());
+
+ query = "\"d[^ae]r^\"";
+ qs.setQuery(query, "poliqarpplus");
+ res = mapper.readTree(qs.toJSON());
+ assertEquals("koral:token", res.at("/query/@type").asText());
+ assertEquals("koral:term", res.at("/query/wrap/@type").asText());
+ assertEquals("d[^ae]r^", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+ assertEquals("match:eq", res.at("/query/wrap/match").asText());
+
+ query = "\"d[a^e]r\"";
+ qs.setQuery(query, "poliqarpplus");
+ res = mapper.readTree(qs.toJSON());
+ assertEquals("302", res.at("/errors/0/0").asText());
}
@Test