Issue #66: REG: missing #REG-Operator implemented: rebased.
Added new tests.
Moved general purpose methods to StringUtils.java.
Change-Id: I42f12251a73511fff07b48e06f6018ba1e181433
Reviewed-on: https://korap.ids-mannheim.de/gerrit/c/KorAP/Koral/+/7658
Reviewed-by: Nils Diewald <nils@diewald-online.de>
diff --git a/pom.xml b/pom.xml
index 6bdd557..8d22bf5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -177,6 +177,7 @@
<exclude>**/c2ps_opIN.java</exclude>
<exclude>**/c2ps_opOV.java</exclude>
<exclude>**/c2ps_opPROX.java</exclude>
+ <exclude>**/c2ps_opREG.java</exclude>
<exclude>**/c2ps_opWF.java</exclude>
<exclude>**/c2ps_optCase.java</exclude>
<exclude>**/.gitignore</exclude>
diff --git a/src/main/antlr/cosmas/c2ps.g b/src/main/antlr/cosmas/c2ps.g
index c264ea6..8908a49 100644
--- a/src/main/antlr/cosmas/c2ps.g
+++ b/src/main/antlr/cosmas/c2ps.g
@@ -1,16 +1,20 @@
- // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-// //
-// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax) //
-// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf). //
-// 17.12.12/FB //
-// v-0.6 //
-// TODO: //
-// - se1: Einsetzen des Default-Operators in den kumulierten AST. //
+// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+//
+// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax)
+// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf).
+// 17.12.12/FB
+// v-0.6
+// TODO:
+// - se1: Einsetzen des Default-Operators in den kumulierten AST.
+//
+// v0.7 - 25.07.23/FB
+// - added: #REG(x)
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
grammar c2ps;
options { output=AST; backtrack=true; k=5;}
+// tokens that will appear as node names in the resulting AST:
tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
ARG1; ARG2;
OPWF; OPLEM; OPANNOT;
@@ -21,6 +25,7 @@
OPNOT;
OPEXPR1;
OPMORPH; OPELEM;
+ OPREG;
}
@header {package de.ids_mannheim.korap.query.parse.cosmas;}
@@ -76,6 +81,14 @@
OP_OV : '#OV' | '#OV(' OP_OV_OPTS? ')' ;
+// OP_REG: #REG(abc['"]), #REG('abc\'s') or #REG("abc\"s"); blanks around the argument are allowed.
+// Alternatives: 1. '...' with \' escapes, 2. "..." with \" escapes, 3. unquoted text up to the first ')'.
+OP_REG : '#REG(' ' '* '\'' ('\\\''|~'\'')+ '\'' (' ')* ')'
+ |
+ '#REG(' ' '* '"' ('\\"'|~'"')+ '"' (' ')* ')'
+ |
+ '#REG(' ' '* ~('\''|'"'|' ') (~(')'))* ')';
+
// EAVEXP wird hier eingesetzt für eine beliebige Sequenz von Zeichen bis zu ')'.
fragment OP_IN_OPTS
: EAVEXPR ;
@@ -241,7 +254,7 @@
// OP1: Suchoperatoren mit 1 Argument:
// -----------------------------------
-op1 : opBEG | opEND | opNHIT | opALL | opBED;
+op1 : opBEG | opEND | opNHIT | opALL | opBED | opREG;
// #BED(serchExpr, B).
// B muss nachträglich in einer lokalen Grammatik überprüft werden.
@@ -259,3 +272,6 @@
opNHIT : ( '#NHIT(' | '#INKLUSIVE(' ) searchExpr ')' -> ^(OPNHIT searchExpr) ;
opALL : ( '#ALL(' | '#EXKLUSIVE(' ) searchExpr ')' -> ^(OPALL searchExpr) ;
+// opREG: a #REG(...) token becomes an OPREG node; its child is built by c2ps_opREG.encode() from the token text.
+opREG : OP_REG -> ^(OPREG {c2ps_opREG.encode($OP_REG.text, OPREG)}) ;
+
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
index fb9df4e..35f6437 100644
--- a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
+++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
@@ -17,7 +17,8 @@
c2ps_opBEDParser.opBEDOpts_return c2PQReturn = null;
/*
- System.out.println("check opBED: " + index + ": " + input);
+ System.out.format("opBED: check: input='%s', index=%d.\n", input, index);
+ System.out.format("opBED: tokens ='%s'.\n", tokens.toString());
System.out.flush();
*/
@@ -68,7 +69,7 @@
public static void main (String args[]) throws Exception {
- String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)" };
+ String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)"};
Tree tree;
for (int i = 0; i < input.length; i++) {
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
new file mode 100644
index 0000000..a798647
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
@@ -0,0 +1,235 @@
+package de.ids_mannheim.korap.query.parse.cosmas;
+
+import org.antlr.runtime.*;
+import org.antlr.runtime.tree.*;
+
+import de.ids_mannheim.korap.query.serialize.util.Antlr3DescriptiveErrorListener;
+import de.ids_mannheim.korap.util.StringUtils;
+
+/*
+ * 1. transforms and encodes a COSMAS II regular expression operator #REG(regexpr)
+ * into an AST -> encode().
+ * 2. the AST is later transformed into the corresponding koral:token/koral:term,
+ * e.g. #REG(abc[']?s) ->
+ * {
+ * "@type": "koral:term",
+ * "match": "match:eq",
+ * "type" : "type:regex",
+ * "key" : "abc[']?s",
+ * "layer": "orth"
+ * }...
+ *
+ * - see doc: http://korap.github.io/Koral/
+ * - generation of koral:term -> Cosmas2QueryProcessor.processOPREG().
+ * 06.09.23/FB
+ */
+
+public class c2ps_opREG
+
+{
+    private static boolean DEBUG = false;
+
+    /**
+     * Turns the text of a {@code #REG(...)} token into a single AST leaf that carries
+     * the bare regular expression, e.g. {@code #REG('abc(d|e)*')} -> {@code abc(d|e)*}.
+     * The steps are:
+     * <ol>
+     * <li>strip the leading {@code #REG(} (case-insensitive) and the trailing {@code )};</li>
+     * <li>remove blanks around the argument;</li>
+     * <li>remove an opening {@code '} or {@code "} and, if the text ends with the same quote, that one too;</li>
+     * <li>remove blanks inside the quotes;</li>
+     * <li>unescape {@code \'}, {@code \"} and {@code \\}.</li>
+     * </ol>
+     * The returned text is not re-quoted, so neither {@code "} nor {@code \} is escaped in it.
+     *
+     * @param input token text, expected to look like {@code #REG(...)}
+     * @param tokenType token type given to the returned node
+     * @return a node holding the decoded expression, or holding {@code input} unchanged
+     *         when it is {@code null} or not of the form {@code #REG(...)}
+     */
+    public static Tree encode (String input, int tokenType)
+
+    {
+        final String prefix = "#REG(";
+
+        if( DEBUG )
+        {
+            System.out.printf("opREG.encode: input = >>%s<<, token type=%d.\n", input, tokenType);
+            System.out.flush();
+        }
+
+        // regionMatches() is simply false for inputs shorter than the prefix, whereas
+        // substring(0, 5) would throw a StringIndexOutOfBoundsException for them.
+        if( input == null
+            || !input.regionMatches(true, 0, prefix, 0, prefix.length())
+            || input.charAt(input.length()-1) != ')' )
+        {
+            // error: '#REG(' and ')' not found: return input unchanged.
+            if( DEBUG ) System.out.printf("opREG.encode: unexpected input = >>%s<<: nothing encoded!\n", input);
+            return new CommonTree(new CommonToken(tokenType, input));
+        }
+
+        // argument between '#REG(' and the final ')':
+        StringBuffer sb = new StringBuffer(input.substring(prefix.length(), input.length()-1));
+
+        // a. remove blanks around ".." and '..',
+        //    e.g. #REG( ' abc ' ) -> #REG(' abc ').
+        StringUtils.removeBlanksAtBothSides(sb);
+
+        // remove the enclosing quotes pairwise; the length checks keep charAt()
+        // inside the buffer for arguments like >><< or >>'<<.
+        if( sb.length() > 0 && (sb.charAt(0) == '\'' || sb.charAt(0) == '"') )
+        {
+            char quote = sb.charAt(0);
+            sb.deleteCharAt(0);
+            if( sb.length() > 0 && sb.charAt(sb.length()-1) == quote )
+                sb.deleteCharAt(sb.length()-1);
+        }
+
+        // b. remove blanks inside '..' or "..",
+        //    e.g. #REG(' abc ') -> #REG('abc'):
+        StringUtils.removeBlanksAtBothSides(sb);
+
+        // unescape >>'<<, >>"<< and >>\<<, e.g. #REG('that\'s') -> >>that's<<.
+        // After the backslash is deleted, i++ steps over the char it protected.
+        for(int i=0; i<sb.length()-1; i++)
+        {
+            char next = sb.charAt(i+1);
+
+            if( sb.charAt(i) == '\\' && (next == '\'' || next == '"' || next == '\\') )
+                sb.deleteCharAt(i);
+        }
+
+        if( DEBUG )
+            System.out.printf("opREG.encode: encoded = >>%s<<.\n", sb.toString());
+
+        return new CommonTree(new CommonToken(tokenType, sb.toString()));
+
+    } // encode
+
+    /**
+     * Prints the tokens the lexer produces for {@code query} to stdout (debugging aid).
+     *
+     * <p>A separate lexer and CommonTokenStream are built here because
+     * {@code tokens.fill()} consumes all tokens, so the stream cannot be shared
+     * with the parser in {@link #check(String, int)}.
+     * Note that the printed lines are labelled "opREG.check".
+     *
+     * @param query the query to tokenize
+     * @param errorListener error reporter installed on the lexer
+     */
+    private static void printTokens(String query, Antlr3DescriptiveErrorListener errorListener)
+
+    {
+        ANTLRStringStream ss = new ANTLRStringStream(query);
+        c2psLexer lex = new c2psLexer(ss);
+        org.antlr.runtime.CommonTokenStream
+            tokens = new org.antlr.runtime.CommonTokenStream(lex); // v3
+
+        lex.setErrorReporter(errorListener);
+
+        // get all tokens from lexer:
+        tokens.fill();
+
+        System.out.printf("opREG.check: no. of tokens = %d.\n", tokens.size());
+        for(int i=0; i<tokens.size(); i++)
+        {
+            System.out.printf("opREG.check: token[%2d] = %s.\n", i, tokens.get(i).getText());
+        }
+
+    } // printTokens
+
+    /**
+     * Parses {@code query} with the COSMAS II grammar and returns the resulting AST.
+     * Used by {@link #main(String[])}.
+     *
+     * <p>Lexer and parser report their errors to an
+     * {@link Antlr3DescriptiveErrorListener} created for this query.
+     *
+     * @param query the query to parse, e.g. {@code #REG(abc)}
+     * @param index not used
+     * @return the AST, or {@code null} when a RecognitionException was thrown
+     */
+    public static Tree check (String query, int index)
+
+    {
+        ANTLRStringStream ss = new ANTLRStringStream(query);
+        c2psLexer lex = new c2psLexer(ss);
+        org.antlr.runtime.CommonTokenStream
+            tokens = new org.antlr.runtime.CommonTokenStream(lex); // v3
+        c2psParser g = new c2psParser(tokens);
+        Tree tree = null;
+        Antlr3DescriptiveErrorListener errorListener =
+            new Antlr3DescriptiveErrorListener(query);
+
+        // Use the custom error reporter for lexer and parser, or the program will
+        // break on broken input, e.g. >>#REG(\" a"s\")<<.
+        lex.setErrorReporter(errorListener);
+        g.setErrorReporter(errorListener);
+
+        if( DEBUG )
+        {
+            // printTokens() builds its own token stream, so the one used by the
+            // parser below is left untouched.
+            printTokens(query, errorListener);
+            System.out.flush();
+        }
+
+        // parse; a RecognitionException leaves tree == null:
+        try {
+            c2psParser.c2ps_query_return c2Return = g.c2ps_query();
+
+            tree = (Tree) c2Return.getTree();
+        }
+        catch (RecognitionException e) {
+            // tree stays null; callers have to cope with that.
+            System.err.printf("c2ps_opREG.check: Recognition Exception: %s\n", e);
+        }
+
+        return tree;
+
+    } // check
+
+
+    /**
+     * Demo driver: parses a fixed list of #REG queries and prints each AST.
+     */
+    public static void main (String args[]) throws Exception
+
+    {
+        // Note: inside a Java string literal \' is just ' and \\ is a single \.
+        String input[] = { "#REG(abc)",
+            "#REG(def's)",
+            "#REG( def's )", // all blanks should be removed.
+            "#REG( ' def\\'s ' )", // same
+            "#REG( \" def's \" )", // same
+            "#REG(abc[\"]ef)",
+            "#REG('abc')",
+            "#REG('abc\')", // user input = #REG('abc'), same as the entry above.
+            "#REG('abc\'')", // user input = #REG('abc'').
+            "#REG('abc\\')", // user input = #REG('abc\').
+            "#REG((a|b))", // broken input, should use ".." or '..'.
+            "#REG('(a|b)')", // OK.
+            "#REG(\"(a|b)\")", // OK.
+            "#REG(^[A-Z]+abc[\']*ung$)",
+            "#REG('ab(cd|ef)*')",
+            "#REG('abc(def|g)*[)(]')",
+            "#REG(\"abc(def|g)*[)(]\")",
+            "#REG('abc[\"]')", // user input = #REG('abc["]').
+            "#REG(\"abc[\\\"]\")", // user input = #REG("abc[\"]").
+            "#REG(\"abc[\\\"]\")", // user input = #REG("abc[\"]"), same as the entry above.
+            "#REG(\"abc[\\\\\"]\")" // user input = #REG("abc[\\"]").
+        };
+        Tree tree;
+
+        for (int i = 0; i < input.length; i++)
+        {
+            System.out.printf("c2ps_opREG: Parsing input %02d: >>%s<<\n", i, input[i]);
+            tree = check(input[i], 0);
+            // check() returns null after a RecognitionException: do not dereference it.
+            System.out.printf("c2ps_opREG: tree %02d: >>%s<<.\n\n", i, tree == null ? "null" : tree.toStringTree());
+        }
+
+    } // main
+
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
index 69a6293..8bbfa35 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
@@ -15,6 +15,7 @@
import de.ids_mannheim.korap.query.serialize.util.KoralObjectGenerator;
import de.ids_mannheim.korap.query.serialize.util.ResourceMapper;
import de.ids_mannheim.korap.query.serialize.util.StatusCodes;
+import de.ids_mannheim.korap.util.StringUtils;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.RecognitionException;
@@ -127,7 +128,6 @@
public static Pattern wildcardPlusPattern = Pattern.compile("([+])");
public static Pattern wildcardQuestionPattern = Pattern.compile("([?])");
-
/**
* @param tree
* The syntax tree as returned by ANTLR
@@ -142,6 +142,7 @@
process(query);
if (DEBUG) {
log.debug(">>> " + requestMap.get("query") + " <<<");
+ System.out.printf("Cosmas2QueryProcessor: >>%s<<.\n", requestMap.get("query"));
}
}
@@ -151,14 +152,19 @@
Tree tree = null;
tree = parseCosmasQuery(query);
if (DEBUG) {
+ System.out.printf("\nProcessing COSMAS II query: %s.\n\n", query);
log.debug("Processing CosmasII query: " + query);
}
- if (tree != null) {
- if (DEBUG) {
- log.debug("ANTLR parse tree: " + tree.toStringTree());
- }
+ if (tree != null)
+ {
+
+ if (DEBUG) {
+ log.debug("ANTLR parse tree: " + tree.toStringTree());
+ System.out.printf("\nANTLR parse tree: %s.\n\n", tree.toStringTree());
+ }
+
processNode(tree);
- }
+ }
}
@@ -278,6 +284,11 @@
if (nodeCat.equals("OPBED")) {
processOPBED(node);
}
+
+ if (nodeCat.equals("OPREG")) {
+ processOPREG(node);
+ }
+
objectsToPop.push(stackedObjects);
toWrapsToPop.push(stackedToWrap);
@@ -444,6 +455,88 @@
}
}
+    /**
+     * Converts an OPREG node into a koral:token that wraps a regex koral:term.
+     *
+     * <p>The input node has the structure {@code (OPREG regexpr)}; the text of its
+     * first child becomes the key of the term, e.g. {@code #REG(abc[']?s)} yields:
+     * <pre>
+     * {
+     *   "@type": "koral:term",
+     *   "match": "match:eq", // optional
+     *   "type" : "type:regex",
+     *   "key"  : "abc[']?s",
+     *   "layer": "orth"
+     * }
+     * </pre>
+     *
+     * <p>The key is taken over as is: enclosing quotes and escapes of the #REG()
+     * argument were already resolved when the node was built (see
+     * c2ps_opREG.encode()), and escaped sequences must not be touched again.
+     *
+     * <p>Side effects: the new token is pushed onto {@code objectStack},
+     * {@code stackedObjects} is incremented and the token is handed to
+     * {@code putIntoSuperObject()} with stack position 1.
+     *
+     * <p>See doc: http://korap.github.io/Koral/
+     *
+     * 06.09.23/FB
+     *
+     * @param node
+     *            the OPREG node; nothing is generated when it has no child
+     */
+    private void processOPREG (Tree node)
+
+    {
+        // empty case first (is that possible?): the debug output below
+        // dereferences the child, which is null when the node has no children.
+        if( node.getChildCount() < 1 )
+            return;
+
+        Tree
+            nodeChild = node.getChild(0);
+
+        if( DEBUG )
+        {
+            System.out.printf("Debug: processOPREG: child: >>%s<< cat=%s type=%d.\n",
+                    nodeChild.getText(), getNodeCat(node), nodeChild.getType());
+        }
+
+        // see processOPWF_OPWF_OPLEM
+        // for how to insert regexpr into Koral JSON-LD
+
+        Map<String, Object>
+            token = KoralObjectGenerator.makeToken();
+
+        objectStack.push(token);
+        stackedObjects++;
+
+        Map<String, Object>
+            fieldMap = KoralObjectGenerator.makeTerm();
+
+        token.put("wrap", fieldMap);
+
+        // make category-specific fieldMap entry:
+        // the child built by c2ps_opREG.encode() has no children, so toStringTree() is its text.
+        // No >>"<< is removed here, as it is important to keep the escaped
+        // sequences of the argument of #REG().
+        String
+            value = nodeChild.toStringTree();
+
+        if( DEBUG )
+            System.out.printf("Debug: processOPREG: key: >>%s<<.\n", value);
+
+        // regex term on the orth layer, matched for equality:
+        fieldMap.put("key", value);
+        fieldMap.put("layer", "orth");
+        fieldMap.put("type", "type:regex");
+        fieldMap.put("match", "match:eq");
+
+        // decide where to put (objPos=1, not clear why, but it works only like that - 20.09.23/FB):
+        putIntoSuperObject(token,1);
+
+    } // processOPREG
+
private void processOPNHIT (Tree node) {
Integer[] classRef = new Integer[] { classCounter + 128 + 1,
@@ -1511,19 +1604,40 @@
@SuppressWarnings("unchecked")
- private void putIntoSuperObject (Map<String, Object> object,
- int objStackPosition) {
- if (objectStack.size() > objStackPosition) {
+ private void putIntoSuperObject (Map<String, Object> object, int objStackPosition)
+
+ {
+ if( DEBUG )
+ {
+ System.out.printf("Debug: putIntosuperObject(<>,int): objectStack.size=%d objStackPos=%d object=%s.\n",
+ objectStack.size(), objStackPosition, object == null ? "null" : "not null");
+
+ if( objectStack != null && objectStack.size() > 0 )
+ System.out.printf("Debug: putIntosuperObject: objectStack = %s.\n", objectStack.toString());
+
+ if( invertedOperandsLists != null )
+ System.out.printf("Debug: putIntosuperObject: invertedOperandsLists: [%s].\n", invertedOperandsLists.toString());
+ }
+
+
+ if (objectStack.size() > objStackPosition)
+ {
ArrayList<Object> topObjectOperands =
- (ArrayList<Object>) objectStack.get(objStackPosition)
- .get("operands");
- if (!invertedOperandsLists.contains(topObjectOperands)) {
+ (ArrayList<Object>) objectStack.get(objStackPosition).get("operands");
+
+ if( DEBUG )
+ System.out.printf("Debug: putIntosuperObject: topObjectOperands = [%s].\n", topObjectOperands == null ? "null" : "not null");
+
+ objectStack.get(objStackPosition);
+
+ if (!invertedOperandsLists.contains(topObjectOperands))
+ {
topObjectOperands.add(object);
- }
+ }
else {
topObjectOperands.add(0, object);
- }
- }
+ }
+ }
else {
requestMap.put("query", object);
}
@@ -1618,7 +1732,8 @@
private Tree parseCosmasQuery (String query) {
- query = rewritePositionQuery(query);
+
+ query = rewritePositionQuery(query);
Tree tree = null;
Antlr3DescriptiveErrorListener errorListener =
new Antlr3DescriptiveErrorListener(query);
@@ -1627,16 +1742,23 @@
c2psLexer lex = new c2psLexer(ss);
org.antlr.runtime.CommonTokenStream tokens =
new org.antlr.runtime.CommonTokenStream(lex); // v3
+
+ // System.out.printf("parseCosmasQuery: tokens = %d\n", tokens.size());
+ // System.out.printf("parseCosmasQuery: tokens = %s\n", tokens.toString());
+
parser = new c2psParser(tokens);
+
// Use custom error reporters
lex.setErrorReporter(errorListener);
((c2psParser) parser).setErrorReporter(errorListener);
+
c2psParser.c2ps_query_return c2Return =
((c2psParser) parser).c2ps_query(); // statt t().
+
// AST Tree anzeigen:
tree = (Tree) c2Return.getTree();
if (DEBUG) log.debug(tree.toStringTree());
- }
+ }
catch (RecognitionException e) {
log.error(
"Could not parse query. Please make sure it is well-formed.");
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java b/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java
index 8294dca..94bf15d 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/QuerySerializer.java
@@ -73,7 +73,9 @@
private List<Object> errors;
private List<Object> warnings;
private List<Object> messages;
-
+
+ private boolean DEBUG = false;
+
public QuerySerializer () {
this.errors = new ArrayList<>();
this.warnings = new ArrayList<>();
@@ -102,6 +104,8 @@
int i = 0;
String[] queries = null;
String ql = "poliqarpplus";
+ boolean bDebug = true;
+
if (args.length < 2) {
System.err
.println("Usage: QuerySerializer \"query\" queryLanguage");
@@ -114,7 +118,9 @@
for (String q : queries) {
i++;
try {
- jg.run(q, ql);
+ if( bDebug ) System.out.printf("QuerySerialize: query = >>%s<< lang = %s.\n", q, ql);
+
+ jg.run(q, ql);
System.out.println();
}
catch (NullPointerException npe) {
@@ -140,6 +146,9 @@
* @throws IOException
*/
public void run (String query, String queryLanguage) throws IOException {
+
+ ast.verbose = DEBUG ? true : false; // debugging: 01.09.23/FB
+
if (queryLanguage.equalsIgnoreCase("poliqarp")) {
ast = new PoliqarpPlusQueryProcessor(query);
}
diff --git a/src/main/java/de/ids_mannheim/korap/util/StringUtils.java b/src/main/java/de/ids_mannheim/korap/util/StringUtils.java
new file mode 100644
index 0000000..29410d1
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/util/StringUtils.java
@@ -0,0 +1,157 @@
+package de.ids_mannheim.korap.util;
+
+/* General String manipulation functions, moved here
+ * from de.ids_mannheim.korap.query.parse.cosmas.c2ps_opREG and
+ * de.ids_mannheim.korap.query.serialize.Cosmas2QueryProcessor. 24.10.23/FB
+ */
+
+public final class StringUtils {
+
+    private static final boolean DEBUG = false;
+
+    /** Utility class with static methods only: not meant to be instantiated. */
+    private StringUtils () { }
+
+    /**
+     * Deletes every {@code "} in {@code buf} that is not directly preceded by a
+     * backslash; an escaped {@code \"} is kept together with its backslash.
+     *
+     * <p>Ad hoc alternative to {@code String.replaceAll()}, first intended for
+     * processOPREG(), where replaceAll() could not be used.
+     * Only the single char before the quote is inspected, so the quote in
+     * {@code \\"} counts as escaped, too.
+     * 25.09.23/FB
+     *
+     * @param buf the string to clean; must not be {@code null}
+     * @return a copy of {@code buf} without its unescaped double quotes
+     */
+    public static String replaceIfNotEscaped(String buf)
+
+    {
+        StringBuffer sb = new StringBuffer(buf);
+
+        for(int i=0; i<sb.length(); i++)
+        {
+            if( sb.codePointAt(i) == '"' && (i==0 || sb.codePointBefore(i) != '\\') )
+            {
+                sb.deleteCharAt(i);
+                i--; // the next char moved to position i: look at it in the next round.
+            }
+        }
+
+        return sb.toString();
+
+    } // replaceIfNotEscaped
+
+    /**
+     * Removes the double quotes of {@code buf} for use as a #REG(expr) argument:
+     * <ul>
+     * <li>an unescaped {@code "} is deleted;</li>
+     * <li>an escaped {@code \"} loses its backslash and becomes {@code "};</li>
+     * <li>{@code \\} is kept unchanged.</li>
+     * </ul>
+     * E.g. {@code "\"Abend\"-Ticket"} becomes {@code "Abend"-Ticket}.
+     * 26.09.23/FB
+     *
+     * @param buf the string to convert; must not be {@code null}
+     * @return the converted copy of {@code buf}
+     */
+    public static String replaceDoubleQuotes(String buf)
+
+    {
+        StringBuffer sb = new StringBuffer(buf);
+
+        if( DEBUG ) System.out.printf("replaceDoubleQuotes: input: >>%s<<.\n", buf);
+
+        for(int i=0; i<sb.length(); i++)
+        {
+            if( sb.codePointAt(i) == '\\' )
+            {
+                if( i+1 < sb.length() )
+                {
+                    if( sb.codePointAt(i+1) == '"') // >>\"<< -> >>"<<.
+                        sb.deleteCharAt(i); // i++ of the loop then skips the kept >>"<<.
+                    else if( sb.codePointAt(i+1) == '\\' )
+                        i++; // keep >>\\<< unchanged.
+                }
+            }
+            else if( sb.codePointAt(i) == '"' )
+            {
+                sb.deleteCharAt(i); // unescaped >>"<< is removed.
+                i--;
+            }
+        }
+
+        if( DEBUG ) System.out.printf("replaceDoubleQuotes: output: >>%s<<.\n", sb.toString());
+
+        return sb.toString();
+
+    } // replaceDoubleQuotes
+
+    /**
+     * Transforms the unquoted string in {@code sb} in place into a double quoted
+     * string: every {@code "} and every {@code \} gets a {@code \} put in front
+     * of it, then the whole buffer is enclosed in {@code "..."}.
+     * <pre>
+     * .."..   becomes  "..\".."
+     * ..\..   becomes  "..\\.."
+     * ..\"..  becomes  "..\\\".."
+     * ..\     becomes  "..\\"
+     * </pre>
+     * 28.09.23/FB
+     *
+     * @param sb the buffer to encode in place; must not be {@code null}
+     */
+    public static void encode2DoubleQuoted(StringBuffer sb)
+
+    {
+        if( DEBUG ) System.out.printf("encode2DoubleQuoted: input = >>%s<<.\n", sb.toString());
+
+        // The bound is sb.length(), not sb.length()-1: the last char has to be
+        // escaped as well, or a trailing >>"<< or >>\<< would corrupt the closing quote.
+        for(int i=0; i<sb.length(); i++)
+        {
+            char c = sb.charAt(i);
+
+            if( c == '\\' || c == '"' )
+            {
+                sb.insert(i, '\\');
+                i++; // step over the char that has just been escaped.
+            }
+        }
+
+        // enclose reg. expr. with "..." before returning:
+        sb.insert(0, '"');
+        sb.append('"');
+
+        if( DEBUG ) System.out.printf("encode2DoubleQuoted: output = >>%s<<.\n", sb.toString());
+    } // encode2DoubleQuoted
+
+    /**
+     * Removes all leading and trailing blanks (U+0020 only) from {@code sb} in place,
+     * e.g. >>  abc  << becomes >>abc<<. Blanks inside the text are kept.
+     * 28.09.23/FB
+     *
+     * @param sb the buffer to trim in place; must not be {@code null}
+     */
+    public static void removeBlanksAtBothSides(StringBuffer sb)
+
+    {
+        int start = 0;
+        int end   = sb.length();
+
+        // skip trailing blanks: >>abc  << -> >>abc<<:
+        while( end > 0 && sb.charAt(end-1) == ' ' )
+            end--;
+
+        // skip leading blanks: >>  abc<< -> >>abc<<:
+        while( start < end && sb.charAt(start) == ' ' )
+            start++;
+
+        // one delete per side instead of one deleteCharAt() per blank:
+        sb.delete(end, sb.length());
+        sb.delete(0, start);
+
+    } // removeBlanksAtBothSides
+
+}
diff --git a/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java b/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java
index 0722c9b..759810f 100644
--- a/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java
+++ b/src/test/java/de/ids_mannheim/korap/test/cosmas2/Cosmas2QueryProcessorTest.java
@@ -14,12 +14,15 @@
import static org.junit.Assert.*;
+import static de.ids_mannheim.korap.query.parse.cosmas.c2ps_opREG.*;
+import de.ids_mannheim.korap.util.StringUtils;
/**
* Tests for JSON-LD serialization of Cosmas II queries.
*
* @author Joachim Bingel (bingel@ids-mannheim.de)
* @author Nils Diewald
- * @version 1.1
+ * @author Franck Bodmer
+ * @version 1.2 - 21.09.23
*/
public class Cosmas2QueryProcessorTest {
@@ -1702,4 +1705,224 @@
assertEquals("s", res.at("/query/distances/0/key").asText());
assertEquals("operation:sequence", res.at("/query/operation").asText());
}
+
+ /* Testing #REG(expr), #REG('expr') and #REG("expr"): checks the generated koral:term
+ * (key, layer, type, match) and that an empty argument fails to parse. 21.09.23/FB
+ */
+
+ @Test
+ public void testREG () throws JsonProcessingException, IOException {
+
+ boolean debug = false;
+
+ query = "#REG(^aber$)";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("koral:token", res.at("/query/@type").asText());
+ assertEquals("koral:term", res.at("/query/wrap/@type").asText());
+ assertEquals("^aber$", res.at("/query/wrap/key").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("match:eq", res.at("/query/wrap/match").asText());
+
+ query = "#REG('été\\'')"; // user input = #REG('été\'').
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("été'" , res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG('été\' )"; // in Java source \' is just ': user input = #REG('été' ).
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("été" , res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG('été\\')"; // user input = #REG('été\').
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("été\\", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG(l'été)";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("l'été", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG(l\\'été)"; // user input = #REG(l\'été).
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("l'été", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG(\"l'été\")";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("l'été", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG(\"l\\'été\")";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("l'été", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG('l\\'été.*')";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("l'été.*", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG('\\\"été\\\"$')"; // means user input is #REG('\"été\"$').
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("\"été\"$", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ // checks the >>"<<:
+ query = "#REG(\\\"Abend\\\"-Ticket)"; // means user input = #REG(\"Abend\"-Ticket).
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG('\\\"Abend\\\"-Ticket')"; // means user input = #REG('\"Abend\"-Ticket').
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG('\"Abend\"-Ticket')"; // means user input = #REG('"Abend"-Ticket').
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText()); // the key keeps the >>"<< unescaped.
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG(\"\\\"Abend\\\"-Ticket\")"; // means user input = #REG("\"Abend\"-Ticket") -> key: >>"Abend"-Ticket<<.
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ if( debug ) System.out.printf("testREG: query: >>%s<< -> key: >>%s<<.\n", query, res.at("/query/wrap/key").asText());
+ assertEquals("\"Abend\"-Ticket",res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+ // regular expression meta characters inside quotes:
+
+ query = "#REG('^(a|b)?+*$')";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ assertEquals("^(a|b)?+*$", res.at("/query/wrap/key").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+
+ query = "#REG(\"[A-Z()]\")";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ assertEquals("[A-Z()]", res.at("/query/wrap/key").asText());
+ assertEquals("orth", res.at("/query/wrap/layer").asText());
+ assertEquals("type:regex", res.at("/query/wrap/type").asText());
+
+ query = "#REG(^klein.*) /s0 #REG(A.*ung)";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ //System.out.printf("Debug: res: pretty: %s.\n", res.toPrettyString());
+
+ assertEquals("^klein.*", res.at("/query/operands/0/operands/0/wrap/key").asText());
+ assertEquals("orth", res.at("/query/operands/0/operands/0/wrap/layer").asText());
+ assertEquals("type:regex", res.at("/query/operands/0/operands/0/wrap/type").asText());
+
+ assertEquals("A.*ung", res.at("/query/operands/1/operands/0/wrap/key").asText());
+ assertEquals("orth", res.at("/query/operands/1/operands/0/wrap/layer").asText());
+ assertEquals("type:regex", res.at("/query/operands/1/operands/0/wrap/type").asText());
+
+ query = "#REG( ) "; // empty arguments must fail to parse.
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ assertTrue(res.toString().contains("Failing to parse"));
+
+ query = "#REG('' ) ";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ assertTrue(res.toString().contains("Failing to parse"));
+
+ query = "#REG(\"\") ";
+ qs.setQuery(query, "cosmas2");
+ res = mapper.readTree(qs.toJSON());
+
+ assertTrue(res.toString().contains("Failing to parse"));
+
+ }
+ // encode2DoubleQuoted(): >>"<< and >>\<< inside the text get escaped and the result is enclosed in >>"..."<<.
+ @Test
+ public void testREGencode2DoubleQuoted () {
+ StringBuffer sb = new StringBuffer("..\"..");
+ StringUtils.encode2DoubleQuoted(sb);
+ assertEquals("\"..\\\"..\"",sb.toString());
+
+ sb = new StringBuffer("..\\..");
+ StringUtils.encode2DoubleQuoted(sb);
+ assertEquals("\"..\\\\..\"", sb.toString());
+
+ sb = new StringBuffer("..\".."); // same input and expectation as the first case.
+ StringUtils.encode2DoubleQuoted(sb);
+ assertEquals("\"..\\\"..\"", sb.toString());
+ }
+ // removeBlanksAtBothSides(): leading and trailing blanks are removed, inner blanks are kept.
+ @Test
+ public void testREGremoveBlanksAtBothSides () {
+ StringBuffer sb = new StringBuffer(" aabc cjs ss ");
+ StringUtils.removeBlanksAtBothSides(sb);
+ assertEquals("aabc cjs ss",sb.toString());
+
+ sb = new StringBuffer("abc "); // trailing blanks only.
+ StringUtils.removeBlanksAtBothSides(sb);
+ assertEquals("abc",sb.toString());
+
+ sb = new StringBuffer(" abc"); // leading blanks only.
+ StringUtils.removeBlanksAtBothSides(sb);
+ assertEquals("abc",sb.toString());
+ }
}