blob: 4ea14eb178ebdedefa5d2d9e471e5116fcc54b79 [file] [log] [blame]
package de.ids_mannheim.korap.query.serialize;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.tree.Tree;
import de.ids_mannheim.korap.query.cosmas2.c2psLexer;
import de.ids_mannheim.korap.query.cosmas2.c2psParser;
/**
* Map representation of CosmasII syntax tree as returned by ANTLR
* @author joachim
*
*/
public class CosmasTree extends AbstractSyntaxTree {
private static c2psParser cosmasParser;
/*
* Following collections have the following functions:
* - the request is a map with two keys (meta/query): {meta=[], query=[]}
* - the query is a list of token group maps: {meta=[], query=[tg1=[], tg2=[]]}
* - each token group is a list of tokens: {meta=[], query=[tg1=[t1_1, t1_2], tg2=[t2_1, t2_2, t2_3]]}
* - each token corresponds to a single 'fields' linked list {meta=[], query=[tg1=[t1_1=[], t1_2=[]], ... ]}
* - each fields list contains a logical operator and 'field maps' defining attributes and values
* {meta=[], query=[tg1=[t1_1=[[disj, {base=foo}, {base=bar}]], t1_2=[]], ... ]}
*/
String query;
LinkedHashMap<String,Object> requestMap = new LinkedHashMap<String,Object>();
LinkedHashMap<String,Object> queryMap = new LinkedHashMap<String,Object>();
LinkedHashMap<String,Object> tokenGroup = new LinkedHashMap<String,Object>();
ArrayList<Object> fieldGroup = new ArrayList<Object>();
LinkedHashMap<String,Object> fieldMap;
ArrayList<List<Object>> distantTokens;
/**
* Makes it possible to store several distantTokenGroups
*/
LinkedList<ArrayList<List<Object>>> distantTokensStack = new LinkedList<ArrayList<List<Object>>>();
/**
* Field for repetition query (Kleene + or * operations, or min/max queries: {2,4}
*/
String repetition = "";
int tokenCount=0;
int tokenGroupCount=0;
/**
* Keeps track of open node categories
*/
LinkedList<String> openNodeCats = new LinkedList<String>();
/**
* Global control structure for fieldGroups, keeps track of open fieldGroups.
*/
LinkedList<ArrayList<Object>> openFieldGroups = new LinkedList<ArrayList<Object>>();
/**
* Global control structure for tokenGroups, keeps track of open tokenGroups.
*/
LinkedList<LinkedHashMap<String,Object>> tokenGroupsStack = new LinkedList<LinkedHashMap<String,Object>>();
/**
* Flag that indicates whether token fields or meta fields are currently being processed
*/
boolean inMeta = false;
boolean negate = false;
Tree cosmasTree;
LinkedHashMap<String,Object> treeMap = new LinkedHashMap<String,Object>();
/**
* Keeps track of all visited nodes in a tree
*/
List<Tree> visited = new ArrayList<Tree>();
/**
*
* @param tree The syntax tree as returned by ANTLR
* @param parser The ANTLR parser instance that generated the parse tree
*/
public CosmasTree(String query) {
this.query = query;
process(query);
System.out.println(requestMap);
}
@Override
public Map<String, Object> getRequestMap() {
return this.requestMap;
}
@Override
public void process(String query) {
Tree tree = parseCosmasQuery(query);
System.out.println("Processing Cosmas");
processNode(tree);
}
private void processNode(Tree node) {
// Top-down processing
if (visited.contains(node)) return;
else visited.add(node);
String nodeCat = getNodeCat(node);
openNodeCats.push(nodeCat);
System.out.println(openNodeCats);
System.out.println(distantTokensStack);
/* ***************************************
* Processing individual node categories *
*****************************************/
// C2QP is tree root
if (nodeCat.equals("C2PQ")) {
queryMap = new LinkedHashMap<String,Object>();
requestMap.put("query", queryMap);
}
// Nodes introducing tokens. Process all in the same manner, except for the fieldMap entry
if (nodeCat.equals("OPWF") || nodeCat.equals("OPLEM") || nodeCat.equals("OPMORPH")) {
if (tokenGroupsStack.isEmpty()) {
tokenGroup = new LinkedHashMap<String, Object>();
tokenCount=0;
tokenGroupCount++;
queryMap.put("tokenGroup"+tokenGroupCount, tokenGroup);
tokenGroupsStack.push(tokenGroup);
} else {
tokenGroup = tokenGroupsStack.getFirst();
}
// check if this token comes after a distant operator (like "/+w3:4") and if yes,
// insert the empty tokenGroups before the current token
if (openNodeCats.get(1).equals("ARG2")) {
if (openNodeCats.get(2).equals("OPPROX") && !distantTokensStack.isEmpty()) {
for (List<Object> distantTokenGroup : distantTokensStack.pop()) {
// if (tokenGroupsStack.isEmpty()) {
// queryMap.put("token"+tokenGroupCount+"_1", distantTokenGroup);
// } else {
tokenCount++;
tokenGroupsStack.getFirst().put("token"+tokenGroupCount+"_"+tokenCount, distantTokenGroup);
// }
// tokenGroupCount++;
}
}
// check negation of token by preceding OPNOT
// else if (openNodeCats.get(2).equals("OPNOT")) {
// negate = true;
// }
}
fieldGroup = new ArrayList<Object>();
tokenCount++;
tokenGroup.put("token"+tokenGroupCount+"_"+tokenCount, fieldGroup);
fieldMap = new LinkedHashMap<String, Object>();
fieldGroup.add(fieldMap);
// make category-specific fieldMap entry
if (nodeCat.equals("OPWF")) {
fieldMap.put("form", node.getChild(0).toStringTree());
}
if (nodeCat.equals("OPLEM")) {
fieldMap.put("lemma", node.getChild(0).toStringTree());
}
if (nodeCat.equals("OPMORPH")) {
fieldMap.put("morph", node.toStringTree());
//TODO decompose morphology query
}
// negate field (see above)
if (negate) {
fieldMap.put("relation", "!=");
}
// tokenGroupsStack.push(tokenGroup);
}
// negate every token that's under OPNOT > ARG2
if (nodeCat.equals("ARG2") && openNodeCats.get(1).equals("OPNOT")) {
negate = true;
}
if (nodeCat.equals("OPOR")) {
tokenGroup = new LinkedHashMap<String, Object>();
tokenCount=0;
tokenGroupCount++;
if (tokenGroupsStack.isEmpty()) {
queryMap.put("tokenGroup"+tokenGroupCount, tokenGroup);
} else {
tokenGroupsStack.getFirst().put("tokenGroup"+tokenGroupCount, tokenGroup);
}
tokenGroup.put("type", "disj");
tokenGroupsStack.push(tokenGroup);
}
if (nodeCat.equals("OPAND")) {
tokenGroup = new LinkedHashMap<String, Object>();
tokenCount=0;
tokenGroupCount++;
if (tokenGroupsStack.isEmpty()) {
queryMap.put("tokenGroup"+tokenGroupCount, tokenGroup);
} else {
tokenGroupsStack.getFirst().put("tokenGroup"+tokenGroupCount, tokenGroup);
}
tokenGroup.put("type", "conj");
tokenGroupsStack.push(tokenGroup);
}
if (nodeCat.equals("OPPROX")) {
distantTokens = new ArrayList<List<Object>>();
Tree prox_opts = node.getChild(0);
Tree typ = prox_opts.getChild(0);
System.err.println(typ.getChild(0).toStringTree());
Tree dist_list = prox_opts.getChild(1);
// get relevant information
String direction = dist_list.getChild(0).getChild(0).getChild(0).toStringTree();
String min = dist_list.getChild(0).getChild(1).getChild(0).toStringTree();
String max = dist_list.getChild(0).getChild(1).getChild(1).toStringTree();
if (min.equals("VAL0")) {
min=max;
}
// create empty tokens and put them on the stack to place them between arg1 and arg2
for (int i=0; i<Integer.parseInt(max)-1; i++) {
ArrayList<Object> emptyToken = new ArrayList<Object>();
LinkedHashMap<String,Object> emptyFieldMap = new LinkedHashMap<String,Object>();
emptyToken.add(emptyFieldMap);
tokenGroup.put("token"+tokenGroupCount+"_1", emptyToken);
// mark all tokens between min and max optional
if (i>=Integer.parseInt(min)) {
emptyFieldMap.put("optional", "true");
}
distantTokens.add(emptyToken);
}
distantTokensStack.push(distantTokens);
}
// System.err.println(tokenGroupsStack.size()+" "+tokenGroupsStack);
// recursion until 'query' node (root of tree) is processed
for (int i=0; i<node.getChildCount(); i++) {
Tree child = node.getChild(i);
processNode(child);
}
if (nodeCat.equals("ARG2") && openNodeCats.get(1).equals("OPNOT")) {
negate = false;
}
if (nodeCat.equals("OPAND") || nodeCat.equals("OPOR")) {
tokenGroupsStack.pop();
// tokenGroupCount--;
// tokenCount=0;
}
openNodeCats.pop();
}
/**
* Returns the category (or 'label') of the root of a ParseTree.
* @param node
* @return
*/
public String getNodeCat(Tree node) {
String nodeCat = node.toStringTree();
Pattern p = Pattern.compile("\\((.*?)\\s"); // from opening parenthesis to 1st whitespace
Matcher m = p.matcher(node.toStringTree());
if (m.find()) {
nodeCat = m.group(1);
}
return nodeCat;
}
private static Tree parseCosmasQuery(String p) {
Tree tree = null;
ANTLRStringStream
ss = new ANTLRStringStream(p);
c2psLexer
lex = new c2psLexer(ss);
org.antlr.runtime.CommonTokenStream tokens = //v3
new org.antlr.runtime.CommonTokenStream(lex);
cosmasParser = new c2psParser(tokens);
c2psParser.c2ps_query_return
c2Return = null;
try
{
c2Return = cosmasParser.c2ps_query(); // statt t().
}
catch (RecognitionException e)
{
e.printStackTrace();
}
// AST Tree anzeigen:
tree = (Tree)c2Return.getTree();
return tree;
}
/**
* @param args
*/
public static void main(String[] args) {
/*
* For testing
*/
String[] queries = new String[] {
/* COSMAS 2 */
// "&Mond",
// "Mond Sterne",
// "Mond*",
// "Mond oder Sterne",
// "(des oder eines) /+w2 (Bauern oder Bauers oder Bauerns)",
// "(Sonne /+w2 Mond) /+w2:3 Sterne",
// "Mond oder Sonne /w2 Sterne",
// "MORPH(V PCP)",
// "MORPH(V PCP) Baum" ,
// "Sonne %w2 Mond",
// "Sonne /w2 Mond",
// "Sonne nicht (Mond Stern)",
// "Sonne nicht (Mond oder Stern)",
// "Sonne /+w1:4 Mond",
"(sonne und mond) oder sterne",
"(stern oder (sonne und mond)) und MORPH(V PCP)",
"(sonne und (stern oder mond)) /+w2 luna???",
"(Tag /+w2 $offenen) /+w1 Tür",
"heißt /+w2 \"und\" ,"
};
for (String q : queries) {
try {
System.out.println(q);
System.out.println(parseCosmasQuery(q).toStringTree());
CosmasTree act = new CosmasTree(q);
System.out.println();
} catch (NullPointerException npe) {
npe.printStackTrace();
System.out.println("null\n");
}
}
}
}