src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opWF.java - KorAP/Koral - Gitiles

 package de.ids_mannheim.korap.query.parse.cosmas;

 import org.antlr.runtime.*;
 import org.antlr.runtime.tree.*;
 import de.ids_mannheim.korap.query.parse.cosmas.c2ps_opPROXLexer;
 import de.ids_mannheim.korap.query.parse.cosmas.c2ps_opPROX;
 import de.ids_mannheim.korap.query.serialize.util.StatusCodes;

 /*
  * parses prefixed and suffixed options of a search wordform.
  * E.g. :fi:Hendrix:sa/-pe.
  */

 public class c2ps_opWF

 {
     static final boolean bDebug = false;

     /* check:
      * Runs the c2ps_opWF lexer/parser on 'input' and returns the resulting AST.
      * Arguments:
      * input : may be a single Lemma or Wform or a list of Wforms.
      * bStrip: true: 'input' contains "wort" -> strip " away -> wort
      *               (first and last character are removed).
      *         false: 'input' contains no " -> nothing to strip.
      * bLem  : true: input contains a Lemma; a leading '&' is removed and
      *               parser rule searchLEM is applied (tree ^(OPLEM...)).
      *         false: input contains a Wordform; parser rule searchWFs is
      *               applied (tree ^(OPWF...)).
      * pos   : handed over unchanged to the parser rule.
      * Returns the tree delivered by the applied parser rule.
      * Throws IllegalStateException (cause: the RecognitionException) if the
      * parser rule itself throws, instead of failing later with a
      * NullPointerException that hides the real reason.
      */

     public static Tree check (String input, boolean bStrip, boolean bLem, int pos)
     {
         // only strip if there is room for an opening and a closing quote.
         if (bStrip && input.length() >= 2) {
             input = input.substring(1, input.length() - 1);
         }

         // startsWith() instead of charAt(0): an empty input must not raise an index error.
         if (bLem && input.startsWith("&")) {
             input = input.substring(1);
         }

         c2ps_opWFLexer lex = new c2ps_opWFLexer(new ANTLRStringStream(input));
         c2ps_opWFParser g = new c2ps_opWFParser(new CommonTokenStream(lex));
         Tree tree;

         try {
             if (bLem) {
                 tree = (Tree) g.searchLEM(pos).getTree();
             }
             else {
                 tree = (Tree) g.searchWFs(pos).getTree();
             }
         }
         catch (RecognitionException e) {
             // no rule result exists in this case, so there is no tree to return.
             throw new IllegalStateException("c2ps_opWF.check: cannot parse '" + input + "'", e);
         }

         // show the AST (debug output is restricted to lemma input):
         if (bDebug && bLem) {
             System.out.printf("c2ps_opWF.check: %s: '%s'.\n", bLem ? "opLEM" : "opWF",
                     tree.toStringTree());
             System.out.flush();
         }

         return tree;
     }


     /* Wordform Encoding, e.g. to insert a Wordform into an AST.
      * a) wf ->  "wf".
      * b) remove escape char before ':': abc\: -> abc:.
      * Args:
      * wf        : wordform or lemma (expected lemma : "lemma" or "opts&lemma",
      *             the starting '&' has been removed before entering this function).
      * tokenType : either OPWF or OPLEM.
      * pos       : start position of wf.
      * Notes:
      *  - &opts&lemma : may contain wildcards as options in the &opts& section only.
      *    reject if wildcards appear in the &lemma section.
      * Returns a Tree holding a single token of type 'tokenType' whose text is the
      * unescaped wf enclosed in double quotes, or the ErrorTree built by
      * StatusCodes.buildErrorTree(wf, ERR_LEM_WILDCARDS, pos).
      */
     public static Tree encode (String wf, int tokenType, int pos)

     {
         // b) "\:" cannot overlap with itself, so one literal replace gives the same
         //    result as deleting every escaping backslash one by one.
         String unescaped = wf.replace("\\:", ":");

         // reject wildcards in lemmata:
         if (tokenType == c2ps_opWFLexer.OPLEM && hasWildcardInLemma(unescaped)) {
             if (bDebug) {
                 System.out.printf("c2ps_opWF.encode: Syntax error: '%s' contains wildcards inside lemma expression!\n", wf);
             }
             return StatusCodes.buildErrorTree(wf, StatusCodes.ERR_LEM_WILDCARDS, pos);
         }

         // a)
         return new CommonTree(new CommonToken(tokenType, "\"" + unescaped + "\""));
     }

     /* Tells whether one of the wildcards '?', '*' or '+' occurs behind the last '&'
      * of 'lemma', i.e. outside of the options section; without any '&' the whole
      * string is inspected.
      */
     private static boolean hasWildcardInLemma (String lemma)
     {
         // lastIndexOf() returns -1 if there is no '&', so the scan starts at index 0.
         for (int i = lemma.lastIndexOf('&') + 1; i < lemma.length(); i++) {
             char c = lemma.charAt(i);

             if (c == '?' || c == '*' || c == '+') {
                 return true;
             }
         }

         return false;
     }

     /*
      *                         main testprogram
      * Parses some sample queries and prints each input followed by its AST.
      */

     public static void main (String[] args) throws Exception {
         String[] input = { ":fi:Hendrix:sa", ":FiOlDs:été:sa", "&Gitarre",
                 "&Gitarre:sa/-pe", " \"Institut für \\:Deutsche\\: Sprache\" ",
                 ":Fi:der:-sa Wilde:-se Western:/se" };

         System.out.println("Tests von WF und Lemma-Optionen:\n");

         for (String query : input) {
             boolean bLem = query.charAt(0) == '&';

             // the label is selected first: '+' binds tighter than '?:', so an inline
             // ternary would print nothing but "LEM: " for lemma input.
             String label = bLem ? "LEM: " : "WF: ";

             System.out.println(label + "input: " + query);

             Tree tree = check(query, false, bLem, 0); // bStrip=false.

             System.out.println(label + "AST  : " + tree.toStringTree() + "\n");
         }

     } // main

 }
	package de.ids_mannheim.korap.query.parse.cosmas;

	import org.antlr.runtime.*;
	import org.antlr.runtime.tree.*;
	import de.ids_mannheim.korap.query.parse.cosmas.c2ps_opPROXLexer;
	import de.ids_mannheim.korap.query.parse.cosmas.c2ps_opPROX;
	import de.ids_mannheim.korap.query.serialize.util.StatusCodes;

	/*
	* parses prefixed and suffixed options of a search wordform.
	* E.g. :fi:Hendrix:sa/-pe.
	*/

	public class c2ps_opWF

	{
	static final boolean bDebug = false;

	/* check:
	* Arguments:
	* bStrip: true: 'input' contains "wort" -> strip " away -> wort.
	* false: 'input' contains no " -> nothing to strip.
	* bLem: true: input contains a Lemma; generates tree ^(OPLEM...).
	* false: input contains a Wordform; generates tree ^(OPWF...).
	* input: may be a single Lemma or Wform or a list of Wforms.
	*/

	public static Tree check (String input, boolean bStrip, boolean bLem, int pos)
	{
	if (bStrip)
	input = input.substring(1, input.length() - 1);

	if (bLem && input.charAt(0) == '&') {
	input = input.substring(1, input.length());
	//System.out.println("Lemma: strip '&' -> " + input);
	}

	ANTLRStringStream ss = new ANTLRStringStream(input);
	c2ps_opWFLexer lex = new c2ps_opWFLexer(ss);
	CommonTokenStream tokens = new CommonTokenStream(lex);
	c2ps_opWFParser g = new c2ps_opWFParser(tokens);
	c2ps_opWFParser.searchWFs_return c2PQWFReturn = null;
	c2ps_opWFParser.searchLEM_return c2PQLEMReturn = null;

	/*
	System.out.println("check opWF:" + index + ": " + input);
	System.out.flush();
	*/

	try {
	if (bLem)
	c2PQLEMReturn = g.searchLEM(pos);
	else
	c2PQWFReturn = g.searchWFs(pos);
	}
	catch (RecognitionException e) {
	e.printStackTrace();
	}

	// AST Tree anzeigen:
	Tree tree = bLem ? (Tree)c2PQLEMReturn.getTree() : (Tree)c2PQWFReturn.getTree();

	if( bDebug && bLem )
	{
	System.out.printf("c2ps_opWF.check: %s: '%s'.\n", bLem ? "opLEM" : "opWF",
	tree.toStringTree() );
	System.out.flush();
	}

	return tree;
	}


	/* Wordform Encoding, e.g. to insert a Wordform into an AST.
	* a) wf -> "wf".
	* b) remove escape char before ':': abc\: -> abc:.
	* Args:
	* wf : wordform or lemma (expected lemma : "lemma" or "opts&lemma",
	* the starting '&' has been removed before entering this function).
	* tokenType : either OPWF or OPLEM.
	* pos : start position of wf.
	* Notes:
	* - &opts&lemma : may contain wildcards as options in the &opts& section only.
	* reject if wildcards appear in the &lemma section.
	* Returns a Tree or an ErrorTree.
	*/
	public static Tree encode (String wf, int tokenType, int pos)

	{
	//System.out.printf("c2ps_opWF.encode: wf='%s' tokenType=%d pos=%d.\n", wf, tokenType, pos);

	// b)
	StringBuffer sbWF = new StringBuffer(wf);

	for (int i = 0; i < sbWF.length()-1; i++)
	{
	if (sbWF.charAt(i) == '\\' && sbWF.charAt(i + 1) == ':')
	sbWF.deleteCharAt(i);
	}

	// reject wildcards in lemmata:

	if( tokenType == c2ps_opWFLexer.OPLEM )
	{
	boolean hasOpts = false; // true if a '&' occurs: e.g. "Fes+C&lemma"
	boolean hasFound = false; // false for all wildcards found to the left of '&', true in all other cases.

	for(int i=0; i< sbWF.length(); i++)
	{
	if( sbWF.charAt(i) == '&' )
	{
	hasOpts = true;
	hasFound = false;
	}
	else if (sbWF.charAt(i) == '?' \|\| sbWF.charAt(i) == '*' \|\| sbWF.charAt(i) == '+' )
	{
	hasFound = true;
	}
	}

	// error if hasFound==true:
	if( hasFound )
	{
	if( bDebug )
	System.out.printf("c2ps_opWF.encode: Syntax error: '%s' contains wildcards inside lemma expression!\n", wf);
	return StatusCodes.buildErrorTree(wf, StatusCodes.ERR_LEM_WILDCARDS, pos);
	}
	}

	return new CommonTree(new CommonToken(tokenType, "\"" + sbWF.toString() + "\""));
	}

	/*
	* main testprogram
	*/

	public static void main (String args[]) throws Exception {
	String[] input = { ":fi:Hendrix:sa", ":FiOlDs:été:sa", "&Gitarre",
	"&Gitarre:sa/-pe", " \"Institut für \\:Deutsche\\: Sprache\" ",
	":Fi:der:-sa Wilde:-se Western:/se" };
	Tree tree;
	boolean bLem;

	System.out.println("Tests von WF und Lemma-Optionen:\n");

	for (int i = 0; i < input.length; i++) {
	bLem = input[i].charAt(0) == '&' ? true : false;

	System.out.println(bLem ? "LEM: " : "WF: " + "input: " + input[i]);

	if (bLem)
	tree = check(input[i], false, true, 0); // bStrip=false, bLem=true;
	else
	tree = check(input[i], false, false, 0); // bStrip=false, bLem=false.

	System.out.println(bLem ? "LEM: " : "WF: " + "AST : "
	+ tree.toStringTree() + "\n");
	}

	} // main

	}