| Joachim Bingel | 6003b85 | 2014-12-18 14:20:55 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.parse.cosmas; |
| 2 | |
| 3 | import java.io.*; |
| 4 | import org.antlr.runtime.*; |
| 5 | import org.antlr.runtime.debug.DebugEventSocketProxy; |
| 6 | import org.antlr.runtime.tree.*; |
| 7 | |
| 8 | /* |
| 9 | * parses prefixed and suffixed options of a search wordform. |
| 10 | * E.g. :fi:Hendrix:sa/-pe. |
| 11 | */ |
| 12 | |
| 13 | public class c2ps_opWF |
| 14 | |
| 15 | { |
| 16 | /* Arguments: |
| 17 | * bStrip: true: 'input' contains "wort" -> strip " away -> wort. |
| 18 | * false: 'input' contains no " -> nothing to strip. |
| 19 | * bLem: true: input contains a Lemma; generates tree ^(OPLEM...). |
| 20 | * false: input contains a Wordform; generates tree ^(OPWF...). |
| 21 | * input: may be a single Lemma or Wform or a list of Wforms. |
| 22 | */ |
| 23 | |
| 24 | public static Tree check(String input, boolean bStrip, boolean bLem, int index) |
| 25 | { |
| 26 | if( bStrip ) |
| 27 | input = input.substring(1, input.length()-1); |
| 28 | |
| 29 | if( bLem && input.charAt(0) == '&' ) |
| 30 | { |
| 31 | input = input.substring(1, input.length()); |
| 32 | //System.out.println("Lemma: strip '&' -> " + input); |
| 33 | } |
| 34 | |
| 35 | ANTLRStringStream |
| 36 | ss = new ANTLRStringStream(input); |
| 37 | c2ps_opWFLexer |
| 38 | lex = new c2ps_opWFLexer(ss); |
| 39 | CommonTokenStream tokens = |
| 40 | new CommonTokenStream(lex); |
| 41 | c2ps_opWFParser |
| 42 | g = new c2ps_opWFParser(tokens); |
| 43 | c2ps_opWFParser.searchWFs_return |
| 44 | c2PQWFReturn = null; |
| 45 | c2ps_opWFParser.searchLEM_return |
| 46 | c2PQLEMReturn = null; |
| 47 | |
| 48 | /* |
| 49 | System.out.println("check opWF:" + index + ": " + input); |
| 50 | System.out.flush(); |
| 51 | */ |
| 52 | |
| 53 | try |
| 54 | { |
| 55 | if( bLem ) |
| 56 | c2PQLEMReturn = g.searchLEM(); |
| 57 | else |
| 58 | c2PQWFReturn = g.searchWFs(); |
| 59 | } |
| 60 | catch (RecognitionException e) |
| 61 | { |
| 62 | e.printStackTrace(); |
| 63 | } |
| 64 | |
| 65 | // AST Tree anzeigen: |
| 66 | Tree tree = bLem ? (Tree)c2PQLEMReturn.getTree() : (Tree)c2PQWFReturn.getTree(); |
| 67 | // System.out.println(bLem? "opLEM: " : "opWF: " + tree.toStringTree() ); |
| 68 | |
| 69 | return tree; |
| 70 | } |
| 71 | |
| 72 | /* Wordform Encoding, e.g. to insert a Wordform into an AST. |
| 73 | * a) wf -> "wf". |
| 74 | * b) remove escape char before ':': abc\: -> abc:. |
| 75 | * Returns a Tree. |
| 76 | */ |
| 77 | public static Tree encode(String wf, int tokenType) |
| 78 | |
| 79 | { |
| 80 | // b) |
| 81 | StringBuffer |
| 82 | sbWF = new StringBuffer(wf); |
| 83 | |
| 84 | for(int i=0; i<sbWF.length()-1; i++) |
| 85 | { |
| 86 | if( sbWF.charAt(i) == '\\' && sbWF.charAt(i+1) == ':' ) |
| 87 | sbWF.deleteCharAt(i); |
| 88 | } |
| 89 | |
| 90 | return new CommonTree(new CommonToken(tokenType, "\"" + sbWF.toString() + "\"")); |
| 91 | } |
| 92 | |
| 93 | /* |
| 94 | * main testprogram: |
| 95 | */ |
| 96 | |
| 97 | public static void main(String args[]) throws Exception |
| 98 | { |
| 99 | String[] |
| 100 | input = {":fi:Hendrix:sa", ":FiOlDs:été:sa", "&Gitarre", "&Gitarre:sa/-pe", |
| 101 | " \"Institut für \\:Deutsche\\: Sprache\" ", |
| 102 | ":Fi:der:-sa Wilde:-se Western:/se" }; |
| 103 | Tree |
| 104 | tree; |
| 105 | boolean |
| 106 | bLem; |
| 107 | |
| 108 | System.out.println("Tests von WF und Lemma-Optionen:\n"); |
| 109 | |
| 110 | for(int i=0; i<input.length; i++) |
| 111 | { |
| 112 | bLem = input[i].charAt(0) == '&' ? true : false; |
| 113 | |
| 114 | System.out.println(bLem? "LEM: " : "WF: " + "input: " + input[i]); |
| 115 | |
| 116 | if( bLem ) |
| 117 | tree = check(input[i], false, true, 0); // bStrip=false, bLem=true; |
| 118 | else |
| 119 | tree = check(input[i], false, false, 0); // bStrip=false, bLem=false. |
| 120 | |
| 121 | System.out.println(bLem? "LEM: " : "WF: " + "AST : " + tree.toStringTree() + "\n"); |
| 122 | } |
| 123 | |
| 124 | } // main |
| 125 | |
| 126 | } |