| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap; |
| 2 | |
| 3 | import java.util.*; |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 4 | import java.io.*; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 5 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 6 | import static org.junit.Assert.*; |
| 7 | |
| Nils Diewald | 0339d46 | 2015-02-26 14:53:56 +0000 | [diff] [blame] | 8 | import de.ids_mannheim.korap.KrillQuery; |
| Nils Diewald | 8904c1d | 2015-02-26 16:13:18 +0000 | [diff] [blame] | 9 | import de.ids_mannheim.korap.query.QueryBuilder; |
| Nils Diewald | e4986d7 | 2015-02-27 17:35:00 +0000 | [diff] [blame] | 10 | import de.ids_mannheim.korap.index.*; |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 11 | import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper; |
| 12 | import de.ids_mannheim.korap.util.QueryException; |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 13 | import de.ids_mannheim.korap.util.CorpusDataException; |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 14 | |
| Nils Diewald | c383ed0 | 2015-02-26 21:35:22 +0000 | [diff] [blame] | 15 | import static de.ids_mannheim.korap.util.KrillByte.*; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 16 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 17 | import org.apache.lucene.index.*; |
| 18 | import org.apache.lucene.document.*; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 19 | import org.apache.lucene.search.spans.Spans; |
| 20 | import org.apache.lucene.search.spans.SpanQuery; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 21 | import org.apache.lucene.util.Bits; |
| 22 | |
| 23 | /** |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 24 | * Helper class for testing the KrillIndex framework (Simple). |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 25 | * |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 26 | * @author diewald |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 27 | */ |
| 28 | public class TestSimple { |
| 29 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 30 | // Add document |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 31 | public static void addDoc (IndexWriter w, Map<String, String> m) |
| 32 | throws IOException { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 33 | Document doc = new Document(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 34 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 35 | FieldType textFieldWithTermVectors = new FieldType( |
| 36 | TextField.TYPE_STORED); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 37 | textFieldWithTermVectors.setStoreTermVectors(true); |
| 38 | /* |
| 39 | No offsets are stored. |
| 40 | textFieldWithTermVectors.setStoreTermVectorOffsets(true); |
| 41 | */ |
| 42 | textFieldWithTermVectors.setStoreTermVectorPositions(true); |
| 43 | textFieldWithTermVectors.setStoreTermVectorPayloads(true); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 44 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 45 | Field textFieldAnalyzed = new Field("text", m.get("textStr"), |
| 46 | textFieldWithTermVectors); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 47 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 48 | MultiTermTokenStream ts = getTermVector(m.get("text")); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 49 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 50 | textFieldAnalyzed.setTokenStream(ts); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 51 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 52 | doc.add(textFieldAnalyzed); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 53 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 54 | // Add document to writer |
| 55 | w.addDocument(doc); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 56 | }; |
| 57 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 58 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 59 | // Get Term Vector |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 60 | public static MultiTermTokenStream getTermVector (String stream) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 61 | MultiTermTokenStream ts = new MultiTermTokenStream(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 62 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 63 | int pos = 0; |
| 64 | for (String seg : stream.split(" ")) { |
| 65 | // System.err.println("** Prepare " + seg); |
| 66 | String[] tokens = seg.split("\\|"); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 67 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 68 | int i = 0; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 69 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 70 | while (tokens[i].length() == 0) |
| 71 | i++; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 72 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 73 | try { |
| 74 | MultiTermToken mtt = new MultiTermToken(tokens[i]); |
| 75 | // System.err.println("** Add term " + tokens[i]); |
| 76 | i++; |
| 77 | for (; i < tokens.length; i++) { |
| 78 | if (tokens[i].length() == 0) |
| 79 | continue; |
| 80 | mtt.add(tokens[i]); |
| 81 | }; |
| 82 | ts.addMultiTermToken(mtt); |
| 83 | } |
| 84 | catch (CorpusDataException cde) { |
| 85 | fail(cde.getErrorCode() + ": " + cde.getMessage()); |
| 86 | }; |
| 87 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 88 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 89 | return ts; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 90 | }; |
| 91 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 92 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 93 | // Get query wrapper based on json file |
| 94 | public static SpanQueryWrapper getJSONQuery (String jsonFile) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 95 | SpanQueryWrapper sqwi; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 96 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 97 | try { |
| 98 | String json = getString(jsonFile); |
| Nils Diewald | 0339d46 | 2015-02-26 14:53:56 +0000 | [diff] [blame] | 99 | sqwi = new KrillQuery("tokens").fromJson(json); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 100 | } |
| 101 | catch (QueryException e) { |
| 102 | fail(e.getMessage()); |
| Nils Diewald | 8904c1d | 2015-02-26 16:13:18 +0000 | [diff] [blame] | 103 | sqwi = new QueryBuilder("tokens").seg("???"); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 104 | }; |
| 105 | return sqwi; |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 106 | }; |
| 107 | |
| 108 | |
| 109 | // Get string |
| 110 | public static String getString (String path) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 111 | StringBuilder contentBuilder = new StringBuilder(); |
| 112 | try { |
| 113 | BufferedReader in = new BufferedReader(new FileReader(path)); |
| 114 | String str; |
| 115 | while ((str = in.readLine()) != null) { |
| 116 | contentBuilder.append(str); |
| 117 | }; |
| 118 | in.close(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 119 | } |
| 120 | catch (IOException e) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 121 | fail(e.getMessage()); |
| 122 | } |
| 123 | return contentBuilder.toString(); |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 124 | }; |
| 125 | |
| 126 | |
| 127 | // getSpan Info |
| 128 | public static List<String> getSpanInfo (IndexReader reader, SpanQuery query) |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 129 | throws IOException { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 130 | Map<Term, TermContext> termContexts = new HashMap<>(); |
| 131 | List<String> spanArray = new ArrayList<>(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 132 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 133 | for (AtomicReaderContext atomic : reader.leaves()) { |
| 134 | Bits bitset = atomic.reader().getLiveDocs(); |
| 135 | // Spans spans = NearSpansOrdered(); |
| 136 | Spans spans = query.getSpans(atomic, bitset, termContexts); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 137 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 138 | while (spans.next()) { |
| 139 | StringBuffer payloadString = new StringBuffer(); |
| 140 | int docid = atomic.docBase + spans.doc(); |
| 141 | if (spans.isPayloadAvailable()) { |
| 142 | for (byte[] payload : spans.getPayload()) { |
| 143 | /* retrieve payload for current matching span */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 144 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 145 | payloadString.append(byte2int(payload)).append(","); |
| 146 | payloadString.append(byte2int(payload, 2)); |
| 147 | // payloadString.append(byte2int(payload, 1)); |
| 148 | payloadString.append(" (" + payload.length + ")"); |
| 149 | payloadString.append(" | "); |
| 150 | }; |
| 151 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 152 | spanArray.add("Doc: " + docid + " with " + spans.start() + "-" |
| 153 | + spans.end() + " || " + payloadString.toString()); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 154 | }; |
| 155 | }; |
| 156 | return spanArray; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 157 | }; |
| 158 | }; |