// blob: abda3c8e2b71a745c7502907f112b051fe992f30 [file] [log] [blame]
package de.ids_mannheim.korap;

import java.util.*;
import java.io.*;
import java.nio.charset.StandardCharsets;

import static org.junit.Assert.*;

import de.ids_mannheim.korap.KrillQuery;
import de.ids_mannheim.korap.query.QueryBuilder;
import de.ids_mannheim.korap.index.*;
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
import de.ids_mannheim.korap.util.QueryException;
import de.ids_mannheim.korap.util.CorpusDataException;

import static de.ids_mannheim.korap.util.KrillByte.*;

import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.Bits;

23/**
Nils Diewalda14ecd62015-02-26 21:00:20 +000024 * Helper class for testing the KrillIndex framework (Simple).
Nils Diewaldbb33da22015-03-04 16:24:25 +000025 *
Nils Diewald5c375702015-02-09 20:58:24 +000026 * @author diewald
Nils Diewaldf399a672013-11-18 17:55:22 +000027 */
28public class TestSimple {
29
Nils Diewald8db8f922014-10-24 17:43:13 +000030 // Add document
Nils Diewaldbb33da22015-03-04 16:24:25 +000031 public static void addDoc (IndexWriter w, Map<String, String> m)
32 throws IOException {
Nils Diewald5c375702015-02-09 20:58:24 +000033 Document doc = new Document();
Nils Diewaldf399a672013-11-18 17:55:22 +000034
Nils Diewaldbb33da22015-03-04 16:24:25 +000035 FieldType textFieldWithTermVectors = new FieldType(
36 TextField.TYPE_STORED);
Nils Diewald5c375702015-02-09 20:58:24 +000037 textFieldWithTermVectors.setStoreTermVectors(true);
38 /*
39 No offsets are stored.
40 textFieldWithTermVectors.setStoreTermVectorOffsets(true);
41 */
42 textFieldWithTermVectors.setStoreTermVectorPositions(true);
43 textFieldWithTermVectors.setStoreTermVectorPayloads(true);
Nils Diewaldf399a672013-11-18 17:55:22 +000044
Nils Diewaldbb33da22015-03-04 16:24:25 +000045 Field textFieldAnalyzed = new Field("text", m.get("textStr"),
46 textFieldWithTermVectors);
Nils Diewaldf399a672013-11-18 17:55:22 +000047
Nils Diewald5c375702015-02-09 20:58:24 +000048 MultiTermTokenStream ts = getTermVector(m.get("text"));
Nils Diewaldf399a672013-11-18 17:55:22 +000049
Nils Diewaldbb33da22015-03-04 16:24:25 +000050 textFieldAnalyzed.setTokenStream(ts);
Nils Diewaldf399a672013-11-18 17:55:22 +000051
Nils Diewald5c375702015-02-09 20:58:24 +000052 doc.add(textFieldAnalyzed);
Nils Diewaldf399a672013-11-18 17:55:22 +000053
Nils Diewald5c375702015-02-09 20:58:24 +000054 // Add document to writer
55 w.addDocument(doc);
Nils Diewaldf399a672013-11-18 17:55:22 +000056 };
57
Nils Diewaldbb33da22015-03-04 16:24:25 +000058
Nils Diewald8db8f922014-10-24 17:43:13 +000059 // Get Term Vector
Nils Diewaldf399a672013-11-18 17:55:22 +000060 public static MultiTermTokenStream getTermVector (String stream) {
Nils Diewald5c375702015-02-09 20:58:24 +000061 MultiTermTokenStream ts = new MultiTermTokenStream();
Nils Diewaldf399a672013-11-18 17:55:22 +000062
Nils Diewald5c375702015-02-09 20:58:24 +000063 int pos = 0;
64 for (String seg : stream.split(" ")) {
65 // System.err.println("** Prepare " + seg);
66 String[] tokens = seg.split("\\|");
Nils Diewaldf399a672013-11-18 17:55:22 +000067
Nils Diewald5c375702015-02-09 20:58:24 +000068 int i = 0;
Nils Diewaldf399a672013-11-18 17:55:22 +000069
Nils Diewald5c375702015-02-09 20:58:24 +000070 while (tokens[i].length() == 0)
71 i++;
Nils Diewaldf399a672013-11-18 17:55:22 +000072
Nils Diewald5c375702015-02-09 20:58:24 +000073 try {
74 MultiTermToken mtt = new MultiTermToken(tokens[i]);
75 // System.err.println("** Add term " + tokens[i]);
76 i++;
77 for (; i < tokens.length; i++) {
78 if (tokens[i].length() == 0)
79 continue;
80 mtt.add(tokens[i]);
81 };
82 ts.addMultiTermToken(mtt);
83 }
84 catch (CorpusDataException cde) {
85 fail(cde.getErrorCode() + ": " + cde.getMessage());
86 };
87 };
Nils Diewaldbb33da22015-03-04 16:24:25 +000088
Nils Diewald5c375702015-02-09 20:58:24 +000089 return ts;
Nils Diewaldf399a672013-11-18 17:55:22 +000090 };
91
Nils Diewald5c375702015-02-09 20:58:24 +000092
Nils Diewald8db8f922014-10-24 17:43:13 +000093 // Get query wrapper based on json file
94 public static SpanQueryWrapper getJSONQuery (String jsonFile) {
Nils Diewald5c375702015-02-09 20:58:24 +000095 SpanQueryWrapper sqwi;
Nils Diewaldbb33da22015-03-04 16:24:25 +000096
Nils Diewald5c375702015-02-09 20:58:24 +000097 try {
98 String json = getString(jsonFile);
Nils Diewald0339d462015-02-26 14:53:56 +000099 sqwi = new KrillQuery("tokens").fromJson(json);
Nils Diewald5c375702015-02-09 20:58:24 +0000100 }
101 catch (QueryException e) {
102 fail(e.getMessage());
Nils Diewald8904c1d2015-02-26 16:13:18 +0000103 sqwi = new QueryBuilder("tokens").seg("???");
Nils Diewald5c375702015-02-09 20:58:24 +0000104 };
105 return sqwi;
Nils Diewald8db8f922014-10-24 17:43:13 +0000106 };
107
108
109 // Get string
110 public static String getString (String path) {
Nils Diewald5c375702015-02-09 20:58:24 +0000111 StringBuilder contentBuilder = new StringBuilder();
112 try {
113 BufferedReader in = new BufferedReader(new FileReader(path));
114 String str;
115 while ((str = in.readLine()) != null) {
116 contentBuilder.append(str);
117 };
118 in.close();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000119 }
120 catch (IOException e) {
Nils Diewald5c375702015-02-09 20:58:24 +0000121 fail(e.getMessage());
122 }
123 return contentBuilder.toString();
Nils Diewald8db8f922014-10-24 17:43:13 +0000124 };
125
126
127 // getSpan Info
128 public static List<String> getSpanInfo (IndexReader reader, SpanQuery query)
Nils Diewaldbb33da22015-03-04 16:24:25 +0000129 throws IOException {
Nils Diewald5c375702015-02-09 20:58:24 +0000130 Map<Term, TermContext> termContexts = new HashMap<>();
131 List<String> spanArray = new ArrayList<>();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000132
Nils Diewald5c375702015-02-09 20:58:24 +0000133 for (AtomicReaderContext atomic : reader.leaves()) {
134 Bits bitset = atomic.reader().getLiveDocs();
135 // Spans spans = NearSpansOrdered();
136 Spans spans = query.getSpans(atomic, bitset, termContexts);
Nils Diewaldf399a672013-11-18 17:55:22 +0000137
Nils Diewald5c375702015-02-09 20:58:24 +0000138 while (spans.next()) {
139 StringBuffer payloadString = new StringBuffer();
140 int docid = atomic.docBase + spans.doc();
141 if (spans.isPayloadAvailable()) {
142 for (byte[] payload : spans.getPayload()) {
143 /* retrieve payload for current matching span */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000144
Nils Diewald5c375702015-02-09 20:58:24 +0000145 payloadString.append(byte2int(payload)).append(",");
146 payloadString.append(byte2int(payload, 2));
147 // payloadString.append(byte2int(payload, 1));
148 payloadString.append(" (" + payload.length + ")");
149 payloadString.append(" | ");
150 };
151 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000152 spanArray.add("Doc: " + docid + " with " + spans.start() + "-"
153 + spans.end() + " || " + payloadString.toString());
Nils Diewald5c375702015-02-09 20:58:24 +0000154 };
155 };
156 return spanArray;
Nils Diewaldf399a672013-11-18 17:55:22 +0000157 };
158};