blob: 0243250e45c95d906a4584467bb7635dc4483eb9 [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap;
2
margaretha7f4fd652018-11-22 18:00:02 +01003import static de.ids_mannheim.korap.util.KrillByte.byte2int;
4import static org.junit.Assert.fail;
5
6import java.io.BufferedReader;
7import java.io.FileInputStream;
8import java.io.IOException;
9import java.io.InputStreamReader;
Eliza Margaretha805e27f2016-10-14 21:39:42 +020010import java.net.URLDecoder;
margaretha7f4fd652018-11-22 18:00:02 +010011import java.util.ArrayList;
12import java.util.HashMap;
13import java.util.List;
14import java.util.Map;
15import java.util.regex.Matcher;
16import java.util.regex.Pattern;
Nils Diewaldf399a672013-11-18 17:55:22 +000017
margaretha7f4fd652018-11-22 18:00:02 +010018import org.apache.lucene.document.Document;
19import org.apache.lucene.document.Field;
20import org.apache.lucene.document.FieldType;
21import org.apache.lucene.document.TextField;
22import org.apache.lucene.index.IndexReader;
23import org.apache.lucene.index.IndexWriter;
24import org.apache.lucene.index.LeafReaderContext;
25import org.apache.lucene.index.Term;
26import org.apache.lucene.index.TermContext;
Nils Diewaldf399a672013-11-18 17:55:22 +000027import org.apache.lucene.search.spans.SpanQuery;
margaretha7f4fd652018-11-22 18:00:02 +010028import org.apache.lucene.search.spans.Spans;
Nils Diewaldf399a672013-11-18 17:55:22 +000029import org.apache.lucene.util.Bits;
margaretha7d89c052017-05-10 19:11:45 +020030import org.slf4j.Logger;
31import org.slf4j.LoggerFactory;
Nils Diewaldf399a672013-11-18 17:55:22 +000032
margaretha7f4fd652018-11-22 18:00:02 +010033import de.ids_mannheim.korap.index.FieldDocument;
34import de.ids_mannheim.korap.index.MultiTermToken;
35import de.ids_mannheim.korap.index.MultiTermTokenStream;
36import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
37import de.ids_mannheim.korap.response.Result;
38import de.ids_mannheim.korap.util.CorpusDataException;
39import de.ids_mannheim.korap.util.QueryException;
40
Nils Diewaldf399a672013-11-18 17:55:22 +000041/**
Nils Diewalda14ecd62015-02-26 21:00:20 +000042 * Helper class for testing the KrillIndex framework (Simple).
Nils Diewaldbb33da22015-03-04 16:24:25 +000043 *
Nils Diewald5c375702015-02-09 20:58:24 +000044 * @author diewald
Nils Diewaldf399a672013-11-18 17:55:22 +000045 */
46public class TestSimple {
47
margaretha7d89c052017-05-10 19:11:45 +020048 private static Logger log = LoggerFactory.getLogger(TestSimple.class);
49
Nils Diewald8db8f922014-10-24 17:43:13 +000050 // Add document
Nils Diewaldbb33da22015-03-04 16:24:25 +000051 public static void addDoc (IndexWriter w, Map<String, String> m)
52 throws IOException {
Nils Diewald5c375702015-02-09 20:58:24 +000053 Document doc = new Document();
Nils Diewaldf399a672013-11-18 17:55:22 +000054
Nils Diewaldbb33da22015-03-04 16:24:25 +000055 FieldType textFieldWithTermVectors = new FieldType(
56 TextField.TYPE_STORED);
Nils Diewald5c375702015-02-09 20:58:24 +000057 textFieldWithTermVectors.setStoreTermVectors(true);
58 /*
59 No offsets are stored.
60 textFieldWithTermVectors.setStoreTermVectorOffsets(true);
61 */
62 textFieldWithTermVectors.setStoreTermVectorPositions(true);
63 textFieldWithTermVectors.setStoreTermVectorPayloads(true);
Nils Diewaldf399a672013-11-18 17:55:22 +000064
Nils Diewaldbb33da22015-03-04 16:24:25 +000065 Field textFieldAnalyzed = new Field("text", m.get("textStr"),
66 textFieldWithTermVectors);
Nils Diewaldf399a672013-11-18 17:55:22 +000067
Nils Diewald5c375702015-02-09 20:58:24 +000068 MultiTermTokenStream ts = getTermVector(m.get("text"));
Nils Diewaldf399a672013-11-18 17:55:22 +000069
Nils Diewaldbb33da22015-03-04 16:24:25 +000070 textFieldAnalyzed.setTokenStream(ts);
Nils Diewaldf399a672013-11-18 17:55:22 +000071
Nils Diewald5c375702015-02-09 20:58:24 +000072 doc.add(textFieldAnalyzed);
Nils Diewaldf399a672013-11-18 17:55:22 +000073
Nils Diewald5c375702015-02-09 20:58:24 +000074 // Add document to writer
75 w.addDocument(doc);
Nils Diewaldf399a672013-11-18 17:55:22 +000076 };
77
Akron69bdecc2018-11-01 11:14:15 +010078 public static FieldDocument simpleFieldDoc (String s) {
margaretha7f4fd652018-11-22 18:00:02 +010079 return simpleFieldDoc(s, "");
80 }
81
82 // Add document
83 public static FieldDocument simpleFieldDoc (String s, String delimiter) {
84 String[] characters = s.split(delimiter);
Akron69bdecc2018-11-01 11:14:15 +010085
86 FieldDocument fd = new FieldDocument();
87 String surface = "";
88 String annotation = "";
89
90 for (int i = 0; i < characters.length; i++) {
91 String fixChar = characters[i];
92 surface += fixChar;
Akron7a7319a2018-11-28 17:08:56 +010093 annotation += "[("+i+"-"+(i+1)+")s:"+fixChar;
94 if (i == 0)
95 annotation += "|<>:base/s:t$<b>64<i>0<i>" + characters.length + "<i>" + characters.length + "<b>0";
96 annotation += "|_"+i+"$<i>"+i+"<i>"+(i+1)+"]";
Akron69bdecc2018-11-01 11:14:15 +010097 };
98
99 fd.addTV("base",surface, annotation);
100 return fd;
101 };
Akron9526c2e2021-10-25 14:23:52 +0200102
Akron69bdecc2018-11-01 11:14:15 +0100103
margaretha7f4fd652018-11-22 18:00:02 +0100104 // Create a new FieldDocument with random data
Akron69bdecc2018-11-01 11:14:15 +0100105 public static FieldDocument simpleFuzzyFieldDoc (List<String> chars, int minLength, int maxLength) {
106 String surface = "";
107
108 for (int i = 0; i < (int)(Math.random() * (maxLength - minLength)) + minLength; i++) {
margaretha7f4fd652018-11-22 18:00:02 +0100109 String randomChar = chars.get((int)(Math.random() * chars.size()));
Akron69bdecc2018-11-01 11:14:15 +0100110 surface += randomChar;
111 };
112 return simpleFieldDoc(surface);
113
114 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000115
Akron9526c2e2021-10-25 14:23:52 +0200116 // Create a new FieldDocument with random data
117 public static FieldDocument annotatedFuzzyFieldDoc (List<String> chars, int minLength, int maxLength) {
118 FieldDocument fd = new FieldDocument();
119 String annotation = "";
120 String surface = "";
121
122 int l = (int)(Math.random() * (maxLength - minLength)) + minLength;
123
124 for (int i = 0; i < l; i++) {
125 String fixChar = chars.get((int)(Math.random() * chars.size()));
126 surface += fixChar;
127 annotation += "[("+i+"-"+(i+1)+")s:"+fixChar;
128 if (i == 0)
129 annotation += "|<>:base/s:t$<b>64<i>0<i>" + l + "<i>" + l + "<b>0";
130
131 for (int j = 0; j < (int)(Math.random() * 3); j++) {
132 fixChar = chars.get((int)(Math.random() * chars.size()));
133 annotation += "|a:" + fixChar;
134 };
135
136 annotation += "|_"+i+"$<i>"+i+"<i>"+(i+1)+"]";
137 };
138
139
140 fd.addTV("base",surface, annotation);
141 fd.addString("copy", annotation);
142 return fd;
143 };
144
Nils Diewald8db8f922014-10-24 17:43:13 +0000145 // Get Term Vector
Nils Diewaldf399a672013-11-18 17:55:22 +0000146 public static MultiTermTokenStream getTermVector (String stream) {
Nils Diewald5c375702015-02-09 20:58:24 +0000147 MultiTermTokenStream ts = new MultiTermTokenStream();
Nils Diewaldf399a672013-11-18 17:55:22 +0000148
Nils Diewald5c375702015-02-09 20:58:24 +0000149 int pos = 0;
150 for (String seg : stream.split(" ")) {
151 // System.err.println("** Prepare " + seg);
152 String[] tokens = seg.split("\\|");
Nils Diewaldf399a672013-11-18 17:55:22 +0000153
Nils Diewald5c375702015-02-09 20:58:24 +0000154 int i = 0;
Nils Diewaldf399a672013-11-18 17:55:22 +0000155
Nils Diewald5c375702015-02-09 20:58:24 +0000156 while (tokens[i].length() == 0)
157 i++;
Nils Diewaldf399a672013-11-18 17:55:22 +0000158
Nils Diewald5c375702015-02-09 20:58:24 +0000159 try {
160 MultiTermToken mtt = new MultiTermToken(tokens[i]);
161 // System.err.println("** Add term " + tokens[i]);
162 i++;
163 for (; i < tokens.length; i++) {
164 if (tokens[i].length() == 0)
165 continue;
166 mtt.add(tokens[i]);
167 };
168 ts.addMultiTermToken(mtt);
169 }
170 catch (CorpusDataException cde) {
171 fail(cde.getErrorCode() + ": " + cde.getMessage());
172 };
173 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000174
Nils Diewald5c375702015-02-09 20:58:24 +0000175 return ts;
Nils Diewaldf399a672013-11-18 17:55:22 +0000176 };
177
Nils Diewald5c375702015-02-09 20:58:24 +0000178
Nils Diewald8db8f922014-10-24 17:43:13 +0000179 // Get query wrapper based on json file
Akrond6f5f592018-06-19 15:58:16 +0200180 public static SpanQueryWrapper getJsonQuery (String jsonFile) throws QueryException {
Nils Diewald5c375702015-02-09 20:58:24 +0000181 SpanQueryWrapper sqwi;
Akrond6f5f592018-06-19 15:58:16 +0200182 String json = getJsonString(jsonFile);
183 sqwi = new KrillQuery("tokens").fromKoral(json);
Nils Diewald5c375702015-02-09 20:58:24 +0000184 return sqwi;
Nils Diewald8db8f922014-10-24 17:43:13 +0000185 };
186
187
188 // Get string
Eliza Margaretha805e27f2016-10-14 21:39:42 +0200189 public static String getJsonString (String path) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200190
Nils Diewald5c375702015-02-09 20:58:24 +0000191 StringBuilder contentBuilder = new StringBuilder();
Akron798e6a22018-06-18 15:29:35 +0200192 try {
Akron67d2ff02018-06-19 10:51:16 +0200193 BufferedReader in = new BufferedReader(
194 new InputStreamReader(
195 new FileInputStream(URLDecoder.decode(path, "UTF-8")),
196 "UTF-8"
197 )
198 );
Nils Diewald5c375702015-02-09 20:58:24 +0000199 String str;
200 while ((str = in.readLine()) != null) {
201 contentBuilder.append(str);
202 };
203 in.close();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000204 }
205 catch (IOException e) {
Nils Diewald5c375702015-02-09 20:58:24 +0000206 fail(e.getMessage());
207 }
208 return contentBuilder.toString();
Nils Diewald8db8f922014-10-24 17:43:13 +0000209 };
210
211
212 // getSpan Info
213 public static List<String> getSpanInfo (IndexReader reader, SpanQuery query)
Nils Diewaldbb33da22015-03-04 16:24:25 +0000214 throws IOException {
Nils Diewald5c375702015-02-09 20:58:24 +0000215 Map<Term, TermContext> termContexts = new HashMap<>();
216 List<String> spanArray = new ArrayList<>();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000217
Akron700c1eb2015-09-25 16:57:30 +0200218 for (LeafReaderContext atomic : reader.leaves()) {
Nils Diewald5c375702015-02-09 20:58:24 +0000219 Bits bitset = atomic.reader().getLiveDocs();
220 // Spans spans = NearSpansOrdered();
221 Spans spans = query.getSpans(atomic, bitset, termContexts);
Nils Diewaldf399a672013-11-18 17:55:22 +0000222
Nils Diewald5c375702015-02-09 20:58:24 +0000223 while (spans.next()) {
224 StringBuffer payloadString = new StringBuffer();
225 int docid = atomic.docBase + spans.doc();
226 if (spans.isPayloadAvailable()) {
227 for (byte[] payload : spans.getPayload()) {
228 /* retrieve payload for current matching span */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000229
Nils Diewald5c375702015-02-09 20:58:24 +0000230 payloadString.append(byte2int(payload)).append(",");
231 payloadString.append(byte2int(payload, 2));
232 // payloadString.append(byte2int(payload, 1));
233 payloadString.append(" (" + payload.length + ")");
234 payloadString.append(" | ");
235 };
236 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000237 spanArray.add("Doc: " + docid + " with " + spans.start() + "-"
238 + spans.end() + " || " + payloadString.toString());
Nils Diewald5c375702015-02-09 20:58:24 +0000239 };
240 };
241 return spanArray;
Nils Diewaldf399a672013-11-18 17:55:22 +0000242 };
margaretha7f4fd652018-11-22 18:00:02 +0100243
244
245 // Simple fuzzing test
246 public static void fuzzingTest (List<String> chars, Pattern resultPattern,
247 SpanQuery sq, int minTextLength, int maxTextLength, int maxDocs)
248 throws IOException, QueryException {
249
250 Krill ks = new Krill(sq);
251 String lastFailureConf = "";
252
253 // Multiple runs of corpus creation and query checks
254 for (int x = 0; x < 100000; x++) {
255 KrillIndex ki = new KrillIndex();
256 ArrayList<String> list = new ArrayList<String>();
257 int c = 0;
258
259 // Create a corpus of <= maxDocs fuzzy docs
260 for (int i = 0; i < (int) (Math.random() * maxDocs); i++) {
261 FieldDocument testDoc = simpleFuzzyFieldDoc(chars,
262 minTextLength, maxTextLength);
263 String testString = testDoc.doc.getField("base").stringValue();
264 Matcher m = resultPattern.matcher(testString);
265 list.add(testString);
266 int offset = 0;
267 while (m.find(offset)) {
268 c++;
269 offset = Math.max(0, m.start() + 1);
270 }
271 ki.addDoc(testDoc);
272 };
273
274 ki.commit();
275 Result kr = ks.apply(ki);
276
277 // Check if the regex-calculated matches are correct,
278 // otherwise
279 // spit out the corpus configurations
280 if (c != kr.getTotalResults()) {
281 String failureConf = "expected:" + c + ", actual:"
282 + kr.getTotalResults() + ", docs:" + list.toString();
283
284 // Try to keep the failing configuration small
285 if (lastFailureConf.length() == 0
286 || failureConf.length() < lastFailureConf.length()) {
287 System.err.println(failureConf);
288 lastFailureConf = failureConf;
289 minTextLength--;
290 maxDocs--;
291 };
292 };
293 };
294 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000295};