| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap; |
| 2 | |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 3 | import static de.ids_mannheim.korap.util.KrillByte.byte2int; |
| 4 | import static org.junit.Assert.fail; |
| 5 | |
| 6 | import java.io.BufferedReader; |
| 7 | import java.io.FileInputStream; |
| 8 | import java.io.IOException; |
| 9 | import java.io.InputStreamReader; |
| Eliza Margaretha | 805e27f | 2016-10-14 21:39:42 +0200 | [diff] [blame] | 10 | import java.net.URLDecoder; |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 11 | import java.util.ArrayList; |
| 12 | import java.util.HashMap; |
| 13 | import java.util.List; |
| 14 | import java.util.Map; |
| 15 | import java.util.regex.Matcher; |
| 16 | import java.util.regex.Pattern; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 17 | |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 18 | import org.apache.lucene.document.Document; |
| 19 | import org.apache.lucene.document.Field; |
| 20 | import org.apache.lucene.document.FieldType; |
| 21 | import org.apache.lucene.document.TextField; |
| 22 | import org.apache.lucene.index.IndexReader; |
| 23 | import org.apache.lucene.index.IndexWriter; |
| 24 | import org.apache.lucene.index.LeafReaderContext; |
| 25 | import org.apache.lucene.index.Term; |
| 26 | import org.apache.lucene.index.TermContext; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 27 | import org.apache.lucene.search.spans.SpanQuery; |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 28 | import org.apache.lucene.search.spans.Spans; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 29 | import org.apache.lucene.util.Bits; |
| margaretha | 7d89c05 | 2017-05-10 19:11:45 +0200 | [diff] [blame] | 30 | import org.slf4j.Logger; |
| 31 | import org.slf4j.LoggerFactory; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 32 | |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 33 | import de.ids_mannheim.korap.index.FieldDocument; |
| 34 | import de.ids_mannheim.korap.index.MultiTermToken; |
| 35 | import de.ids_mannheim.korap.index.MultiTermTokenStream; |
| 36 | import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper; |
| 37 | import de.ids_mannheim.korap.response.Result; |
| 38 | import de.ids_mannheim.korap.util.CorpusDataException; |
| 39 | import de.ids_mannheim.korap.util.QueryException; |
| 40 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 41 | /** |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 42 | * Helper class for testing the KrillIndex framework (Simple). |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 43 | * |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 44 | * @author diewald |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 45 | */ |
| 46 | public class TestSimple { |
| 47 | |
| margaretha | 7d89c05 | 2017-05-10 19:11:45 +0200 | [diff] [blame] | 48 | private static Logger log = LoggerFactory.getLogger(TestSimple.class); |
| 49 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 50 | // Add document |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 51 | public static void addDoc (IndexWriter w, Map<String, String> m) |
| 52 | throws IOException { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 53 | Document doc = new Document(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 54 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 55 | FieldType textFieldWithTermVectors = new FieldType( |
| 56 | TextField.TYPE_STORED); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 57 | textFieldWithTermVectors.setStoreTermVectors(true); |
| 58 | /* |
| 59 | No offsets are stored. |
| 60 | textFieldWithTermVectors.setStoreTermVectorOffsets(true); |
| 61 | */ |
| 62 | textFieldWithTermVectors.setStoreTermVectorPositions(true); |
| 63 | textFieldWithTermVectors.setStoreTermVectorPayloads(true); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 64 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 65 | Field textFieldAnalyzed = new Field("text", m.get("textStr"), |
| 66 | textFieldWithTermVectors); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 67 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 68 | MultiTermTokenStream ts = getTermVector(m.get("text")); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 69 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 70 | textFieldAnalyzed.setTokenStream(ts); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 71 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 72 | doc.add(textFieldAnalyzed); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 73 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 74 | // Add document to writer |
| 75 | w.addDocument(doc); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 76 | }; |
| 77 | |
| Akron | 69bdecc | 2018-11-01 11:14:15 +0100 | [diff] [blame] | 78 | public static FieldDocument simpleFieldDoc (String s) { |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 79 | return simpleFieldDoc(s, ""); |
| 80 | } |
| 81 | |
| 82 | // Add document |
| 83 | public static FieldDocument simpleFieldDoc (String s, String delimiter) { |
| 84 | String[] characters = s.split(delimiter); |
| Akron | 69bdecc | 2018-11-01 11:14:15 +0100 | [diff] [blame] | 85 | |
| 86 | FieldDocument fd = new FieldDocument(); |
| 87 | String surface = ""; |
| 88 | String annotation = ""; |
| 89 | |
| 90 | for (int i = 0; i < characters.length; i++) { |
| 91 | String fixChar = characters[i]; |
| 92 | surface += fixChar; |
| Akron | 7a7319a | 2018-11-28 17:08:56 +0100 | [diff] [blame] | 93 | annotation += "[("+i+"-"+(i+1)+")s:"+fixChar; |
| 94 | if (i == 0) |
| 95 | annotation += "|<>:base/s:t$<b>64<i>0<i>" + characters.length + "<i>" + characters.length + "<b>0"; |
| 96 | annotation += "|_"+i+"$<i>"+i+"<i>"+(i+1)+"]"; |
| Akron | 69bdecc | 2018-11-01 11:14:15 +0100 | [diff] [blame] | 97 | }; |
| 98 | |
| 99 | fd.addTV("base",surface, annotation); |
| 100 | return fd; |
| 101 | }; |
| Akron | 9526c2e | 2021-10-25 14:23:52 +0200 | [diff] [blame] | 102 | |
| Akron | 69bdecc | 2018-11-01 11:14:15 +0100 | [diff] [blame] | 103 | |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 104 | // Create a new FieldDocument with random data |
| Akron | 69bdecc | 2018-11-01 11:14:15 +0100 | [diff] [blame] | 105 | public static FieldDocument simpleFuzzyFieldDoc (List<String> chars, int minLength, int maxLength) { |
| 106 | String surface = ""; |
| 107 | |
| 108 | for (int i = 0; i < (int)(Math.random() * (maxLength - minLength)) + minLength; i++) { |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 109 | String randomChar = chars.get((int)(Math.random() * chars.size())); |
| Akron | 69bdecc | 2018-11-01 11:14:15 +0100 | [diff] [blame] | 110 | surface += randomChar; |
| 111 | }; |
| 112 | return simpleFieldDoc(surface); |
| 113 | |
| 114 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 115 | |
| Akron | 9526c2e | 2021-10-25 14:23:52 +0200 | [diff] [blame] | 116 | // Create a new FieldDocument with random data |
| 117 | public static FieldDocument annotatedFuzzyFieldDoc (List<String> chars, int minLength, int maxLength) { |
| 118 | FieldDocument fd = new FieldDocument(); |
| 119 | String annotation = ""; |
| 120 | String surface = ""; |
| 121 | |
| 122 | int l = (int)(Math.random() * (maxLength - minLength)) + minLength; |
| 123 | |
| 124 | for (int i = 0; i < l; i++) { |
| 125 | String fixChar = chars.get((int)(Math.random() * chars.size())); |
| 126 | surface += fixChar; |
| 127 | annotation += "[("+i+"-"+(i+1)+")s:"+fixChar; |
| 128 | if (i == 0) |
| 129 | annotation += "|<>:base/s:t$<b>64<i>0<i>" + l + "<i>" + l + "<b>0"; |
| 130 | |
| 131 | for (int j = 0; j < (int)(Math.random() * 3); j++) { |
| 132 | fixChar = chars.get((int)(Math.random() * chars.size())); |
| 133 | annotation += "|a:" + fixChar; |
| 134 | }; |
| 135 | |
| 136 | annotation += "|_"+i+"$<i>"+i+"<i>"+(i+1)+"]"; |
| 137 | }; |
| 138 | |
| 139 | |
| 140 | fd.addTV("base",surface, annotation); |
| 141 | fd.addString("copy", annotation); |
| 142 | return fd; |
| 143 | }; |
| 144 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 145 | // Get Term Vector |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 146 | public static MultiTermTokenStream getTermVector (String stream) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 147 | MultiTermTokenStream ts = new MultiTermTokenStream(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 148 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 149 | int pos = 0; |
| 150 | for (String seg : stream.split(" ")) { |
| 151 | // System.err.println("** Prepare " + seg); |
| 152 | String[] tokens = seg.split("\\|"); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 153 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 154 | int i = 0; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 155 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 156 | while (tokens[i].length() == 0) |
| 157 | i++; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 158 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 159 | try { |
| 160 | MultiTermToken mtt = new MultiTermToken(tokens[i]); |
| 161 | // System.err.println("** Add term " + tokens[i]); |
| 162 | i++; |
| 163 | for (; i < tokens.length; i++) { |
| 164 | if (tokens[i].length() == 0) |
| 165 | continue; |
| 166 | mtt.add(tokens[i]); |
| 167 | }; |
| 168 | ts.addMultiTermToken(mtt); |
| 169 | } |
| 170 | catch (CorpusDataException cde) { |
| 171 | fail(cde.getErrorCode() + ": " + cde.getMessage()); |
| 172 | }; |
| 173 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 174 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 175 | return ts; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 176 | }; |
| 177 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 178 | |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 179 | // Get query wrapper based on json file |
| Akron | d6f5f59 | 2018-06-19 15:58:16 +0200 | [diff] [blame] | 180 | public static SpanQueryWrapper getJsonQuery (String jsonFile) throws QueryException { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 181 | SpanQueryWrapper sqwi; |
| Akron | d6f5f59 | 2018-06-19 15:58:16 +0200 | [diff] [blame] | 182 | String json = getJsonString(jsonFile); |
| 183 | sqwi = new KrillQuery("tokens").fromKoral(json); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 184 | return sqwi; |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 185 | }; |
| 186 | |
| 187 | |
| 188 | // Get string |
| Eliza Margaretha | 805e27f | 2016-10-14 21:39:42 +0200 | [diff] [blame] | 189 | public static String getJsonString (String path) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 190 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 191 | StringBuilder contentBuilder = new StringBuilder(); |
| Akron | 798e6a2 | 2018-06-18 15:29:35 +0200 | [diff] [blame] | 192 | try { |
| Akron | 67d2ff0 | 2018-06-19 10:51:16 +0200 | [diff] [blame] | 193 | BufferedReader in = new BufferedReader( |
| 194 | new InputStreamReader( |
| 195 | new FileInputStream(URLDecoder.decode(path, "UTF-8")), |
| 196 | "UTF-8" |
| 197 | ) |
| 198 | ); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 199 | String str; |
| 200 | while ((str = in.readLine()) != null) { |
| 201 | contentBuilder.append(str); |
| 202 | }; |
| 203 | in.close(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 204 | } |
| 205 | catch (IOException e) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 206 | fail(e.getMessage()); |
| 207 | } |
| 208 | return contentBuilder.toString(); |
| Nils Diewald | 8db8f92 | 2014-10-24 17:43:13 +0000 | [diff] [blame] | 209 | }; |
| 210 | |
| 211 | |
| 212 | // getSpan Info |
| 213 | public static List<String> getSpanInfo (IndexReader reader, SpanQuery query) |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 214 | throws IOException { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 215 | Map<Term, TermContext> termContexts = new HashMap<>(); |
| 216 | List<String> spanArray = new ArrayList<>(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 217 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 218 | for (LeafReaderContext atomic : reader.leaves()) { |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 219 | Bits bitset = atomic.reader().getLiveDocs(); |
| 220 | // Spans spans = NearSpansOrdered(); |
| 221 | Spans spans = query.getSpans(atomic, bitset, termContexts); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 222 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 223 | while (spans.next()) { |
| 224 | StringBuffer payloadString = new StringBuffer(); |
| 225 | int docid = atomic.docBase + spans.doc(); |
| 226 | if (spans.isPayloadAvailable()) { |
| 227 | for (byte[] payload : spans.getPayload()) { |
| 228 | /* retrieve payload for current matching span */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 229 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 230 | payloadString.append(byte2int(payload)).append(","); |
| 231 | payloadString.append(byte2int(payload, 2)); |
| 232 | // payloadString.append(byte2int(payload, 1)); |
| 233 | payloadString.append(" (" + payload.length + ")"); |
| 234 | payloadString.append(" | "); |
| 235 | }; |
| 236 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 237 | spanArray.add("Doc: " + docid + " with " + spans.start() + "-" |
| 238 | + spans.end() + " || " + payloadString.toString()); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 239 | }; |
| 240 | }; |
| 241 | return spanArray; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 242 | }; |
| margaretha | 7f4fd65 | 2018-11-22 18:00:02 +0100 | [diff] [blame] | 243 | |
| 244 | |
| 245 | // Simple fuzzing test |
| 246 | public static void fuzzingTest (List<String> chars, Pattern resultPattern, |
| 247 | SpanQuery sq, int minTextLength, int maxTextLength, int maxDocs) |
| 248 | throws IOException, QueryException { |
| 249 | |
| 250 | Krill ks = new Krill(sq); |
| 251 | String lastFailureConf = ""; |
| 252 | |
| 253 | // Multiple runs of corpus creation and query checks |
| 254 | for (int x = 0; x < 100000; x++) { |
| 255 | KrillIndex ki = new KrillIndex(); |
| 256 | ArrayList<String> list = new ArrayList<String>(); |
| 257 | int c = 0; |
| 258 | |
| 259 | // Create a corpus of <= maxDocs fuzzy docs |
| 260 | for (int i = 0; i < (int) (Math.random() * maxDocs); i++) { |
| 261 | FieldDocument testDoc = simpleFuzzyFieldDoc(chars, |
| 262 | minTextLength, maxTextLength); |
| 263 | String testString = testDoc.doc.getField("base").stringValue(); |
| 264 | Matcher m = resultPattern.matcher(testString); |
| 265 | list.add(testString); |
| 266 | int offset = 0; |
| 267 | while (m.find(offset)) { |
| 268 | c++; |
| 269 | offset = Math.max(0, m.start() + 1); |
| 270 | } |
| 271 | ki.addDoc(testDoc); |
| 272 | }; |
| 273 | |
| 274 | ki.commit(); |
| 275 | Result kr = ks.apply(ki); |
| 276 | |
| 277 | // Check if the regex-calculated matches are correct, |
| 278 | // otherwise |
| 279 | // spit out the corpus configurations |
| 280 | if (c != kr.getTotalResults()) { |
| 281 | String failureConf = "expected:" + c + ", actual:" |
| 282 | + kr.getTotalResults() + ", docs:" + list.toString(); |
| 283 | |
| 284 | // Try to keep the failing configuration small |
| 285 | if (lastFailureConf.length() == 0 |
| 286 | || failureConf.length() < lastFailureConf.length()) { |
| 287 | System.err.println(failureConf); |
| 288 | lastFailureConf = failureConf; |
| 289 | minTextLength--; |
| 290 | maxDocs--; |
| 291 | }; |
| 292 | }; |
| 293 | }; |
| 294 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 295 | }; |