blob: d1dd6085986305d930c2be68c02542c434f3e238 [file] [log] [blame]
Eliza Margaretha01929182014-02-19 11:48:59 +00001package de.ids_mannheim.korap.index;
2
Nils Diewaldf399a672013-11-18 17:55:22 +00003import java.util.*;
4import java.io.*;
5
6import de.ids_mannheim.korap.analysis.MultiTermToken;
7import de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper;
8import de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper;
9import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper;
10import de.ids_mannheim.korap.query.SpanWithinQuery;
11
12import static de.ids_mannheim.korap.Test.*;
13
14import org.apache.lucene.analysis.standard.StandardAnalyzer;
15import org.apache.lucene.analysis.TokenFilter;
16import org.apache.lucene.analysis.TokenStream;
17import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
18
19import org.apache.lucene.index.Term;
20import org.apache.lucene.index.TermsEnum;
21import org.apache.lucene.index.TermContext;
22
23import org.apache.lucene.index.DocsAndPositionsEnum;
24import org.apache.lucene.index.DirectoryReader;
25import org.apache.lucene.index.IndexWriter;
26import org.apache.lucene.index.IndexWriterConfig;
27import org.apache.lucene.index.IndexWriterConfig.OpenMode;
28import org.apache.lucene.index.AtomicReaderContext;
29
30import org.apache.lucene.queryparser.classic.ParseException;
31import org.apache.lucene.queryparser.classic.QueryParser;
32
33import org.apache.lucene.search.IndexSearcher;
34import org.apache.lucene.search.Query;
35import org.apache.lucene.search.TermQuery;
36import org.apache.lucene.search.BooleanClause;
37import org.apache.lucene.search.BooleanQuery;
38import org.apache.lucene.search.PhraseQuery;
39import org.apache.lucene.search.NumericRangeQuery;
40import org.apache.lucene.search.spans.Spans;
41import org.apache.lucene.search.spans.SpanQuery;
42import org.apache.lucene.search.spans.SpanOrQuery;
43import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
44import org.apache.lucene.search.spans.SpanTermQuery;
45import org.apache.lucene.search.spans.SpanNearQuery;
46import org.apache.lucene.search.spans.SpanNotQuery;
47import org.apache.lucene.search.spans.NearSpansOrdered;
48import org.apache.lucene.search.WildcardQuery;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.TopScoreDocCollector;
51import org.apache.lucene.search.TopDocs;
52import org.apache.lucene.search.RegexpQuery;
53
54import org.apache.lucene.store.Directory;
55import org.apache.lucene.store.RAMDirectory;
56import org.apache.lucene.store.SimpleFSDirectory; // temporary
57
58import org.apache.lucene.util.Version;
59import org.apache.lucene.util.BytesRef;
60import org.apache.lucene.util.Bits;
61
62import static org.junit.Assert.*;
63import org.junit.Test;
64import org.junit.Ignore;
65import org.junit.runner.RunWith;
66import org.junit.runners.JUnit4;
67
68@RunWith(JUnit4.class)
69public class TestIndex { // extends LuceneTestCase {
70 // Create index in RAM
71 // private Directory index = new RAMDirectory();
72
73 private Directory index = new RAMDirectory();
74
75 @Test
76 public void multiTermToken () {
77 MultiTermToken test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
78 assertEquals(test.terms.get(0).term, "hunde");
79 assertEquals(test.terms.get(1).term, "pos:n");
80 assertEquals(test.terms.get(2).term, "m:gen:pl");
81 assertEquals(test.terms.get(0).posIncr, 1, 1);
82 assertEquals(test.terms.get(1).posIncr, 0, 1);
83 assertEquals(test.terms.get(2).posIncr, 0, 1);
84
85 test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
86 assertEquals(test.terms.get(0).term, "hunde");
87 assertEquals(test.terms.get(1).term, "pos:n");
88 assertEquals(test.terms.get(2).term, "m:gen:pl");
89 assertEquals(test.terms.get(0).posIncr, 1, 1);
90 assertEquals(test.terms.get(1).posIncr, 0, 1);
91 assertEquals(test.terms.get(2).posIncr, 0, 1);
92 };
93
94 private List initIndexer () throws IOException {
95 List<Map<String, String>> list = new ArrayList<>();
96
97 Map<String, String> d1 = new HashMap<String, String>();
98 d1.put("id", "w1");
99 d1.put("corpus", "wiki");
100 d1.put("author", "Nils Diewald");
101 d1.put("title", "Wikipedia");
102 d1.put("subtitle", "A test");
103 d1.put("pubDate", "20130701");
104 d1.put("pubPlace", "Mannheim");
105 d1.put("textClass", "news sports");
106 d1.put("textStr", "Er nahm den Hunden die Angst.");
107 d1.put("text", "Er#0-2|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#0-29$<i>7 " +
108 "nahm#3-7|VVFIN|nehmen|p:3;n:sg;t:past;m:ind| " +
109 "den#8-11|ART|der|c:acc;n:sg;g:masc| " +
110 "Hunden#12-18|NN|hund|c:acc;n:sg;g:masc| " +
111 "die#19-22|ART|der|c:nom;n:sg;g:fem| " +
112 "Angst#23-28|NN|angst|c:nom;n:sg;g:fem| " +
113 ".#28-29|$.|.||");
114 list.add(d1);
115
116 Map<String, String> d2 = new HashMap<String, String>();
117
118 d2.put("id", "w2");
119 d2.put("corpus", "wiki");
120 d2.put("author", "Peter Thomas");
121 d2.put("title", "Waldartikel");
122 d2.put("subtitle", "Another test");
123 d2.put("pubDate", "20130723");
124 d2.put("pubPlace", "Bielefeld");
125 d2.put("textClass", "news");
126 d2.put("textStr", "Sie liefen durch den Wald.");
127 d2.put("text", "Sie#0-3|PPER|sie|c:nom;p:3;n:pl;g:all|<>:s#0-26$<i>6 " +
128 "liefen#4-10|VVFIN|laufen|p:3;n:pl;t:past;m:ind| " +
129 "durch#11-16|APPR|durch|| " +
130 "den#17-20|ART|der|c:acc;n:sg;g:masc| " +
131 "Wald#21-25|NN|wald|c:acc;n:sg;g:masc| " +
132 ".#25-26|$.|.||");
133 list.add(d2);
134
135 Map<String, String> d3 = new HashMap<String, String>();
136 d3.put("id", "w3");
137 d3.put("corpus", "zeitung");
138 d3.put("author", "Michael Meier");
139 d3.put("title", "Angst");
140 d3.put("subtitle", "Starr vor Angst");
141 d3.put("pubDate", "20130713");
142 d3.put("pubPlace", "Bielefeld");
143 d3.put("textClass", "sports");
144 d3.put("textStr", "Er wagte nicht, sich zu ruehren. Er war starr vor Angst.");
145 d3.put("text", "Er#0-2|PPER|er|c:nom;n:sg;g:masc;p:3|<>:s#0-32$<i>8 " +
146 "wagte#3-8|VVFIN|wagen|p:3;n:sg;t:past;m:ind| " +
147 "nicht#9-14|PTKNEG|nicht|| " +
148 ",#14-15|$,|,|| " +
149 "sich#16-20|PRF|sich|c:acc;p:3;n:sg| " +
150 "zu#21-23|PTKZU|zu|| " +
151 "ruehren#24-31|VVFIN|ruehren|| " +
152 ".#31-32|$.|.|| " +
153 "Er#33-35|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#33-56$<i>14 " +
154 "war#36-39|VAFIN|sein|p:3;n:sg;t:past;m:ind| " +
155 "starr#40-45|ADJD|starr|comp:pos| " +
156 "vor#46-49|APPR|vor|| " +
157 "Angst#50-55|NN|angst|c:dat;n:sg;g:fem| " +
158 ".#55-56|$.|.||");
159 list.add(d3);
160
161 return list;
162 };
163
164 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000165 public void indexLucene () throws Exception {
Nils Diewaldf399a672013-11-18 17:55:22 +0000166
167 // Base analyzer for searching and indexing
168 StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
169
170 // Based on
171 // http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/
172 // analysis/Analyzer.html?is-external=true
173
174 // Create configuration with base analyzer
175 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
176
177 // Add a document 1 with the correct fields
178 IndexWriter w = new IndexWriter(index, config);
179
180 Collection docs = initIndexer();
181
182 @SuppressWarnings("unchecked")
183 Iterator<Map<String,String>> i = (Iterator<Map<String,String>>) docs.iterator();
184
185 for (; i.hasNext(); ) {
186 addDoc(w, i.next());
187 };
188
189 assertEquals(3, w.numDocs());
190
191 w.close();
192
193 // Check directory
194 DirectoryReader reader = DirectoryReader.open( index );
195 assertEquals(docs.size(), reader.maxDoc());
196 assertEquals(docs.size(), reader.numDocs());
197
198 // Check searcher
199 IndexSearcher searcher = new IndexSearcher( reader );
200
201 // textClass
202 // All texts of text class "news"
203 assertEquals(2,
204 searcher.search(
205 new TermQuery(
206 new Term("textClass", "news")
207 ), 10
208 ).totalHits
209 );
210
211 // textClass
212 // All texts of text class "sports"
213 assertEquals(2,
214 searcher.search(
215 new TermQuery(
216 new Term("textClass", "sports")
217 ), 10
218 ).totalHits
219 );
220
221 // TextIndex
222 // All docs containing "l:nehmen"
223 assertEquals(1,
224 searcher.search(
225 new TermQuery(
226 new Term("text", "l:nehmen")
227 ), 10
228 ).totalHits
229 );
230
231 // TextIndex
232 // All docs containing "s:den"
233 assertEquals(2,
234 searcher.search(
235 new TermQuery(
236 new Term("text", "s:den")
237 ), 10
238 ).totalHits
239 );
240
241 /*
242 assertEquals(3,
243 searcher.search(
244 new TermQuery(
245 new Term("text", "T")
246 ), 10
247 ).totalHits
248 );
249 */
250
251 // BooleanQuery
252 // All docs containing "s:den" and "l:sie"
253 TermQuery s_den = new TermQuery(new Term("text", "s:den"));
254 TermQuery l_sie = new TermQuery(new Term("text", "l:sie"));
255 BooleanQuery bool = new BooleanQuery();
256 bool.add(s_den, BooleanClause.Occur.MUST);
257 bool.add(l_sie, BooleanClause.Occur.MUST);
258
259 assertEquals(1, searcher.search(bool, 10).totalHits);
260
261 // BooleanQuery
262 // All docs containing "s:den" or "l:sie"
263 bool = new BooleanQuery();
264 bool.add(s_den, BooleanClause.Occur.SHOULD);
265 bool.add(l_sie, BooleanClause.Occur.SHOULD);
266 assertEquals(2, searcher.search(bool, 10).totalHits);
267
268
269 // RegexpQuery
270 // All docs containing ".{4}en" (liefen und Hunden)
271 RegexpQuery srquery = new RegexpQuery(
272 new Term("text", "s:.{4}en")
273 );
274 assertEquals(2, searcher.search(srquery, 10).totalHits);
275
276 // RegexpQuery
277 // All docs containing "E." (Er) (2x)
278 srquery = new RegexpQuery(
279 new Term("text", "s:E.")
280 );
281 assertEquals(2, searcher.search(srquery, 10).totalHits);
282
283 SpanRegexQueryWrapper ssrquery = new SpanRegexQueryWrapper("text", "s:E.");
284 assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits);
285
286
287 // RegexpQuery
288 // All docs containing "E." (er) (0x)
289 srquery = new RegexpQuery(
290 new Term("text", "s:e.")
291 );
292 assertEquals(0, searcher.search(srquery, 10).totalHits);
293
294 ssrquery = new SpanRegexQueryWrapper("text", "s:e.");
295 assertEquals(0, searcher.search(ssrquery.toQuery(), 10).totalHits);
296
297 // Check http://comments.gmane.org/gmane.comp.jakarta.lucene.user/52283
298 // for Carstens question on wildcards
299
300 // RegexpQuery
301 // All docs containing "E."/i ([Ee]r) (2x)
302 srquery = new RegexpQuery(
303 new Term("text", "i:e.")
304 );
305 assertEquals(2, searcher.search(srquery, 10).totalHits);
306
307 ssrquery = new SpanRegexQueryWrapper("text", "s:e.", true);
308 assertEquals("SpanMultiTermQueryWrapper(text:/i:e./)", ssrquery.toQuery().toString());
309 assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits);
310
311 // All docs containing "ng"/x (Angst) (2x)
312 srquery = new RegexpQuery(
313 new Term("text", "s:.*ng.*")
314 );
315 assertEquals(2, searcher.search(srquery, 10).totalHits);
316
317 // [base=angst]
318 SpanTermQuery stq = new SpanTermQuery(new Term("text", "l:angst"));
319 assertEquals(2, searcher.search(srquery, 10).totalHits);
320
321 // vor Angst
322 // [orth=vor][orth=Angst]
323 SpanNearQuery snquery = new SpanNearQuery(
324 new SpanQuery[] {
325 new SpanTermQuery(new Term("text", "s:vor")),
326 new SpanTermQuery(new Term("text", "s:Angst"))
327 },
328 1,
329 true
330 );
331 assertEquals(1, searcher.search(snquery, 10).totalHits);
332
333 // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem]
334 snquery = new SpanNearQuery(
335 new SpanQuery[] {
336 new SpanTermQuery(new Term("text", "p:VVFIN")),
337 new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery()
338 },
339 5, // slop
340 true // inOrder
341 // Possible: CollectPayloads
342 );
343 assertEquals(1, searcher.search(snquery, 10).totalHits);
344
345
346 // Spannearquery [p:VVFIN][m:acc:sg:masc]
347 snquery = new SpanNearQuery(
348 new SpanQuery[] {
349 new SpanTermQuery(new Term("text", "p:VVFIN")),
350 new SpanNearQuery(
351 new SpanQuery[] {
352 new SpanTermQuery(new Term("text", "m:c:acc")),
353 new SpanNearQuery(
354 new SpanQuery[] {
355 new SpanTermQuery(new Term("text", "m:n:sg")),
356 new SpanTermQuery(new Term("text", "m:g:masc"))
357 },
358 -1,
359 false
360 )
361 },
362 -1, // slop
363 false // inOrder
364 // Possible: CollectPayloads
365 )
366 // new SpanTermQuery(new Term("text", "m:-acc:--sg:masc"))
367 },
368 0, // slop
369 true // inOrder
370 // Possible: CollectPayloads
371 );
372 assertEquals(1, searcher.search(snquery, 10).totalHits);
373
374
375 // Spannearquery [p:VVFIN|m:3:sg:past:ind]
376 // Exact match!
377 snquery = new SpanNearQuery(
378 new SpanQuery[] {
379 new SpanTermQuery(new Term("text", "p:VVFIN")),
380 new SpanNearQuery(
381 new SpanQuery[] {
382 new SpanTermQuery(new Term("text", "m:p:3")),
383 new SpanNearQuery(
384 new SpanQuery[] {
385 new SpanTermQuery(new Term("text", "m:n:sg")),
386 new SpanNearQuery(
387 new SpanQuery[] {
388 new SpanTermQuery(new Term("text", "m:t:past")),
389 new SpanTermQuery(new Term("text", "m:m:ind")),
390 },
391 -1,
392 false
393 )
394 },
395 -1,
396 false
397 )
398 },
399 -1,
400 false
401 )
402 },
403 // new SpanTermQuery(new Term("text", "m:---3:--sg:past:-ind"))
404 -1, // slop
405 false // inOrder
406 // Possible: CollectPayloads
407 );
408 assertEquals(2, searcher.search(snquery, 10).totalHits);
409
410 // To make sure, this is not equal:
411 // Spannearquery [p:VVFIN & m:3:sg:past:ind]
412 // Exact match!
413 // Maybe it IS equal
414 snquery = new SpanNearQuery(
415 new SpanQuery[] {
416 new SpanTermQuery(new Term("text", "p:VVFIN")),
417 new SpanTermQuery(new Term("text", "m:p:3")),
418 new SpanTermQuery(new Term("text", "m:n:sg")),
419 new SpanTermQuery(new Term("text", "m:t:past")),
420 new SpanTermQuery(new Term("text", "m:m:ind")),
421 },
422 -1, // slop
423 false // inOrder
424 // Possible: CollectPayloads
425 );
426 assertNotEquals(2, searcher.search(snquery, 10).totalHits);
427 // assertEquals(2, searcher.search(snquery, 10).totalHits);
428
429 // Spannearquery [p:VVFIN & m:3:sg & past:ind]
430 SpanSegmentQueryWrapper sniquery = new SpanSegmentQueryWrapper(
431 "text",
432 "p:VVFIN",
433 "m:p:3",
434 "m:n:sg",
435 "m:t:past",
436 "m:m:ind"
437 );
438 assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits);
439
440
441 // Todo:
442
443 /*
444 sniquery = new SpanSegmentQuery(
445 "text",
446 "p:VVFIN",
447 "m:p:3",
448 "m:n:sg",
449 "m:t:past",
450 "m:m:ind"
451 );
452 */
453
454 // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem]
455 snquery = new SpanNearQuery(
456 new SpanQuery[] {
457 new SpanTermQuery(new Term("text", "p:VVFIN")),
458 new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery()
459 },
460 5, // slop
461 true // inOrder
462 // Possible: CollectPayloads
463 );
464 assertEquals(1, searcher.search(snquery, 10).totalHits);
465
466 sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past", "m:m:ind", "m:n:sg");
467 assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits);
468
469 // [p = VVFIN & m:p = 3 & m:t = past & m:n != pl] or
470 // [p = VVFIN & m:p = 3 & m:t = past & !m:n = pl]
471 // TODO: Problem: What should happen in case the category does not exist?
472 // pssible solution: & ( m:n != pl & exists(m:n))
473 sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past");
474 SpanQuery snqquery = new SpanNotQuery(sniquery.toQuery(), new SpanTermQuery(new Term("text", "m:n:pl")));
475 assertEquals(2, searcher.search(snqquery, 10).totalHits);
476
477 // [p = NN & (m:c: = dat | m:c = acc)]
478 snquery = new SpanNearQuery(
479 new SpanQuery[] {
480 new SpanTermQuery(new Term("text", "p:NN")),
481 new SpanOrQuery(
482 new SpanTermQuery( new Term("text", "m:c:nom" )),
483 new SpanTermQuery( new Term("text", "m:c:acc" ))
484 )
485 },
486 -1,
487 false
488 );
489
490 assertEquals(2, searcher.search(snqquery, 10).totalHits);
491
492 // [p = NN & !(m:c: = nom | m:c = acc)]
493 snqquery = new SpanNotQuery(
494 new SpanTermQuery(new Term("text", "p:NN")),
495 new SpanOrQuery(
496 new SpanTermQuery( new Term("text", "m:c:nom" )),
497 new SpanTermQuery( new Term("text", "m:c:acc" ))
498 )
499 );
500 assertEquals(1, searcher.search(snqquery, 10).totalHits);
501
502 // [p = NN & !(m:c = nom)]
503 snqquery = new SpanNotQuery(
504 new SpanTermQuery( new Term("text", "p:NN")),
505 new SpanTermQuery( new Term("text", "m:c:nom" ))
506 );
507 assertEquals(3, searcher.search(snqquery, 10).totalHits);
508
509 // [p=NN & !(m:c = acc)]
510 snqquery = new SpanNotQuery(
511 new SpanTermQuery( new Term("text", "p:NN")),
512 new SpanTermQuery( new Term("text", "m:c:acc" ))
513 );
514 assertEquals(2, searcher.search(snqquery, 10).totalHits);
515
516 // [p=PPER][][p=ART]
517 snquery = new SpanNearQuery(
518 new SpanQuery[] {
519 new SpanTermQuery( new Term("text", "p:PPER")),
520 new SpanNearQuery(
521 new SpanQuery[] {
522 new SpanTermQuery( new Term("text", "T")),
523 new SpanTermQuery( new Term("text", "p:ART"))
524 },
525 0,
526 true),
527 },
528 0,
529 true
530 );
531 assertEquals(1, searcher.search(snquery, 10).totalHits);
532
533
534 // Todo:
535 // [orth=się][]{2,4}[base=bać]
536 // [orth=się][orth!="[.!?,:]"]{,5}[base=bać]|[base=bać][base="on|ja|ty|my|wy"]?[orth=się]
537 // [pos=subst & orth="a.*"]{2}
538 // [tag=subst:sg:nom:n]
539 // [case==acc & case==gen] ??
540 // [case~acc & case~gen]
541 // [case~~acc]
542 // [base=bać][orth!=się]+[orth=się] within s
543
544 // [][][p:VAFIN] within s
545 // [][p:VAFIN] within s
546
547
548 // [][][p:VAFIN]
549 snquery = new SpanNearQuery(
550 new SpanQuery[] {
551 new SpanNearQuery(
552 new SpanQuery[] {
553 new SpanTermQuery( new Term("text", "T") ),
554 new SpanTermQuery( new Term("text", "T") )
555 },
556 0,
557 true
558 ),
559 new SpanTermQuery( new Term("text", "p:VAFIN") )
560 },
561 0,
562 true
563 );
564 assertEquals(1, searcher.search(snquery, 10).totalHits);
565
566/*
567http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene
568*/
569
570 StringBuilder payloadString = new StringBuilder();
571 Map<Term, TermContext> termContexts = new HashMap<>();
572 for (AtomicReaderContext atomic : reader.leaves()) {
573 Bits bitset = atomic.reader().getLiveDocs();
574 // Spans spans = NearSpansOrdered();
575 Spans spans = snquery.getSpans(atomic, bitset, termContexts);
576
577 while (spans.next()) {
578 int docid = atomic.docBase + spans.doc();
579 if (spans.isPayloadAvailable()) {
580 for (byte[] payload : spans.getPayload()) {
581 /* retrieve payload for current matching span */
582 payloadString.append(new String(payload));
583 payloadString.append(" | ");
584 };
585 };
586 };
587 };
588 // assertEquals(33, payloadString.length());
589 assertEquals(0, payloadString.length());
590
591
592
593 // [][][p:VAFIN]
594 // without collecting payloads
595 snquery = new SpanNearQuery(
596 new SpanQuery[] {
597 new SpanNearQuery(
598 new SpanQuery[] {
599 new SpanTermQuery( new Term("text", "T") ),
600 new SpanTermQuery( new Term("text", "T") )
601 },
602 0,
603 true,
604 false
605 ),
606 new SpanTermQuery( new Term("text", "p:VAFIN") )
607 },
608 0,
609 true,
610 false
611 );
612 assertEquals(1, searcher.search(snquery, 10).totalHits);
613
614 payloadString = new StringBuilder();
615 termContexts = new HashMap<>();
616 for (AtomicReaderContext atomic : reader.leaves()) {
617 Bits bitset = atomic.reader().getLiveDocs();
618 // Spans spans = NearSpansOrdered();
619 Spans spans = snquery.getSpans(atomic, bitset, termContexts);
620
621 while (spans.next()) {
622 int docid = atomic.docBase + spans.doc();
623 for (byte[] payload : spans.getPayload()) {
624 /* retrieve payload for current matching span */
625 payloadString.append(new String(payload));
626 payloadString.append(" | ");
627 };
628 };
629 };
630 assertEquals(0, payloadString.length());
631
632
633 // [][][p:VAFIN] in s
Nils Diewaldcc7c0b32014-07-31 19:58:22 +0000634 // ([e:s:<][]*[T] | [T & e:s:<]) [T] ([p:VAFIN & e:s:>] | [T][]*[e:s:>]
Nils Diewaldf399a672013-11-18 17:55:22 +0000635
636 /*
Nils Diewaldcc7c0b32014-07-31 19:58:22 +0000637
Nils Diewaldf399a672013-11-18 17:55:22 +0000638 SpanSegmentWithinQuery ssequery = new SpanSegmentWithinQuery(
639 "text","s", new SpanSegmentSequenceQuery("text", "T", "T", "p:VAFIN")
640 );
641 assertEquals(0, searcher.search(ssequery.toQuery(), 10).totalHits);
642
643 payloadString = new StringBuilder();
644 termContexts = new HashMap<>();
645 for (AtomicReaderContext atomic : reader.leaves()) {
646 Bits bitset = atomic.reader().getLiveDocs();
647 // Spans spans = NearSpansOrdered();
648 Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts);
649
650 while (spans.next()) {
651 int docid = atomic.docBase + spans.doc();
652 for (byte[] payload : spans.getPayload()) {
653 /// retrieve payload for current matching span
654 payloadString.append(new String(payload));
655 payloadString.append(" | ");
656 };
657 };
658 };
659 assertEquals(0, payloadString.length(), 1);
660
661 ssequery = new SpanSegmentWithinQuery(
662 "text","s", new SpanSegmentSequenceQuery("text", "T", "p:VAFIN")
663 );
664
665 assertEquals("for " + ssequery.toQuery(),
666 1, searcher.search(ssequery.toQuery(), 10).totalHits);
667
668 payloadString = new StringBuilder();
669 termContexts = new HashMap<>();
670 for (AtomicReaderContext atomic : reader.leaves()) {
671 Bits bitset = atomic.reader().getLiveDocs();
672 // Spans spans = NearSpansOrdered();
673 Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts);
674
675 while (spans.next()) {
676 int docid = atomic.docBase + spans.doc();
677 for (byte[] payload : spans.getPayload()) {
678 // retrieve payload for current matching span
679 payloadString.append(new String(payload));
680 payloadString.append(" | ");
681 };
682 fail("Doc: " + docid + " with " + spans.start() + "-" + spans.end() + " || " + payloadString.toString());
683 };
684 };
685 assertEquals(20, payloadString.length());
686
687 */
688
689 // --------------------______>
690
691
692
693 // Spans spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), ssequery.toQuery());
694 /*
695 TopDocs topDocs = is.search(snq, 1);
696 Set<String> payloadSet = new HashSet<String>();
697 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
698 while (spans.next()) {
699 Collection<byte[]> payloads = spans.getPayload();
700
701 for (final byte [] payload : payloads) {
702 payloadSet.add(new String(payload, "UTF-8"));
703 }
704 }
705 }
706 */
707
708
709 /*
710Alternativ:
711 IndexReader reader = writer.getReader();
712 writer.close();
713 IndexSearcher searcher = newSearcher(reader);
714
715 PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext());
716
717 Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr")));
718 if(VERBOSE)
719 System.out.println("Num payloads:" + payloads.size());
720 for (final byte [] bytes : payloads) {
721 if(VERBOSE)
722 System.out.println(new String(bytes, "UTF-8"));
723 }
724*/
725
726
727
728 /* new: */
729
730 // PayloadHelper helper = new PayloadHelper();
731
732 // Map<Term, TermContext> termContexts = new HashMap<>();
733//Spans spans;
734//spans = snquery.getSpans(searcher.getIndexReader());
735// searcher = helper.setUp(similarity, 1000);
736 /*
737 IndexReader reader = search.getReader(querycontainer.getFoundry());
738 Spans luceneSpans;
739 Bits bitset = atomic.reader().getLiveDocs();
740 for (byte[] payload : luceneSpans.getPayload())
741
742 /* Iterate over all matching documents */
743 /*
744 while (luceneSpans.next() && total < config.getMaxhits()) {
745 Span matchSpan;
746 StringBuilder payloadString = new StringBuilder();
747 int docid = atomic.docBase + luceneSpans.doc();
748 String docname = search.retrieveDocname(docid,
749 querycontainer.getFoundry());
750 total++;
751
752 for (byte[] payload : luceneSpans.getPayload())
753 */
754 /* retrieve payload for current matching span */
755 // payloadString.append(new String(payload));
756
757 /* create span containing result */
758 /*
759 matchSpan = new Span(docname);
760 matchSpan.setIndexdocid(docid);
761 matchSpan.setLayer(querycontainer.getLayer());
762 matchSpan.storePayloads(payloadString.toString());
763 matchSpans.add(matchSpan);
764*/
765 /*
766 * topdocs = searcher.search(new ConstantScoreQuery(corpusQ add
767 * position to list of positions to be considered for later
768 * searches
769 */
770 /*
771 validValues.put(docname,
772 matchSpan.getPayload(config.getPrefix()));
773 }
774*/
775
776
777 // Todo: API made by add() typisiert für queries, strings
778
779 // SpanPayloadCheckQuery for sentences!
780
781 /* Support regular expression in SpanSegmentQuery */
782 // new Regexp();
783 // new Term();
784
785 /*
786 Vielleicht: spanSegmentQuery(new Term(), new Wildcard(), new Regex());
787 */
788
789 // And Not ->
790 // SpanTermDiffQuery
791
792 /*
793 SpanNearQuery poquery = new SpanNearQuery(
794
795 );
796 */
797
798 reader.close();
799
800
801 };
802};