| package de.ids_mannheim.korap.index; |
| |
| import java.util.*; |
| import java.io.*; |
| |
| import de.ids_mannheim.korap.analysis.MultiTermToken; |
| import de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper; |
| import de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper; |
| import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper; |
| import de.ids_mannheim.korap.query.SpanWithinQuery; |
| |
| import static de.ids_mannheim.korap.Test.*; |
| |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.index.TermContext; |
| |
| import org.apache.lucene.index.DocsAndPositionsEnum; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| import org.apache.lucene.index.AtomicReaderContext; |
| |
| import org.apache.lucene.queryparser.classic.ParseException; |
| import org.apache.lucene.queryparser.classic.QueryParser; |
| |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.search.NumericRangeQuery; |
| import org.apache.lucene.search.spans.Spans; |
| import org.apache.lucene.search.spans.SpanQuery; |
| import org.apache.lucene.search.spans.SpanOrQuery; |
| import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; |
| import org.apache.lucene.search.spans.SpanTermQuery; |
| import org.apache.lucene.search.spans.SpanNearQuery; |
| import org.apache.lucene.search.spans.SpanNotQuery; |
| import org.apache.lucene.search.spans.NearSpansOrdered; |
| import org.apache.lucene.search.WildcardQuery; |
| import org.apache.lucene.search.ScoreDoc; |
| import org.apache.lucene.search.TopScoreDocCollector; |
| import org.apache.lucene.search.TopDocs; |
| import org.apache.lucene.search.RegexpQuery; |
| |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.RAMDirectory; |
| import org.apache.lucene.store.SimpleFSDirectory; // temporary |
| |
| import org.apache.lucene.util.Version; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.Bits; |
| |
| import static org.junit.Assert.*; |
| import org.junit.Test; |
| import org.junit.Ignore; |
| import org.junit.runner.RunWith; |
| import org.junit.runners.JUnit4; |
| |
| @RunWith(JUnit4.class) |
| public class TestIndex { // extends LuceneTestCase { |
| // Create index in RAM |
| // private Directory index = new RAMDirectory(); |
| |
| private Directory index = new RAMDirectory(); |
| |
| @Test |
| public void multiTermToken () { |
| MultiTermToken test = new MultiTermToken("hunde", "pos:n", "m:gen:pl"); |
| assertEquals(test.terms.get(0).term, "hunde"); |
| assertEquals(test.terms.get(1).term, "pos:n"); |
| assertEquals(test.terms.get(2).term, "m:gen:pl"); |
| assertEquals(test.terms.get(0).posIncr, 1, 1); |
| assertEquals(test.terms.get(1).posIncr, 0, 1); |
| assertEquals(test.terms.get(2).posIncr, 0, 1); |
| |
| test = new MultiTermToken("hunde", "pos:n", "m:gen:pl"); |
| assertEquals(test.terms.get(0).term, "hunde"); |
| assertEquals(test.terms.get(1).term, "pos:n"); |
| assertEquals(test.terms.get(2).term, "m:gen:pl"); |
| assertEquals(test.terms.get(0).posIncr, 1, 1); |
| assertEquals(test.terms.get(1).posIncr, 0, 1); |
| assertEquals(test.terms.get(2).posIncr, 0, 1); |
| }; |
| |
| private List initIndexer () throws IOException { |
| List<Map<String, String>> list = new ArrayList<>(); |
| |
| Map<String, String> d1 = new HashMap<String, String>(); |
| d1.put("id", "w1"); |
| d1.put("corpus", "wiki"); |
| d1.put("author", "Nils Diewald"); |
| d1.put("title", "Wikipedia"); |
| d1.put("subtitle", "A test"); |
| d1.put("pubDate", "20130701"); |
| d1.put("pubPlace", "Mannheim"); |
| d1.put("textClass", "news sports"); |
| d1.put("textStr", "Er nahm den Hunden die Angst."); |
| d1.put("text", "Er#0-2|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#0-29$<i>7 " + |
| "nahm#3-7|VVFIN|nehmen|p:3;n:sg;t:past;m:ind| " + |
| "den#8-11|ART|der|c:acc;n:sg;g:masc| " + |
| "Hunden#12-18|NN|hund|c:acc;n:sg;g:masc| " + |
| "die#19-22|ART|der|c:nom;n:sg;g:fem| " + |
| "Angst#23-28|NN|angst|c:nom;n:sg;g:fem| " + |
| ".#28-29|$.|.||"); |
| list.add(d1); |
| |
| Map<String, String> d2 = new HashMap<String, String>(); |
| |
| d2.put("id", "w2"); |
| d2.put("corpus", "wiki"); |
| d2.put("author", "Peter Thomas"); |
| d2.put("title", "Waldartikel"); |
| d2.put("subtitle", "Another test"); |
| d2.put("pubDate", "20130723"); |
| d2.put("pubPlace", "Bielefeld"); |
| d2.put("textClass", "news"); |
| d2.put("textStr", "Sie liefen durch den Wald."); |
| d2.put("text", "Sie#0-3|PPER|sie|c:nom;p:3;n:pl;g:all|<>:s#0-26$<i>6 " + |
| "liefen#4-10|VVFIN|laufen|p:3;n:pl;t:past;m:ind| " + |
| "durch#11-16|APPR|durch|| " + |
| "den#17-20|ART|der|c:acc;n:sg;g:masc| " + |
| "Wald#21-25|NN|wald|c:acc;n:sg;g:masc| " + |
| ".#25-26|$.|.||"); |
| list.add(d2); |
| |
| Map<String, String> d3 = new HashMap<String, String>(); |
| d3.put("id", "w3"); |
| d3.put("corpus", "zeitung"); |
| d3.put("author", "Michael Meier"); |
| d3.put("title", "Angst"); |
| d3.put("subtitle", "Starr vor Angst"); |
| d3.put("pubDate", "20130713"); |
| d3.put("pubPlace", "Bielefeld"); |
| d3.put("textClass", "sports"); |
| d3.put("textStr", "Er wagte nicht, sich zu ruehren. Er war starr vor Angst."); |
| d3.put("text", "Er#0-2|PPER|er|c:nom;n:sg;g:masc;p:3|<>:s#0-32$<i>8 " + |
| "wagte#3-8|VVFIN|wagen|p:3;n:sg;t:past;m:ind| " + |
| "nicht#9-14|PTKNEG|nicht|| " + |
| ",#14-15|$,|,|| " + |
| "sich#16-20|PRF|sich|c:acc;p:3;n:sg| " + |
| "zu#21-23|PTKZU|zu|| " + |
| "ruehren#24-31|VVFIN|ruehren|| " + |
| ".#31-32|$.|.|| " + |
| "Er#33-35|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#33-56$<i>14 " + |
| "war#36-39|VAFIN|sein|p:3;n:sg;t:past;m:ind| " + |
| "starr#40-45|ADJD|starr|comp:pos| " + |
| "vor#46-49|APPR|vor|| " + |
| "Angst#50-55|NN|angst|c:dat;n:sg;g:fem| " + |
| ".#55-56|$.|.||"); |
| list.add(d3); |
| |
| return list; |
| }; |
| |
| @Test |
| public void indexLucene () throws IOException { |
| |
| // Base analyzer for searching and indexing |
| StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); |
| |
| // Based on |
| // http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/ |
| // analysis/Analyzer.html?is-external=true |
| |
| // Create configuration with base analyzer |
| IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer); |
| |
| // Add a document 1 with the correct fields |
| IndexWriter w = new IndexWriter(index, config); |
| |
| Collection docs = initIndexer(); |
| |
| @SuppressWarnings("unchecked") |
| Iterator<Map<String,String>> i = (Iterator<Map<String,String>>) docs.iterator(); |
| |
| for (; i.hasNext(); ) { |
| addDoc(w, i.next()); |
| }; |
| |
| assertEquals(3, w.numDocs()); |
| |
| w.close(); |
| |
| // Check directory |
| DirectoryReader reader = DirectoryReader.open( index ); |
| assertEquals(docs.size(), reader.maxDoc()); |
| assertEquals(docs.size(), reader.numDocs()); |
| |
| // Check searcher |
| IndexSearcher searcher = new IndexSearcher( reader ); |
| |
| // textClass |
| // All texts of text class "news" |
| assertEquals(2, |
| searcher.search( |
| new TermQuery( |
| new Term("textClass", "news") |
| ), 10 |
| ).totalHits |
| ); |
| |
| // textClass |
| // All texts of text class "sports" |
| assertEquals(2, |
| searcher.search( |
| new TermQuery( |
| new Term("textClass", "sports") |
| ), 10 |
| ).totalHits |
| ); |
| |
| // TextIndex |
| // All docs containing "l:nehmen" |
| assertEquals(1, |
| searcher.search( |
| new TermQuery( |
| new Term("text", "l:nehmen") |
| ), 10 |
| ).totalHits |
| ); |
| |
| // TextIndex |
| // All docs containing "s:den" |
| assertEquals(2, |
| searcher.search( |
| new TermQuery( |
| new Term("text", "s:den") |
| ), 10 |
| ).totalHits |
| ); |
| |
| /* |
| assertEquals(3, |
| searcher.search( |
| new TermQuery( |
| new Term("text", "T") |
| ), 10 |
| ).totalHits |
| ); |
| */ |
| |
| // BooleanQuery |
| // All docs containing "s:den" and "l:sie" |
| TermQuery s_den = new TermQuery(new Term("text", "s:den")); |
| TermQuery l_sie = new TermQuery(new Term("text", "l:sie")); |
| BooleanQuery bool = new BooleanQuery(); |
| bool.add(s_den, BooleanClause.Occur.MUST); |
| bool.add(l_sie, BooleanClause.Occur.MUST); |
| |
| assertEquals(1, searcher.search(bool, 10).totalHits); |
| |
| // BooleanQuery |
| // All docs containing "s:den" or "l:sie" |
| bool = new BooleanQuery(); |
| bool.add(s_den, BooleanClause.Occur.SHOULD); |
| bool.add(l_sie, BooleanClause.Occur.SHOULD); |
| assertEquals(2, searcher.search(bool, 10).totalHits); |
| |
| |
| // RegexpQuery |
| // All docs containing ".{4}en" (liefen und Hunden) |
| RegexpQuery srquery = new RegexpQuery( |
| new Term("text", "s:.{4}en") |
| ); |
| assertEquals(2, searcher.search(srquery, 10).totalHits); |
| |
| // RegexpQuery |
| // All docs containing "E." (Er) (2x) |
| srquery = new RegexpQuery( |
| new Term("text", "s:E.") |
| ); |
| assertEquals(2, searcher.search(srquery, 10).totalHits); |
| |
| SpanRegexQueryWrapper ssrquery = new SpanRegexQueryWrapper("text", "s:E."); |
| assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| |
| |
| // RegexpQuery |
| // All docs containing "E." (er) (0x) |
| srquery = new RegexpQuery( |
| new Term("text", "s:e.") |
| ); |
| assertEquals(0, searcher.search(srquery, 10).totalHits); |
| |
| ssrquery = new SpanRegexQueryWrapper("text", "s:e."); |
| assertEquals(0, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| |
| // Check http://comments.gmane.org/gmane.comp.jakarta.lucene.user/52283 |
| // for Carstens question on wildcards |
| |
| // RegexpQuery |
| // All docs containing "E."/i ([Ee]r) (2x) |
| srquery = new RegexpQuery( |
| new Term("text", "i:e.") |
| ); |
| assertEquals(2, searcher.search(srquery, 10).totalHits); |
| |
| ssrquery = new SpanRegexQueryWrapper("text", "s:e.", true); |
| assertEquals("SpanMultiTermQueryWrapper(text:/i:e./)", ssrquery.toQuery().toString()); |
| assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| |
| // All docs containing "ng"/x (Angst) (2x) |
| srquery = new RegexpQuery( |
| new Term("text", "s:.*ng.*") |
| ); |
| assertEquals(2, searcher.search(srquery, 10).totalHits); |
| |
| // [base=angst] |
| SpanTermQuery stq = new SpanTermQuery(new Term("text", "l:angst")); |
| assertEquals(2, searcher.search(srquery, 10).totalHits); |
| |
| // vor Angst |
| // [orth=vor][orth=Angst] |
| SpanNearQuery snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "s:vor")), |
| new SpanTermQuery(new Term("text", "s:Angst")) |
| }, |
| 1, |
| true |
| ); |
| assertEquals(1, searcher.search(snquery, 10).totalHits); |
| |
| // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem] |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "p:VVFIN")), |
| new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery() |
| }, |
| 5, // slop |
| true // inOrder |
| // Possible: CollectPayloads |
| ); |
| assertEquals(1, searcher.search(snquery, 10).totalHits); |
| |
| |
| // Spannearquery [p:VVFIN][m:acc:sg:masc] |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "p:VVFIN")), |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "m:c:acc")), |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "m:n:sg")), |
| new SpanTermQuery(new Term("text", "m:g:masc")) |
| }, |
| -1, |
| false |
| ) |
| }, |
| -1, // slop |
| false // inOrder |
| // Possible: CollectPayloads |
| ) |
| // new SpanTermQuery(new Term("text", "m:-acc:--sg:masc")) |
| }, |
| 0, // slop |
| true // inOrder |
| // Possible: CollectPayloads |
| ); |
| assertEquals(1, searcher.search(snquery, 10).totalHits); |
| |
| |
| // Spannearquery [p:VVFIN|m:3:sg:past:ind] |
| // Exact match! |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "p:VVFIN")), |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "m:p:3")), |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "m:n:sg")), |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "m:t:past")), |
| new SpanTermQuery(new Term("text", "m:m:ind")), |
| }, |
| -1, |
| false |
| ) |
| }, |
| -1, |
| false |
| ) |
| }, |
| -1, |
| false |
| ) |
| }, |
| // new SpanTermQuery(new Term("text", "m:---3:--sg:past:-ind")) |
| -1, // slop |
| false // inOrder |
| // Possible: CollectPayloads |
| ); |
| assertEquals(2, searcher.search(snquery, 10).totalHits); |
| |
| // To make sure, this is not equal: |
| // Spannearquery [p:VVFIN & m:3:sg:past:ind] |
| // Exact match! |
| // Maybe it IS equal |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "p:VVFIN")), |
| new SpanTermQuery(new Term("text", "m:p:3")), |
| new SpanTermQuery(new Term("text", "m:n:sg")), |
| new SpanTermQuery(new Term("text", "m:t:past")), |
| new SpanTermQuery(new Term("text", "m:m:ind")), |
| }, |
| -1, // slop |
| false // inOrder |
| // Possible: CollectPayloads |
| ); |
| assertNotEquals(2, searcher.search(snquery, 10).totalHits); |
| // assertEquals(2, searcher.search(snquery, 10).totalHits); |
| |
| // Spannearquery [p:VVFIN & m:3:sg & past:ind] |
| SpanSegmentQueryWrapper sniquery = new SpanSegmentQueryWrapper( |
| "text", |
| "p:VVFIN", |
| "m:p:3", |
| "m:n:sg", |
| "m:t:past", |
| "m:m:ind" |
| ); |
| assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits); |
| |
| |
| // Todo: |
| |
| /* |
| sniquery = new SpanSegmentQuery( |
| "text", |
| "p:VVFIN", |
| "m:p:3", |
| "m:n:sg", |
| "m:t:past", |
| "m:m:ind" |
| ); |
| */ |
| |
| // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem] |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "p:VVFIN")), |
| new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery() |
| }, |
| 5, // slop |
| true // inOrder |
| // Possible: CollectPayloads |
| ); |
| assertEquals(1, searcher.search(snquery, 10).totalHits); |
| |
| sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past", "m:m:ind", "m:n:sg"); |
| assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits); |
| |
| // [p = VVFIN & m:p = 3 & m:t = past & m:n != pl] or |
| // [p = VVFIN & m:p = 3 & m:t = past & !m:n = pl] |
| // TODO: Problem: What should happen in case the category does not exist? |
| // pssible solution: & ( m:n != pl & exists(m:n)) |
| sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past"); |
| SpanQuery snqquery = new SpanNotQuery(sniquery.toQuery(), new SpanTermQuery(new Term("text", "m:n:pl"))); |
| assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| |
| // [p = NN & (m:c: = dat | m:c = acc)] |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery(new Term("text", "p:NN")), |
| new SpanOrQuery( |
| new SpanTermQuery( new Term("text", "m:c:nom" )), |
| new SpanTermQuery( new Term("text", "m:c:acc" )) |
| ) |
| }, |
| -1, |
| false |
| ); |
| |
| assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| |
| // [p = NN & !(m:c: = nom | m:c = acc)] |
| snqquery = new SpanNotQuery( |
| new SpanTermQuery(new Term("text", "p:NN")), |
| new SpanOrQuery( |
| new SpanTermQuery( new Term("text", "m:c:nom" )), |
| new SpanTermQuery( new Term("text", "m:c:acc" )) |
| ) |
| ); |
| assertEquals(1, searcher.search(snqquery, 10).totalHits); |
| |
| // [p = NN & !(m:c = nom)] |
| snqquery = new SpanNotQuery( |
| new SpanTermQuery( new Term("text", "p:NN")), |
| new SpanTermQuery( new Term("text", "m:c:nom" )) |
| ); |
| assertEquals(3, searcher.search(snqquery, 10).totalHits); |
| |
| // [p=NN & !(m:c = acc)] |
| snqquery = new SpanNotQuery( |
| new SpanTermQuery( new Term("text", "p:NN")), |
| new SpanTermQuery( new Term("text", "m:c:acc" )) |
| ); |
| assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| |
| // [p=PPER][][p=ART] |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery( new Term("text", "p:PPER")), |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery( new Term("text", "T")), |
| new SpanTermQuery( new Term("text", "p:ART")) |
| }, |
| 0, |
| true), |
| }, |
| 0, |
| true |
| ); |
| assertEquals(1, searcher.search(snquery, 10).totalHits); |
| |
| |
| // Todo: |
| // [orth=się][]{2,4}[base=bać] |
| // [orth=się][orth!="[.!?,:]"]{,5}[base=bać]|[base=bać][base="on|ja|ty|my|wy"]?[orth=się] |
| // [pos=subst & orth="a.*"]{2} |
| // [tag=subst:sg:nom:n] |
| // [case==acc & case==gen] ?? |
| // [case~acc & case~gen] |
| // [case~~acc] |
| // [base=bać][orth!=się]+[orth=się] within s |
| |
| // [][][p:VAFIN] within s |
| // [][p:VAFIN] within s |
| |
| |
| // [][][p:VAFIN] |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery( new Term("text", "T") ), |
| new SpanTermQuery( new Term("text", "T") ) |
| }, |
| 0, |
| true |
| ), |
| new SpanTermQuery( new Term("text", "p:VAFIN") ) |
| }, |
| 0, |
| true |
| ); |
| assertEquals(1, searcher.search(snquery, 10).totalHits); |
| |
| /* |
| http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene |
| */ |
| |
| StringBuilder payloadString = new StringBuilder(); |
| Map<Term, TermContext> termContexts = new HashMap<>(); |
| for (AtomicReaderContext atomic : reader.leaves()) { |
| Bits bitset = atomic.reader().getLiveDocs(); |
| // Spans spans = NearSpansOrdered(); |
| Spans spans = snquery.getSpans(atomic, bitset, termContexts); |
| |
| while (spans.next()) { |
| int docid = atomic.docBase + spans.doc(); |
| if (spans.isPayloadAvailable()) { |
| for (byte[] payload : spans.getPayload()) { |
| /* retrieve payload for current matching span */ |
| payloadString.append(new String(payload)); |
| payloadString.append(" | "); |
| }; |
| }; |
| }; |
| }; |
| // assertEquals(33, payloadString.length()); |
| assertEquals(0, payloadString.length()); |
| |
| |
| |
| // [][][p:VAFIN] |
| // without collecting payloads |
| snquery = new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanNearQuery( |
| new SpanQuery[] { |
| new SpanTermQuery( new Term("text", "T") ), |
| new SpanTermQuery( new Term("text", "T") ) |
| }, |
| 0, |
| true, |
| false |
| ), |
| new SpanTermQuery( new Term("text", "p:VAFIN") ) |
| }, |
| 0, |
| true, |
| false |
| ); |
| assertEquals(1, searcher.search(snquery, 10).totalHits); |
| |
| payloadString = new StringBuilder(); |
| termContexts = new HashMap<>(); |
| for (AtomicReaderContext atomic : reader.leaves()) { |
| Bits bitset = atomic.reader().getLiveDocs(); |
| // Spans spans = NearSpansOrdered(); |
| Spans spans = snquery.getSpans(atomic, bitset, termContexts); |
| |
| while (spans.next()) { |
| int docid = atomic.docBase + spans.doc(); |
| for (byte[] payload : spans.getPayload()) { |
| /* retrieve payload for current matching span */ |
| payloadString.append(new String(payload)); |
| payloadString.append(" | "); |
| }; |
| }; |
| }; |
| assertEquals(0, payloadString.length()); |
| |
| |
| // [][][p:VAFIN] in s |
| //([e:s:<][]*[T] | [T & e:s:<]) [T] ([p:VAFIN & e:s:>] | [T][]*[e:s:>] |
| |
| /* |
| SpanSegmentWithinQuery ssequery = new SpanSegmentWithinQuery( |
| "text","s", new SpanSegmentSequenceQuery("text", "T", "T", "p:VAFIN") |
| ); |
| assertEquals(0, searcher.search(ssequery.toQuery(), 10).totalHits); |
| |
| payloadString = new StringBuilder(); |
| termContexts = new HashMap<>(); |
| for (AtomicReaderContext atomic : reader.leaves()) { |
| Bits bitset = atomic.reader().getLiveDocs(); |
| // Spans spans = NearSpansOrdered(); |
| Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts); |
| |
| while (spans.next()) { |
| int docid = atomic.docBase + spans.doc(); |
| for (byte[] payload : spans.getPayload()) { |
| /// retrieve payload for current matching span |
| payloadString.append(new String(payload)); |
| payloadString.append(" | "); |
| }; |
| }; |
| }; |
| assertEquals(0, payloadString.length(), 1); |
| |
| ssequery = new SpanSegmentWithinQuery( |
| "text","s", new SpanSegmentSequenceQuery("text", "T", "p:VAFIN") |
| ); |
| |
| assertEquals("for " + ssequery.toQuery(), |
| 1, searcher.search(ssequery.toQuery(), 10).totalHits); |
| |
| payloadString = new StringBuilder(); |
| termContexts = new HashMap<>(); |
| for (AtomicReaderContext atomic : reader.leaves()) { |
| Bits bitset = atomic.reader().getLiveDocs(); |
| // Spans spans = NearSpansOrdered(); |
| Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts); |
| |
| while (spans.next()) { |
| int docid = atomic.docBase + spans.doc(); |
| for (byte[] payload : spans.getPayload()) { |
| // retrieve payload for current matching span |
| payloadString.append(new String(payload)); |
| payloadString.append(" | "); |
| }; |
| fail("Doc: " + docid + " with " + spans.start() + "-" + spans.end() + " || " + payloadString.toString()); |
| }; |
| }; |
| assertEquals(20, payloadString.length()); |
| |
| */ |
| |
| // --------------------______> |
| |
| |
| |
| // Spans spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), ssequery.toQuery()); |
| /* |
| TopDocs topDocs = is.search(snq, 1); |
| Set<String> payloadSet = new HashSet<String>(); |
| for (int i = 0; i < topDocs.scoreDocs.length; i++) { |
| while (spans.next()) { |
| Collection<byte[]> payloads = spans.getPayload(); |
| |
| for (final byte [] payload : payloads) { |
| payloadSet.add(new String(payload, "UTF-8")); |
| } |
| } |
| } |
| */ |
| |
| |
| /* |
| Alternativ: |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| IndexSearcher searcher = newSearcher(reader); |
| |
| PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext()); |
| |
| Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr"))); |
| if(VERBOSE) |
| System.out.println("Num payloads:" + payloads.size()); |
| for (final byte [] bytes : payloads) { |
| if(VERBOSE) |
| System.out.println(new String(bytes, "UTF-8")); |
| } |
| */ |
| |
| |
| |
| /* new: */ |
| |
| // PayloadHelper helper = new PayloadHelper(); |
| |
| // Map<Term, TermContext> termContexts = new HashMap<>(); |
| //Spans spans; |
| //spans = snquery.getSpans(searcher.getIndexReader()); |
| // searcher = helper.setUp(similarity, 1000); |
| /* |
| IndexReader reader = search.getReader(querycontainer.getFoundry()); |
| Spans luceneSpans; |
| Bits bitset = atomic.reader().getLiveDocs(); |
| for (byte[] payload : luceneSpans.getPayload()) |
| |
| /* Iterate over all matching documents */ |
| /* |
| while (luceneSpans.next() && total < config.getMaxhits()) { |
| Span matchSpan; |
| StringBuilder payloadString = new StringBuilder(); |
| int docid = atomic.docBase + luceneSpans.doc(); |
| String docname = search.retrieveDocname(docid, |
| querycontainer.getFoundry()); |
| total++; |
| |
| for (byte[] payload : luceneSpans.getPayload()) |
| */ |
| /* retrieve payload for current matching span */ |
| // payloadString.append(new String(payload)); |
| |
| /* create span containing result */ |
| /* |
| matchSpan = new Span(docname); |
| matchSpan.setIndexdocid(docid); |
| matchSpan.setLayer(querycontainer.getLayer()); |
| matchSpan.storePayloads(payloadString.toString()); |
| matchSpans.add(matchSpan); |
| */ |
| /* |
| * topdocs = searcher.search(new ConstantScoreQuery(corpusQ add |
| * position to list of positions to be considered for later |
| * searches |
| */ |
| /* |
| validValues.put(docname, |
| matchSpan.getPayload(config.getPrefix())); |
| } |
| */ |
| |
| |
| // Todo: API made by add() typisiert für queries, strings |
| |
| // SpanPayloadCheckQuery for sentences! |
| |
| /* Support regular expression in SpanSegmentQuery */ |
| // new Regexp(); |
| // new Term(); |
| |
| /* |
| Vielleicht: spanSegmentQuery(new Term(), new Wildcard(), new Regex()); |
| */ |
| |
| // And Not -> |
| // SpanTermDiffQuery |
| |
| /* |
| SpanNearQuery poquery = new SpanNearQuery( |
| |
| ); |
| */ |
| |
| reader.close(); |
| |
| |
| }; |
| }; |