| Eliza Margaretha | 0192918 | 2014-02-19 11:48:59 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.index; |
| 2 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 3 | import java.util.*; |
| 4 | import java.io.*; |
| 5 | |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 6 | import de.ids_mannheim.korap.index.MultiTerm; |
| Nils Diewald | e4986d7 | 2015-02-27 17:35:00 +0000 | [diff] [blame] | 7 | import de.ids_mannheim.korap.index.MultiTermToken; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 8 | import de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper; |
| 9 | import de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper; |
| 10 | import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper; |
| 11 | import de.ids_mannheim.korap.query.SpanWithinQuery; |
| 12 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 13 | import de.ids_mannheim.korap.util.CorpusDataException; |
| 14 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 15 | import static de.ids_mannheim.korap.Test.*; |
| 16 | |
| 17 | import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| 18 | import org.apache.lucene.analysis.TokenFilter; |
| 19 | import org.apache.lucene.analysis.TokenStream; |
| 20 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| 21 | |
| 22 | import org.apache.lucene.index.Term; |
| 23 | import org.apache.lucene.index.TermsEnum; |
| 24 | import org.apache.lucene.index.TermContext; |
| 25 | |
| 26 | import org.apache.lucene.index.DocsAndPositionsEnum; |
| 27 | import org.apache.lucene.index.DirectoryReader; |
| 28 | import org.apache.lucene.index.IndexWriter; |
| 29 | import org.apache.lucene.index.IndexWriterConfig; |
| 30 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 31 | import org.apache.lucene.index.LeafReaderContext; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 32 | |
| 33 | import org.apache.lucene.queryparser.classic.ParseException; |
| 34 | import org.apache.lucene.queryparser.classic.QueryParser; |
| 35 | |
| 36 | import org.apache.lucene.search.IndexSearcher; |
| 37 | import org.apache.lucene.search.Query; |
| 38 | import org.apache.lucene.search.TermQuery; |
| 39 | import org.apache.lucene.search.BooleanClause; |
| 40 | import org.apache.lucene.search.BooleanQuery; |
| 41 | import org.apache.lucene.search.PhraseQuery; |
| 42 | import org.apache.lucene.search.NumericRangeQuery; |
| 43 | import org.apache.lucene.search.spans.Spans; |
| 44 | import org.apache.lucene.search.spans.SpanQuery; |
| 45 | import org.apache.lucene.search.spans.SpanOrQuery; |
| 46 | import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; |
| 47 | import org.apache.lucene.search.spans.SpanTermQuery; |
| 48 | import org.apache.lucene.search.spans.SpanNearQuery; |
| 49 | import org.apache.lucene.search.spans.SpanNotQuery; |
| 50 | import org.apache.lucene.search.spans.NearSpansOrdered; |
| 51 | import org.apache.lucene.search.WildcardQuery; |
| 52 | import org.apache.lucene.search.ScoreDoc; |
| 53 | import org.apache.lucene.search.TopScoreDocCollector; |
| 54 | import org.apache.lucene.search.TopDocs; |
| 55 | import org.apache.lucene.search.RegexpQuery; |
| 56 | |
| 57 | import org.apache.lucene.store.Directory; |
| 58 | import org.apache.lucene.store.RAMDirectory; |
| 59 | import org.apache.lucene.store.SimpleFSDirectory; // temporary |
| 60 | |
| 61 | import org.apache.lucene.util.Version; |
| 62 | import org.apache.lucene.util.BytesRef; |
| 63 | import org.apache.lucene.util.Bits; |
| 64 | |
| 65 | import static org.junit.Assert.*; |
| 66 | import org.junit.Test; |
| 67 | import org.junit.Ignore; |
| 68 | import org.junit.runner.RunWith; |
| 69 | import org.junit.runners.JUnit4; |
| 70 | |
| 71 | @RunWith(JUnit4.class) |
| 72 | public class TestIndex { // extends LuceneTestCase { |
| 73 | // Create index in RAM |
| 74 | // private Directory index = new RAMDirectory(); |
| 75 | |
| 76 | private Directory index = new RAMDirectory(); |
| 77 | |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 78 | |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 79 | @Test |
| 80 | public void multiTerm () throws CorpusDataException { |
| 81 | MultiTerm test = new MultiTerm("test"); |
| 82 | assertEquals(test.getTerm(), "test"); |
| 83 | assertEquals(test.getPayload(), null); |
| 84 | assertEquals(test.getStart(), 0); |
| 85 | assertEquals(test.getEnd(), 0); |
| 86 | assertFalse(test.hasStoredOffsets()); |
| 87 | assertEquals(test.toString(), "test"); |
| 88 | |
| 89 | test = new MultiTerm("test#0-4"); |
| 90 | assertEquals(test.getTerm(), "test"); |
| 91 | assertEquals(test.getPayload(), null); |
| 92 | assertEquals(test.getStart(), 0); |
| 93 | assertEquals(test.getEnd(), 4); |
| 94 | assertFalse(test.hasStoredOffsets()); |
| 95 | assertEquals(test.toString(), "test#0-4"); |
| 96 | |
| 97 | test = new MultiTerm("<>:s:test#0-4$<i>67"); |
| 98 | assertEquals(test.getTerm(), "<>:s:test"); |
| 99 | assertEquals(test.getPayload().toString(), "[0 0 0 43]"); |
| 100 | assertEquals(test.getStart(), 0); |
| 101 | assertEquals(test.getEnd(), 4); |
| 102 | assertFalse(test.hasStoredOffsets()); |
| 103 | assertTrue(test.toString().startsWith("<>:s:test#0-4$")); |
| 104 | |
| 105 | test = new MultiTerm("xip/l:\\#normal#0-5$<i>3999"); |
| 106 | assertEquals(test.getTerm(), "xip/l:#normal"); |
| 107 | assertEquals(test.getPayload().toString(), "[0 0 f 9f]"); |
| 108 | assertEquals(test.getStart(), 0); |
| 109 | assertEquals(test.getEnd(), 5); |
| 110 | assertFalse(test.hasStoredOffsets()); |
| 111 | assertTrue(test.toString().startsWith("xip/l:\\#normal#0-5$")); |
| 112 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 113 | |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 114 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 115 | @Test |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 116 | public void multiTermToken () throws CorpusDataException { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 117 | MultiTermToken test = new MultiTermToken("hunde", "pos:n", "m:gen:pl"); |
| 118 | assertEquals(test.terms.get(0).term, "hunde"); |
| 119 | assertEquals(test.terms.get(1).term, "pos:n"); |
| 120 | assertEquals(test.terms.get(2).term, "m:gen:pl"); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 121 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 122 | test = new MultiTermToken("hunde", "pos:n", "m:gen:pl"); |
| 123 | assertEquals(test.terms.get(0).term, "hunde"); |
| 124 | assertEquals(test.terms.get(1).term, "pos:n"); |
| 125 | assertEquals(test.terms.get(2).term, "m:gen:pl"); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 126 | }; |
| 127 | |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 128 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 129 | private List initIndexer () throws IOException { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 130 | List<Map<String, String>> list = new ArrayList<>(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 131 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 132 | Map<String, String> d1 = new HashMap<String, String>(); |
| 133 | d1.put("id", "w1"); |
| 134 | d1.put("corpus", "wiki"); |
| 135 | d1.put("author", "Nils Diewald"); |
| 136 | d1.put("title", "Wikipedia"); |
| 137 | d1.put("subtitle", "A test"); |
| 138 | d1.put("pubDate", "20130701"); |
| 139 | d1.put("pubPlace", "Mannheim"); |
| 140 | d1.put("textClass", "news sports"); |
| 141 | d1.put("textStr", "Er nahm den Hunden die Angst."); |
| 142 | d1.put("text", "Er#0-2|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#0-29$<i>7 " |
| 143 | + "nahm#3-7|VVFIN|nehmen|p:3;n:sg;t:past;m:ind| " |
| 144 | + "den#8-11|ART|der|c:acc;n:sg;g:masc| " |
| 145 | + "Hunden#12-18|NN|hund|c:acc;n:sg;g:masc| " |
| 146 | + "die#19-22|ART|der|c:nom;n:sg;g:fem| " |
| 147 | + "Angst#23-28|NN|angst|c:nom;n:sg;g:fem| " + ".#28-29|$.|.||"); |
| 148 | list.add(d1); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 149 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 150 | Map<String, String> d2 = new HashMap<String, String>(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 151 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 152 | d2.put("id", "w2"); |
| 153 | d2.put("corpus", "wiki"); |
| 154 | d2.put("author", "Peter Thomas"); |
| 155 | d2.put("title", "Waldartikel"); |
| 156 | d2.put("subtitle", "Another test"); |
| 157 | d2.put("pubDate", "20130723"); |
| 158 | d2.put("pubPlace", "Bielefeld"); |
| 159 | d2.put("textClass", "news"); |
| 160 | d2.put("textStr", "Sie liefen durch den Wald."); |
| 161 | d2.put("text", "Sie#0-3|PPER|sie|c:nom;p:3;n:pl;g:all|<>:s#0-26$<i>6 " |
| 162 | + "liefen#4-10|VVFIN|laufen|p:3;n:pl;t:past;m:ind| " |
| 163 | + "durch#11-16|APPR|durch|| " |
| 164 | + "den#17-20|ART|der|c:acc;n:sg;g:masc| " |
| 165 | + "Wald#21-25|NN|wald|c:acc;n:sg;g:masc| " + ".#25-26|$.|.||"); |
| 166 | list.add(d2); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 167 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 168 | Map<String, String> d3 = new HashMap<String, String>(); |
| 169 | d3.put("id", "w3"); |
| 170 | d3.put("corpus", "zeitung"); |
| 171 | d3.put("author", "Michael Meier"); |
| 172 | d3.put("title", "Angst"); |
| 173 | d3.put("subtitle", "Starr vor Angst"); |
| 174 | d3.put("pubDate", "20130713"); |
| 175 | d3.put("pubPlace", "Bielefeld"); |
| 176 | d3.put("textClass", "sports"); |
| 177 | d3.put("textStr", |
| 178 | "Er wagte nicht, sich zu ruehren. Er war starr vor Angst."); |
| 179 | d3.put("text", "Er#0-2|PPER|er|c:nom;n:sg;g:masc;p:3|<>:s#0-32$<i>8 " |
| 180 | + "wagte#3-8|VVFIN|wagen|p:3;n:sg;t:past;m:ind| " |
| 181 | + "nicht#9-14|PTKNEG|nicht|| " + ",#14-15|$,|,|| " |
| 182 | + "sich#16-20|PRF|sich|c:acc;p:3;n:sg| " |
| 183 | + "zu#21-23|PTKZU|zu|| " + "ruehren#24-31|VVFIN|ruehren|| " |
| 184 | + ".#31-32|$.|.|| " |
| 185 | + "Er#33-35|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#33-56$<i>14 " |
| 186 | + "war#36-39|VAFIN|sein|p:3;n:sg;t:past;m:ind| " |
| 187 | + "starr#40-45|ADJD|starr|comp:pos| " + "vor#46-49|APPR|vor|| " |
| 188 | + "Angst#50-55|NN|angst|c:dat;n:sg;g:fem| " + ".#55-56|$.|.||"); |
| 189 | list.add(d3); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 190 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 191 | return list; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 192 | }; |
| 193 | |
| 194 | @Test |
| Nils Diewald | be5943e | 2014-10-21 19:35:34 +0000 | [diff] [blame] | 195 | public void indexLucene () throws Exception { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 196 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 197 | // Base analyzer for searching and indexing |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 198 | StandardAnalyzer analyzer = new StandardAnalyzer(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 199 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 200 | // Based on |
| 201 | // http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/ |
| 202 | // analysis/Analyzer.html?is-external=true |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 203 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 204 | // Create configuration with base analyzer |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 205 | IndexWriterConfig config = new IndexWriterConfig(analyzer); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 206 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 207 | // Add a document 1 with the correct fields |
| 208 | IndexWriter w = new IndexWriter(index, config); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 209 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 210 | Collection docs = initIndexer(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 211 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 212 | @SuppressWarnings("unchecked") |
| 213 | Iterator<Map<String, String>> i = (Iterator<Map<String, String>>) docs |
| 214 | .iterator(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 215 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 216 | for (; i.hasNext();) { |
| 217 | addDoc(w, i.next()); |
| 218 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 219 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 220 | assertEquals(3, w.numDocs()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 221 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 222 | w.close(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 223 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 224 | // Check directory |
| 225 | DirectoryReader reader = DirectoryReader.open(index); |
| 226 | assertEquals(docs.size(), reader.maxDoc()); |
| 227 | assertEquals(docs.size(), reader.numDocs()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 228 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 229 | // Check searcher |
| 230 | IndexSearcher searcher = new IndexSearcher(reader); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 231 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 232 | // textClass |
| 233 | // All texts of text class "news" |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 234 | assertEquals(2, |
| 235 | searcher.search(new TermQuery(new Term("textClass", "news")), |
| 236 | 10).totalHits); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 237 | |
| 238 | // textClass |
| 239 | // All texts of text class "sports" |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 240 | assertEquals(2, |
| 241 | searcher.search(new TermQuery(new Term("textClass", "sports")), |
| 242 | 10).totalHits); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 243 | |
| 244 | // TextIndex |
| 245 | // All docs containing "l:nehmen" |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 246 | assertEquals(1, |
| 247 | searcher.search(new TermQuery(new Term("text", "l:nehmen")), |
| 248 | 10).totalHits); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 249 | |
| 250 | // TextIndex |
| 251 | // All docs containing "s:den" |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 252 | assertEquals(2, |
| 253 | searcher.search(new TermQuery(new Term("text", "s:den")), |
| 254 | 10).totalHits); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 255 | |
| 256 | /* |
| 257 | assertEquals(3, |
| 258 | searcher.search( |
| 259 | new TermQuery( |
| 260 | new Term("text", "T") |
| 261 | ), 10 |
| 262 | ).totalHits |
| 263 | ); |
| 264 | */ |
| 265 | |
| 266 | // BooleanQuery |
| 267 | // All docs containing "s:den" and "l:sie" |
| 268 | TermQuery s_den = new TermQuery(new Term("text", "s:den")); |
| 269 | TermQuery l_sie = new TermQuery(new Term("text", "l:sie")); |
| 270 | BooleanQuery bool = new BooleanQuery(); |
| 271 | bool.add(s_den, BooleanClause.Occur.MUST); |
| 272 | bool.add(l_sie, BooleanClause.Occur.MUST); |
| 273 | |
| 274 | assertEquals(1, searcher.search(bool, 10).totalHits); |
| 275 | |
| 276 | // BooleanQuery |
| 277 | // All docs containing "s:den" or "l:sie" |
| 278 | bool = new BooleanQuery(); |
| 279 | bool.add(s_den, BooleanClause.Occur.SHOULD); |
| 280 | bool.add(l_sie, BooleanClause.Occur.SHOULD); |
| 281 | assertEquals(2, searcher.search(bool, 10).totalHits); |
| 282 | |
| 283 | |
| 284 | // RegexpQuery |
| 285 | // All docs containing ".{4}en" (liefen und Hunden) |
| 286 | RegexpQuery srquery = new RegexpQuery(new Term("text", "s:.{4}en")); |
| 287 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 288 | |
| 289 | // RegexpQuery |
| 290 | // All docs containing "E." (Er) (2x) |
| 291 | srquery = new RegexpQuery(new Term("text", "s:E.")); |
| 292 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 293 | |
| 294 | SpanRegexQueryWrapper ssrquery = new SpanRegexQueryWrapper("text", |
| 295 | "s:E."); |
| 296 | assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| 297 | |
| 298 | |
| 299 | // RegexpQuery |
| 300 | // All docs containing "E." (er) (0x) |
| 301 | srquery = new RegexpQuery(new Term("text", "s:e.")); |
| 302 | assertEquals(0, searcher.search(srquery, 10).totalHits); |
| 303 | |
| 304 | ssrquery = new SpanRegexQueryWrapper("text", "s:e."); |
| 305 | assertEquals(0, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| 306 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 307 | // RegexpQuery |
| 308 | // All docs containing "E."/i ([Ee]r) (2x) |
| 309 | srquery = new RegexpQuery(new Term("text", "i:e.")); |
| 310 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 311 | |
| 312 | ssrquery = new SpanRegexQueryWrapper("text", "s:e.", true); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 313 | assertEquals("SpanMultiTermQueryWrapper(text:/i:e./)", |
| 314 | ssrquery.toQuery().toString()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 315 | assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| 316 | |
| 317 | // All docs containing "ng"/x (Angst) (2x) |
| 318 | srquery = new RegexpQuery(new Term("text", "s:.*ng.*")); |
| 319 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 320 | |
| Akron | 34f73da | 2017-08-09 13:33:41 +0200 | [diff] [blame] | 321 | |
| 322 | // Check http://comments.gmane.org/gmane.comp.jakarta.lucene.user/52283 |
| 323 | // for Carstens question on wildcards |
| 324 | // Wildcardquery |
| 325 | // All docs containing "*ng*" (Angst) (2x) |
| 326 | WildcardQuery swquery = new WildcardQuery(new Term("text", "s:*ng*")); |
| 327 | assertEquals("text:s:*ng*", swquery.toString()); |
| 328 | assertEquals(2, searcher.search(swquery, 10).totalHits); |
| 329 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 330 | // [base=angst] |
| 331 | SpanTermQuery stq = new SpanTermQuery(new Term("text", "l:angst")); |
| 332 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 333 | |
| 334 | // vor Angst |
| 335 | // [orth=vor][orth=Angst] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 336 | SpanNearQuery snquery = new SpanNearQuery( |
| 337 | new SpanQuery[] { new SpanTermQuery(new Term("text", "s:vor")), |
| 338 | new SpanTermQuery(new Term("text", "s:Angst")) }, |
| 339 | 1, true); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 340 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 341 | |
| 342 | // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 343 | snquery = new SpanNearQuery( |
| 344 | new SpanQuery[] { |
| 345 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 346 | new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", |
| 347 | "m:g:fem").toQuery() }, |
| 348 | 5, // slop |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 349 | true // inOrder |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 350 | // Possible: CollectPayloads |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 351 | ); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 352 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 353 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 354 | |
| 355 | // Spannearquery [p:VVFIN][m:acc:sg:masc] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 356 | snquery = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery( |
| 357 | new Term("text", "p:VVFIN")), |
| 358 | new SpanNearQuery( |
| 359 | new SpanQuery[] { |
| 360 | new SpanTermQuery(new Term("text", "m:c:acc")), |
| 361 | new SpanNearQuery( |
| 362 | new SpanQuery[] { |
| 363 | new SpanTermQuery(new Term( |
| 364 | "text", "m:n:sg")), |
| 365 | new SpanTermQuery(new Term( |
| 366 | "text", "m:g:masc")) }, |
| 367 | -1, false) }, |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 368 | -1, // slop |
| 369 | false // inOrder |
| 370 | // Possible: CollectPayloads |
| 371 | ) |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 372 | // new SpanTermQuery(new Term("text", "m:-acc:--sg:masc")) |
| 373 | }, 0, // slop |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 374 | true // inOrder |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 375 | // Possible: CollectPayloads |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 376 | ); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 377 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 378 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 379 | |
| 380 | // Spannearquery [p:VVFIN|m:3:sg:past:ind] |
| 381 | // Exact match! |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 382 | snquery = new SpanNearQuery( |
| 383 | new SpanQuery[] { |
| 384 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 385 | new SpanNearQuery(new SpanQuery[] { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 386 | new SpanTermQuery(new Term("text", "m:p:3")), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 387 | new SpanNearQuery(new SpanQuery[] { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 388 | new SpanTermQuery( |
| 389 | new Term("text", "m:n:sg")), |
| 390 | new SpanNearQuery( |
| 391 | new SpanQuery[] { |
| 392 | new SpanTermQuery( |
| 393 | new Term("text", |
| 394 | "m:t:past")), |
| 395 | new SpanTermQuery( |
| 396 | new Term("text", |
| 397 | "m:m:ind")), }, |
| 398 | -1, false) }, |
| 399 | -1, false) }, |
| 400 | -1, false) }, |
| 401 | // new SpanTermQuery(new Term("text", "m:---3:--sg:past:-ind")) |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 402 | -1, // slop |
| 403 | false // inOrder |
| 404 | // Possible: CollectPayloads |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 405 | ); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 406 | assertEquals(2, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 407 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 408 | // To make sure, this is not equal: |
| 409 | // Spannearquery [p:VVFIN & m:3:sg:past:ind] |
| 410 | // Exact match! |
| 411 | // Maybe it IS equal |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 412 | snquery = new SpanNearQuery( |
| 413 | new SpanQuery[] { |
| 414 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 415 | new SpanTermQuery(new Term("text", "m:p:3")), |
| 416 | new SpanTermQuery(new Term("text", "m:n:sg")), |
| 417 | new SpanTermQuery(new Term("text", "m:t:past")), |
| 418 | new SpanTermQuery(new Term("text", "m:m:ind")), }, |
| 419 | -1, // slop |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 420 | false // inOrder |
| 421 | // Possible: CollectPayloads |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 422 | ); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 423 | assertNotEquals(2, searcher.search(snquery, 10).totalHits); |
| 424 | // assertEquals(2, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 425 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 426 | // Spannearquery [p:VVFIN & m:3:sg & past:ind] |
| 427 | SpanSegmentQueryWrapper sniquery = new SpanSegmentQueryWrapper("text", |
| 428 | "p:VVFIN", "m:p:3", "m:n:sg", "m:t:past", "m:m:ind"); |
| 429 | assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits); |
| 430 | |
| 431 | |
| 432 | // Todo: |
| 433 | |
| 434 | /* |
| 435 | sniquery = new SpanSegmentQuery( |
| 436 | "text", |
| 437 | "p:VVFIN", |
| 438 | "m:p:3", |
| 439 | "m:n:sg", |
| 440 | "m:t:past", |
| 441 | "m:m:ind" |
| 442 | ); |
| 443 | */ |
| 444 | |
| 445 | // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 446 | snquery = new SpanNearQuery( |
| 447 | new SpanQuery[] { |
| 448 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 449 | new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", |
| 450 | "m:g:fem").toQuery() }, |
| 451 | 5, // slop |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 452 | true // inOrder |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 453 | // Possible: CollectPayloads |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 454 | ); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 455 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 456 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 457 | sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", |
| 458 | "m:t:past", "m:m:ind", "m:n:sg"); |
| 459 | assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 460 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 461 | // [p = VVFIN & m:p = 3 & m:t = past & m:n != pl] or |
| 462 | // [p = VVFIN & m:p = 3 & m:t = past & !m:n = pl] |
| 463 | // TODO: Problem: What should happen in case the category does not exist? |
| 464 | // possible solution: & ( m:n != pl & exists(m:n)) |
| 465 | sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", |
| 466 | "m:t:past"); |
| 467 | SpanQuery snqquery = new SpanNotQuery(sniquery.toQuery(), |
| 468 | new SpanTermQuery(new Term("text", "m:n:pl"))); |
| 469 | assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 470 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 471 | // [p = NN & (m:c: = dat | m:c = acc)] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 472 | snquery = new SpanNearQuery( |
| 473 | new SpanQuery[] { new SpanTermQuery(new Term("text", "p:NN")), |
| 474 | new SpanOrQuery( |
| 475 | new SpanTermQuery(new Term("text", "m:c:nom")), |
| 476 | new SpanTermQuery( |
| 477 | new Term("text", "m:c:acc"))) }, |
| 478 | -1, false); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 479 | |
| 480 | assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| 481 | |
| 482 | // [p = NN & !(m:c: = nom | m:c = acc)] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 483 | snqquery = new SpanNotQuery(new SpanTermQuery(new Term("text", "p:NN")), |
| 484 | new SpanOrQuery(new SpanTermQuery(new Term("text", "m:c:nom")), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 485 | new SpanTermQuery(new Term("text", "m:c:acc")))); |
| 486 | assertEquals(1, searcher.search(snqquery, 10).totalHits); |
| 487 | |
| 488 | // [p = NN & !(m:c = nom)] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 489 | snqquery = new SpanNotQuery(new SpanTermQuery(new Term("text", "p:NN")), |
| 490 | new SpanTermQuery(new Term("text", "m:c:nom"))); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 491 | assertEquals(3, searcher.search(snqquery, 10).totalHits); |
| 492 | |
| 493 | // [p=NN & !(m:c = acc)] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 494 | snqquery = new SpanNotQuery(new SpanTermQuery(new Term("text", "p:NN")), |
| 495 | new SpanTermQuery(new Term("text", "m:c:acc"))); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 496 | assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| 497 | |
| 498 | // [p=PPER][][p=ART] |
| 499 | snquery = new SpanNearQuery( |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 500 | new SpanQuery[] { new SpanTermQuery(new Term("text", "p:PPER")), |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 501 | new SpanNearQuery(new SpanQuery[] { |
| 502 | new SpanTermQuery(new Term("text", "T")), |
| 503 | new SpanTermQuery(new Term("text", "p:ART")) }, |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 504 | 0, true), }, |
| 505 | 0, true); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 506 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 507 | |
| 508 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 509 | // Todo: |
| 510 | // [orth=się][]{2,4}[base=bać] |
| 511 | // [orth=się][orth!="[.!?,:]"]{,5}[base=bać]|[base=bać][base="on|ja|ty|my|wy"]?[orth=się] |
| 512 | // [pos=subst & orth="a.*"]{2} |
| 513 | // [tag=subst:sg:nom:n] |
| 514 | // [case==acc & case==gen] ?? |
| 515 | // [case~acc & case~gen] |
| 516 | // [case~~acc] |
| 517 | // [base=bać][orth!=się]+[orth=się] within s |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 518 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 519 | // [][][p:VAFIN] within s |
| 520 | // [][p:VAFIN] within s |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 521 | |
| 522 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 523 | // [][][p:VAFIN] |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 524 | snquery = new SpanNearQuery( |
| 525 | new SpanQuery[] { |
| 526 | new SpanNearQuery(new SpanQuery[] { |
| 527 | new SpanTermQuery(new Term("text", "T")), |
| 528 | new SpanTermQuery(new Term("text", "T")) }, 0, |
| 529 | true), |
| 530 | new SpanTermQuery(new Term("text", "p:VAFIN")) }, |
| 531 | 0, true); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 532 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 533 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 534 | /* |
| 535 | http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene |
| 536 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 537 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 538 | StringBuilder payloadString = new StringBuilder(); |
| 539 | Map<Term, TermContext> termContexts = new HashMap<>(); |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 540 | for (LeafReaderContext atomic : reader.leaves()) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 541 | Bits bitset = atomic.reader().getLiveDocs(); |
| 542 | // Spans spans = NearSpansOrdered(); |
| 543 | Spans spans = snquery.getSpans(atomic, bitset, termContexts); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 544 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 545 | while (spans.next()) { |
| 546 | int docid = atomic.docBase + spans.doc(); |
| 547 | if (spans.isPayloadAvailable()) { |
| 548 | for (byte[] payload : spans.getPayload()) { |
| 549 | /* retrieve payload for current matching span */ |
| 550 | payloadString.append(new String(payload)); |
| 551 | payloadString.append(" | "); |
| 552 | }; |
| 553 | }; |
| 554 | }; |
| 555 | }; |
| 556 | // assertEquals(33, payloadString.length()); |
| 557 | assertEquals(0, payloadString.length()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 558 | |
| 559 | |
| 560 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 561 | // [][][p:VAFIN] |
| 562 | // without collecting payloads |
| 563 | snquery = new SpanNearQuery( |
| 564 | new SpanQuery[] { |
| 565 | new SpanNearQuery(new SpanQuery[] { |
| 566 | new SpanTermQuery(new Term("text", "T")), |
| 567 | new SpanTermQuery(new Term("text", "T")) }, 0, |
| 568 | true, false), |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 569 | new SpanTermQuery(new Term("text", "p:VAFIN")) }, |
| 570 | 0, true, false); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 571 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 572 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 573 | payloadString = new StringBuilder(); |
| 574 | termContexts = new HashMap<>(); |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 575 | for (LeafReaderContext atomic : reader.leaves()) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 576 | Bits bitset = atomic.reader().getLiveDocs(); |
| 577 | // Spans spans = NearSpansOrdered(); |
| 578 | Spans spans = snquery.getSpans(atomic, bitset, termContexts); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 579 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 580 | while (spans.next()) { |
| 581 | int docid = atomic.docBase + spans.doc(); |
| 582 | for (byte[] payload : spans.getPayload()) { |
| 583 | /* retrieve payload for current matching span */ |
| 584 | payloadString.append(new String(payload)); |
| 585 | payloadString.append(" | "); |
| 586 | }; |
| 587 | }; |
| 588 | }; |
| 589 | assertEquals(0, payloadString.length()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 590 | |
| 591 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 592 | // [][][p:VAFIN] in s |
| 593 | // ([e:s:<][]*[T] | [T & e:s:<]) [T] ([p:VAFIN & e:s:>] | [T][]*[e:s:>]) |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 594 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 595 | /* |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 596 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 597 | SpanSegmentWithinQuery ssequery = new SpanSegmentWithinQuery( |
| 598 | "text","s", new SpanSegmentSequenceQuery("text", "T", "T", "p:VAFIN") |
| 599 | ); |
| 600 | assertEquals(0, searcher.search(ssequery.toQuery(), 10).totalHits); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 601 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 602 | payloadString = new StringBuilder(); |
| 603 | termContexts = new HashMap<>(); |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 604 | for (LeafReaderContext atomic : reader.leaves()) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 605 | Bits bitset = atomic.reader().getLiveDocs(); |
| 606 | // Spans spans = NearSpansOrdered(); |
| 607 | Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 608 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 609 | while (spans.next()) { |
| 610 | int docid = atomic.docBase + spans.doc(); |
| 611 | for (byte[] payload : spans.getPayload()) { |
| 612 | /// retrieve payload for current matching span |
| 613 | payloadString.append(new String(payload)); |
| 614 | payloadString.append(" | "); |
| 615 | }; |
| 616 | }; |
| 617 | }; |
| 618 | assertEquals(0, payloadString.length(), 1); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 619 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 620 | ssequery = new SpanSegmentWithinQuery( |
| 621 | "text","s", new SpanSegmentSequenceQuery("text", "T", "p:VAFIN") |
| 622 | ); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 623 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 624 | assertEquals("for " + ssequery.toQuery(), |
| 625 | 1, searcher.search(ssequery.toQuery(), 10).totalHits); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 626 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 627 | payloadString = new StringBuilder(); |
| 628 | termContexts = new HashMap<>(); |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 629 | for (LeafReaderContext atomic : reader.leaves()) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 630 | Bits bitset = atomic.reader().getLiveDocs(); |
| 631 | // Spans spans = NearSpansOrdered(); |
| 632 | Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 633 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 634 | while (spans.next()) { |
| 635 | int docid = atomic.docBase + spans.doc(); |
| 636 | for (byte[] payload : spans.getPayload()) { |
| 637 | // retrieve payload for current matching span |
| 638 | payloadString.append(new String(payload)); |
| 639 | payloadString.append(" | "); |
| 640 | }; |
| 641 | fail("Doc: " + docid + " with " + spans.start() + "-" + spans.end() + " || " + payloadString.toString()); |
| 642 | }; |
| 643 | }; |
| 644 | assertEquals(20, payloadString.length()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 645 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 646 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 647 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 648 | // --------------------______> |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 649 | |
| 650 | |
| 651 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 652 | // Spans spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), ssequery.toQuery()); |
| 653 | /* |
| 654 | TopDocs topDocs = is.search(snq, 1); |
| 655 | Set<String> payloadSet = new HashSet<String>(); |
| 656 | for (int i = 0; i < topDocs.scoreDocs.length; i++) { |
| 657 | while (spans.next()) { |
| 658 | Collection<byte[]> payloads = spans.getPayload(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 659 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 660 | for (final byte [] payload : payloads) { |
| 661 | payloadSet.add(new String(payload, "UTF-8")); |
| 662 | } |
| 663 | } |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 664 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 665 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 666 | |
| 667 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 668 | /* |
| 669 | Alternatively: |
| 670 | IndexReader reader = writer.getReader(); |
| 671 | writer.close(); |
| 672 | IndexSearcher searcher = newSearcher(reader); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 673 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 674 | PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext()); |
| 675 | |
| 676 | Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr"))); |
| 677 | if(VERBOSE) |
| 678 | System.out.println("Num payloads:" + payloads.size()); |
| 679 | for (final byte [] bytes : payloads) { |
| 680 | if(VERBOSE) |
| 681 | System.out.println(new String(bytes, "UTF-8")); |
| 682 | } |
| 683 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 684 | |
| 685 | |
| 686 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 687 | /* new: */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 688 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 689 | // PayloadHelper helper = new PayloadHelper(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 690 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 691 | // Map<Term, TermContext> termContexts = new HashMap<>(); |
| 692 | //Spans spans; |
| 693 | //spans = snquery.getSpans(searcher.getIndexReader()); |
| 694 | // searcher = helper.setUp(similarity, 1000); |
| 695 | /* |
| 696 | IndexReader reader = search.getReader(querycontainer.getFoundry()); |
| 697 | Spans luceneSpans; |
| 698 | Bits bitset = atomic.reader().getLiveDocs(); |
| 699 | for (byte[] payload : luceneSpans.getPayload()) |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 700 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 701 | /* Iterate over all matching documents */ |
| 702 | /* |
| 703 | while (luceneSpans.next() && total < config.getMaxhits()) { |
| 704 | Span matchSpan; |
| 705 | StringBuilder payloadString = new StringBuilder(); |
| 706 | int docid = atomic.docBase + luceneSpans.doc(); |
| 707 | String docname = search.retrieveDocname(docid, |
| 708 | querycontainer.getFoundry()); |
| 709 | total++; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 710 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 711 | for (byte[] payload : luceneSpans.getPayload()) |
| 712 | */ |
| 713 | /* retrieve payload for current matching span */ |
| 714 | // payloadString.append(new String(payload)); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 715 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 716 | /* create span containing result */ |
| 717 | /* |
| 718 | matchSpan = new Span(docname); |
| 719 | matchSpan.setIndexdocid(docid); |
| 720 | matchSpan.setLayer(querycontainer.getLayer()); |
| 721 | matchSpan.storePayloads(payloadString.toString()); |
| 722 | matchSpans.add(matchSpan); |
| 723 | */ |
| 724 | /* |
| 725 | * topdocs = searcher.search(new ConstantScoreQuery(corpusQ add |
| 726 | * position to list of positions to be considered for later |
| 727 | * searches |
| 728 | */ |
| 729 | /* |
| 730 | validValues.put(docname, |
| 731 | matchSpan.getPayload(config.getPrefix())); |
| 732 | } |
| 733 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 734 | |
| 735 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 736 | // Todo: API made by add(), typed for queries and strings |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 737 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 738 | // SpanPayloadCheckQuery for sentences! |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 739 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 740 | /* Support regular expression in SpanSegmentQuery */ |
| 741 | // new Regexp(); |
| 742 | // new Term(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 743 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 744 | /* |
| 745 | Maybe: spanSegmentQuery(new Term(), new Wildcard(), new Regex()); |
| 746 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 747 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 748 | // And Not -> |
| 749 | // SpanTermDiffQuery |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 750 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 751 | /* |
| 752 | SpanNearQuery poquery = new SpanNearQuery( |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 753 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 754 | ); |
| 755 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 756 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 757 | reader.close(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 758 | |
| 759 | |
| 760 | }; |
| 761 | }; |