| Eliza Margaretha | 0192918 | 2014-02-19 11:48:59 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.index; |
| 2 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 3 | import java.util.*; |
| 4 | import java.io.*; |
| 5 | |
| 6 | import de.ids_mannheim.korap.analysis.MultiTermToken; |
| 7 | import de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper; |
| 8 | import de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper; |
| 9 | import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper; |
| 10 | import de.ids_mannheim.korap.query.SpanWithinQuery; |
| 11 | |
| 12 | import static de.ids_mannheim.korap.Test.*; |
| 13 | |
| 14 | import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| 15 | import org.apache.lucene.analysis.TokenFilter; |
| 16 | import org.apache.lucene.analysis.TokenStream; |
| 17 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| 18 | |
| 19 | import org.apache.lucene.index.Term; |
| 20 | import org.apache.lucene.index.TermsEnum; |
| 21 | import org.apache.lucene.index.TermContext; |
| 22 | |
| 23 | import org.apache.lucene.index.DocsAndPositionsEnum; |
| 24 | import org.apache.lucene.index.DirectoryReader; |
| 25 | import org.apache.lucene.index.IndexWriter; |
| 26 | import org.apache.lucene.index.IndexWriterConfig; |
| 27 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| 28 | import org.apache.lucene.index.AtomicReaderContext; |
| 29 | |
| 30 | import org.apache.lucene.queryparser.classic.ParseException; |
| 31 | import org.apache.lucene.queryparser.classic.QueryParser; |
| 32 | |
| 33 | import org.apache.lucene.search.IndexSearcher; |
| 34 | import org.apache.lucene.search.Query; |
| 35 | import org.apache.lucene.search.TermQuery; |
| 36 | import org.apache.lucene.search.BooleanClause; |
| 37 | import org.apache.lucene.search.BooleanQuery; |
| 38 | import org.apache.lucene.search.PhraseQuery; |
| 39 | import org.apache.lucene.search.NumericRangeQuery; |
| 40 | import org.apache.lucene.search.spans.Spans; |
| 41 | import org.apache.lucene.search.spans.SpanQuery; |
| 42 | import org.apache.lucene.search.spans.SpanOrQuery; |
| 43 | import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; |
| 44 | import org.apache.lucene.search.spans.SpanTermQuery; |
| 45 | import org.apache.lucene.search.spans.SpanNearQuery; |
| 46 | import org.apache.lucene.search.spans.SpanNotQuery; |
| 47 | import org.apache.lucene.search.spans.NearSpansOrdered; |
| 48 | import org.apache.lucene.search.WildcardQuery; |
| 49 | import org.apache.lucene.search.ScoreDoc; |
| 50 | import org.apache.lucene.search.TopScoreDocCollector; |
| 51 | import org.apache.lucene.search.TopDocs; |
| 52 | import org.apache.lucene.search.RegexpQuery; |
| 53 | |
| 54 | import org.apache.lucene.store.Directory; |
| 55 | import org.apache.lucene.store.RAMDirectory; |
| 56 | import org.apache.lucene.store.SimpleFSDirectory; // temporary |
| 57 | |
| 58 | import org.apache.lucene.util.Version; |
| 59 | import org.apache.lucene.util.BytesRef; |
| 60 | import org.apache.lucene.util.Bits; |
| 61 | |
| 62 | import static org.junit.Assert.*; |
| 63 | import org.junit.Test; |
| 64 | import org.junit.Ignore; |
| 65 | import org.junit.runner.RunWith; |
| 66 | import org.junit.runners.JUnit4; |
| 67 | |
| 68 | @RunWith(JUnit4.class) |
| 69 | public class TestIndex { // extends LuceneTestCase { |
| 70 | // Create index in RAM |
| 71 | // private Directory index = new RAMDirectory(); |
| 72 | |
| 73 | private Directory index = new RAMDirectory(); |
| 74 | |
| 75 | @Test |
| 76 | public void multiTermToken () { |
| 77 | MultiTermToken test = new MultiTermToken("hunde", "pos:n", "m:gen:pl"); |
| 78 | assertEquals(test.terms.get(0).term, "hunde"); |
| 79 | assertEquals(test.terms.get(1).term, "pos:n"); |
| 80 | assertEquals(test.terms.get(2).term, "m:gen:pl"); |
| 81 | assertEquals(test.terms.get(0).posIncr, 1, 1); |
| 82 | assertEquals(test.terms.get(1).posIncr, 0, 1); |
| 83 | assertEquals(test.terms.get(2).posIncr, 0, 1); |
| 84 | |
| 85 | test = new MultiTermToken("hunde", "pos:n", "m:gen:pl"); |
| 86 | assertEquals(test.terms.get(0).term, "hunde"); |
| 87 | assertEquals(test.terms.get(1).term, "pos:n"); |
| 88 | assertEquals(test.terms.get(2).term, "m:gen:pl"); |
| 89 | assertEquals(test.terms.get(0).posIncr, 1, 1); |
| 90 | assertEquals(test.terms.get(1).posIncr, 0, 1); |
| 91 | assertEquals(test.terms.get(2).posIncr, 0, 1); |
| 92 | }; |
| 93 | |
| 94 | private List initIndexer () throws IOException { |
| 95 | List<Map<String, String>> list = new ArrayList<>(); |
| 96 | |
| 97 | Map<String, String> d1 = new HashMap<String, String>(); |
| 98 | d1.put("id", "w1"); |
| 99 | d1.put("corpus", "wiki"); |
| 100 | d1.put("author", "Nils Diewald"); |
| 101 | d1.put("title", "Wikipedia"); |
| 102 | d1.put("subtitle", "A test"); |
| 103 | d1.put("pubDate", "20130701"); |
| 104 | d1.put("pubPlace", "Mannheim"); |
| 105 | d1.put("textClass", "news sports"); |
| 106 | d1.put("textStr", "Er nahm den Hunden die Angst."); |
| 107 | d1.put("text", "Er#0-2|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#0-29$<i>7 " + |
| 108 | "nahm#3-7|VVFIN|nehmen|p:3;n:sg;t:past;m:ind| " + |
| 109 | "den#8-11|ART|der|c:acc;n:sg;g:masc| " + |
| 110 | "Hunden#12-18|NN|hund|c:acc;n:sg;g:masc| " + |
| 111 | "die#19-22|ART|der|c:nom;n:sg;g:fem| " + |
| 112 | "Angst#23-28|NN|angst|c:nom;n:sg;g:fem| " + |
| 113 | ".#28-29|$.|.||"); |
| 114 | list.add(d1); |
| 115 | |
| 116 | Map<String, String> d2 = new HashMap<String, String>(); |
| 117 | |
| 118 | d2.put("id", "w2"); |
| 119 | d2.put("corpus", "wiki"); |
| 120 | d2.put("author", "Peter Thomas"); |
| 121 | d2.put("title", "Waldartikel"); |
| 122 | d2.put("subtitle", "Another test"); |
| 123 | d2.put("pubDate", "20130723"); |
| 124 | d2.put("pubPlace", "Bielefeld"); |
| 125 | d2.put("textClass", "news"); |
| 126 | d2.put("textStr", "Sie liefen durch den Wald."); |
| 127 | d2.put("text", "Sie#0-3|PPER|sie|c:nom;p:3;n:pl;g:all|<>:s#0-26$<i>6 " + |
| 128 | "liefen#4-10|VVFIN|laufen|p:3;n:pl;t:past;m:ind| " + |
| 129 | "durch#11-16|APPR|durch|| " + |
| 130 | "den#17-20|ART|der|c:acc;n:sg;g:masc| " + |
| 131 | "Wald#21-25|NN|wald|c:acc;n:sg;g:masc| " + |
| 132 | ".#25-26|$.|.||"); |
| 133 | list.add(d2); |
| 134 | |
| 135 | Map<String, String> d3 = new HashMap<String, String>(); |
| 136 | d3.put("id", "w3"); |
| 137 | d3.put("corpus", "zeitung"); |
| 138 | d3.put("author", "Michael Meier"); |
| 139 | d3.put("title", "Angst"); |
| 140 | d3.put("subtitle", "Starr vor Angst"); |
| 141 | d3.put("pubDate", "20130713"); |
| 142 | d3.put("pubPlace", "Bielefeld"); |
| 143 | d3.put("textClass", "sports"); |
| 144 | d3.put("textStr", "Er wagte nicht, sich zu ruehren. Er war starr vor Angst."); |
| 145 | d3.put("text", "Er#0-2|PPER|er|c:nom;n:sg;g:masc;p:3|<>:s#0-32$<i>8 " + |
| 146 | "wagte#3-8|VVFIN|wagen|p:3;n:sg;t:past;m:ind| " + |
| 147 | "nicht#9-14|PTKNEG|nicht|| " + |
| 148 | ",#14-15|$,|,|| " + |
| 149 | "sich#16-20|PRF|sich|c:acc;p:3;n:sg| " + |
| 150 | "zu#21-23|PTKZU|zu|| " + |
| 151 | "ruehren#24-31|VVFIN|ruehren|| " + |
| 152 | ".#31-32|$.|.|| " + |
| 153 | "Er#33-35|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#33-56$<i>14 " + |
| 154 | "war#36-39|VAFIN|sein|p:3;n:sg;t:past;m:ind| " + |
| 155 | "starr#40-45|ADJD|starr|comp:pos| " + |
| 156 | "vor#46-49|APPR|vor|| " + |
| 157 | "Angst#50-55|NN|angst|c:dat;n:sg;g:fem| " + |
| 158 | ".#55-56|$.|.||"); |
| 159 | list.add(d3); |
| 160 | |
| 161 | return list; |
| 162 | }; |
| 163 | |
| 164 | @Test |
| Nils Diewald | be5943e | 2014-10-21 19:35:34 +0000 | [diff] [blame] | 165 | public void indexLucene () throws Exception { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 166 | |
| 167 | // Base analyzer for searching and indexing |
| 168 | StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); |
| 169 | |
| 170 | // Based on |
| 171 | // http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/ |
| 172 | // analysis/Analyzer.html?is-external=true |
| 173 | |
| 174 | // Create configuration with base analyzer |
| 175 | IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer); |
| 176 | |
| 177 | // Add a document 1 with the correct fields |
| 178 | IndexWriter w = new IndexWriter(index, config); |
| 179 | |
| 180 | Collection docs = initIndexer(); |
| 181 | |
| 182 | @SuppressWarnings("unchecked") |
| 183 | Iterator<Map<String,String>> i = (Iterator<Map<String,String>>) docs.iterator(); |
| 184 | |
| 185 | for (; i.hasNext(); ) { |
| 186 | addDoc(w, i.next()); |
| 187 | }; |
| 188 | |
| 189 | assertEquals(3, w.numDocs()); |
| 190 | |
| 191 | w.close(); |
| 192 | |
| 193 | // Check directory |
| 194 | DirectoryReader reader = DirectoryReader.open( index ); |
| 195 | assertEquals(docs.size(), reader.maxDoc()); |
| 196 | assertEquals(docs.size(), reader.numDocs()); |
| 197 | |
| 198 | // Check searcher |
| 199 | IndexSearcher searcher = new IndexSearcher( reader ); |
| 200 | |
| 201 | // textClass |
| 202 | // All texts of text class "news" |
| 203 | assertEquals(2, |
| 204 | searcher.search( |
| 205 | new TermQuery( |
| 206 | new Term("textClass", "news") |
| 207 | ), 10 |
| 208 | ).totalHits |
| 209 | ); |
| 210 | |
| 211 | // textClass |
| 212 | // All texts of text class "sports" |
| 213 | assertEquals(2, |
| 214 | searcher.search( |
| 215 | new TermQuery( |
| 216 | new Term("textClass", "sports") |
| 217 | ), 10 |
| 218 | ).totalHits |
| 219 | ); |
| 220 | |
| 221 | // TextIndex |
| 222 | // All docs containing "l:nehmen" |
| 223 | assertEquals(1, |
| 224 | searcher.search( |
| 225 | new TermQuery( |
| 226 | new Term("text", "l:nehmen") |
| 227 | ), 10 |
| 228 | ).totalHits |
| 229 | ); |
| 230 | |
| 231 | // TextIndex |
| 232 | // All docs containing "s:den" |
| 233 | assertEquals(2, |
| 234 | searcher.search( |
| 235 | new TermQuery( |
| 236 | new Term("text", "s:den") |
| 237 | ), 10 |
| 238 | ).totalHits |
| 239 | ); |
| 240 | |
| 241 | /* |
| 242 | assertEquals(3, |
| 243 | searcher.search( |
| 244 | new TermQuery( |
| 245 | new Term("text", "T") |
| 246 | ), 10 |
| 247 | ).totalHits |
| 248 | ); |
| 249 | */ |
| 250 | |
| 251 | // BooleanQuery |
| 252 | // All docs containing "s:den" and "l:sie" |
| 253 | TermQuery s_den = new TermQuery(new Term("text", "s:den")); |
| 254 | TermQuery l_sie = new TermQuery(new Term("text", "l:sie")); |
| 255 | BooleanQuery bool = new BooleanQuery(); |
| 256 | bool.add(s_den, BooleanClause.Occur.MUST); |
| 257 | bool.add(l_sie, BooleanClause.Occur.MUST); |
| 258 | |
| 259 | assertEquals(1, searcher.search(bool, 10).totalHits); |
| 260 | |
| 261 | // BooleanQuery |
| 262 | // All docs containing "s:den" or "l:sie" |
| 263 | bool = new BooleanQuery(); |
| 264 | bool.add(s_den, BooleanClause.Occur.SHOULD); |
| 265 | bool.add(l_sie, BooleanClause.Occur.SHOULD); |
| 266 | assertEquals(2, searcher.search(bool, 10).totalHits); |
| 267 | |
| 268 | |
| 269 | // RegexpQuery |
| 270 | // All docs containing ".{4}en" (liefen und Hunden) |
| 271 | RegexpQuery srquery = new RegexpQuery( |
| 272 | new Term("text", "s:.{4}en") |
| 273 | ); |
| 274 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 275 | |
| 276 | // RegexpQuery |
| 277 | // All docs containing "E." (Er) (2x) |
| 278 | srquery = new RegexpQuery( |
| 279 | new Term("text", "s:E.") |
| 280 | ); |
| 281 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 282 | |
| 283 | SpanRegexQueryWrapper ssrquery = new SpanRegexQueryWrapper("text", "s:E."); |
| 284 | assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| 285 | |
| 286 | |
| 287 | // RegexpQuery |
| 288 | // All docs containing "E." (er) (0x) |
| 289 | srquery = new RegexpQuery( |
| 290 | new Term("text", "s:e.") |
| 291 | ); |
| 292 | assertEquals(0, searcher.search(srquery, 10).totalHits); |
| 293 | |
| 294 | ssrquery = new SpanRegexQueryWrapper("text", "s:e."); |
| 295 | assertEquals(0, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| 296 | |
| 297 | // Check http://comments.gmane.org/gmane.comp.jakarta.lucene.user/52283 |
| 298 | // for Carstens question on wildcards |
| 299 | |
| 300 | // RegexpQuery |
| 301 | // All docs containing "E."/i ([Ee]r) (2x) |
| 302 | srquery = new RegexpQuery( |
| 303 | new Term("text", "i:e.") |
| 304 | ); |
| 305 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 306 | |
| 307 | ssrquery = new SpanRegexQueryWrapper("text", "s:e.", true); |
| 308 | assertEquals("SpanMultiTermQueryWrapper(text:/i:e./)", ssrquery.toQuery().toString()); |
| 309 | assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits); |
| 310 | |
| 311 | // All docs containing "ng"/x (Angst) (2x) |
| 312 | srquery = new RegexpQuery( |
| 313 | new Term("text", "s:.*ng.*") |
| 314 | ); |
| 315 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 316 | |
| 317 | // [base=angst] |
| 318 | SpanTermQuery stq = new SpanTermQuery(new Term("text", "l:angst")); |
| 319 | assertEquals(2, searcher.search(srquery, 10).totalHits); |
| 320 | |
| 321 | // vor Angst |
| 322 | // [orth=vor][orth=Angst] |
| 323 | SpanNearQuery snquery = new SpanNearQuery( |
| 324 | new SpanQuery[] { |
| 325 | new SpanTermQuery(new Term("text", "s:vor")), |
| 326 | new SpanTermQuery(new Term("text", "s:Angst")) |
| 327 | }, |
| 328 | 1, |
| 329 | true |
| 330 | ); |
| 331 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 332 | |
| 333 | // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem] |
| 334 | snquery = new SpanNearQuery( |
| 335 | new SpanQuery[] { |
| 336 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 337 | new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery() |
| 338 | }, |
| 339 | 5, // slop |
| 340 | true // inOrder |
| 341 | // Possible: CollectPayloads |
| 342 | ); |
| 343 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 344 | |
| 345 | |
| 346 | // Spannearquery [p:VVFIN][m:acc:sg:masc] |
| 347 | snquery = new SpanNearQuery( |
| 348 | new SpanQuery[] { |
| 349 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 350 | new SpanNearQuery( |
| 351 | new SpanQuery[] { |
| 352 | new SpanTermQuery(new Term("text", "m:c:acc")), |
| 353 | new SpanNearQuery( |
| 354 | new SpanQuery[] { |
| 355 | new SpanTermQuery(new Term("text", "m:n:sg")), |
| 356 | new SpanTermQuery(new Term("text", "m:g:masc")) |
| 357 | }, |
| 358 | -1, |
| 359 | false |
| 360 | ) |
| 361 | }, |
| 362 | -1, // slop |
| 363 | false // inOrder |
| 364 | // Possible: CollectPayloads |
| 365 | ) |
| 366 | // new SpanTermQuery(new Term("text", "m:-acc:--sg:masc")) |
| 367 | }, |
| 368 | 0, // slop |
| 369 | true // inOrder |
| 370 | // Possible: CollectPayloads |
| 371 | ); |
| 372 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 373 | |
| 374 | |
| 375 | // Spannearquery [p:VVFIN|m:3:sg:past:ind] |
| 376 | // Exact match! |
| 377 | snquery = new SpanNearQuery( |
| 378 | new SpanQuery[] { |
| 379 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 380 | new SpanNearQuery( |
| 381 | new SpanQuery[] { |
| 382 | new SpanTermQuery(new Term("text", "m:p:3")), |
| 383 | new SpanNearQuery( |
| 384 | new SpanQuery[] { |
| 385 | new SpanTermQuery(new Term("text", "m:n:sg")), |
| 386 | new SpanNearQuery( |
| 387 | new SpanQuery[] { |
| 388 | new SpanTermQuery(new Term("text", "m:t:past")), |
| 389 | new SpanTermQuery(new Term("text", "m:m:ind")), |
| 390 | }, |
| 391 | -1, |
| 392 | false |
| 393 | ) |
| 394 | }, |
| 395 | -1, |
| 396 | false |
| 397 | ) |
| 398 | }, |
| 399 | -1, |
| 400 | false |
| 401 | ) |
| 402 | }, |
| 403 | // new SpanTermQuery(new Term("text", "m:---3:--sg:past:-ind")) |
| 404 | -1, // slop |
| 405 | false // inOrder |
| 406 | // Possible: CollectPayloads |
| 407 | ); |
| 408 | assertEquals(2, searcher.search(snquery, 10).totalHits); |
| 409 | |
| 410 | // To make sure, this is not equal: |
| 411 | // Spannearquery [p:VVFIN & m:3:sg:past:ind] |
| 412 | // Exact match! |
| 413 | // Maybe it IS equal |
| 414 | snquery = new SpanNearQuery( |
| 415 | new SpanQuery[] { |
| 416 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 417 | new SpanTermQuery(new Term("text", "m:p:3")), |
| 418 | new SpanTermQuery(new Term("text", "m:n:sg")), |
| 419 | new SpanTermQuery(new Term("text", "m:t:past")), |
| 420 | new SpanTermQuery(new Term("text", "m:m:ind")), |
| 421 | }, |
| 422 | -1, // slop |
| 423 | false // inOrder |
| 424 | // Possible: CollectPayloads |
| 425 | ); |
| 426 | assertNotEquals(2, searcher.search(snquery, 10).totalHits); |
| 427 | // assertEquals(2, searcher.search(snquery, 10).totalHits); |
| 428 | |
| 429 | // Spannearquery [p:VVFIN & m:3:sg & past:ind] |
| 430 | SpanSegmentQueryWrapper sniquery = new SpanSegmentQueryWrapper( |
| 431 | "text", |
| 432 | "p:VVFIN", |
| 433 | "m:p:3", |
| 434 | "m:n:sg", |
| 435 | "m:t:past", |
| 436 | "m:m:ind" |
| 437 | ); |
| 438 | assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits); |
| 439 | |
| 440 | |
| 441 | // Todo: |
| 442 | |
| 443 | /* |
| 444 | sniquery = new SpanSegmentQuery( |
| 445 | "text", |
| 446 | "p:VVFIN", |
| 447 | "m:p:3", |
| 448 | "m:n:sg", |
| 449 | "m:t:past", |
| 450 | "m:m:ind" |
| 451 | ); |
| 452 | */ |
| 453 | |
| 454 | // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem] |
| 455 | snquery = new SpanNearQuery( |
| 456 | new SpanQuery[] { |
| 457 | new SpanTermQuery(new Term("text", "p:VVFIN")), |
| 458 | new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg", "m:g:fem").toQuery() |
| 459 | }, |
| 460 | 5, // slop |
| 461 | true // inOrder |
| 462 | // Possible: CollectPayloads |
| 463 | ); |
| 464 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 465 | |
| 466 | sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past", "m:m:ind", "m:n:sg"); |
| 467 | assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits); |
| 468 | |
| 469 | // [p = VVFIN & m:p = 3 & m:t = past & m:n != pl] or |
| 470 | // [p = VVFIN & m:p = 3 & m:t = past & !m:n = pl] |
| 471 | // TODO: Problem: What should happen in case the category does not exist? |
| 472 | // pssible solution: & ( m:n != pl & exists(m:n)) |
| 473 | sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3", "m:t:past"); |
| 474 | SpanQuery snqquery = new SpanNotQuery(sniquery.toQuery(), new SpanTermQuery(new Term("text", "m:n:pl"))); |
| 475 | assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| 476 | |
| 477 | // [p = NN & (m:c: = dat | m:c = acc)] |
| 478 | snquery = new SpanNearQuery( |
| 479 | new SpanQuery[] { |
| 480 | new SpanTermQuery(new Term("text", "p:NN")), |
| 481 | new SpanOrQuery( |
| 482 | new SpanTermQuery( new Term("text", "m:c:nom" )), |
| 483 | new SpanTermQuery( new Term("text", "m:c:acc" )) |
| 484 | ) |
| 485 | }, |
| 486 | -1, |
| 487 | false |
| 488 | ); |
| 489 | |
| 490 | assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| 491 | |
| 492 | // [p = NN & !(m:c: = nom | m:c = acc)] |
| 493 | snqquery = new SpanNotQuery( |
| 494 | new SpanTermQuery(new Term("text", "p:NN")), |
| 495 | new SpanOrQuery( |
| 496 | new SpanTermQuery( new Term("text", "m:c:nom" )), |
| 497 | new SpanTermQuery( new Term("text", "m:c:acc" )) |
| 498 | ) |
| 499 | ); |
| 500 | assertEquals(1, searcher.search(snqquery, 10).totalHits); |
| 501 | |
| 502 | // [p = NN & !(m:c = nom)] |
| 503 | snqquery = new SpanNotQuery( |
| 504 | new SpanTermQuery( new Term("text", "p:NN")), |
| 505 | new SpanTermQuery( new Term("text", "m:c:nom" )) |
| 506 | ); |
| 507 | assertEquals(3, searcher.search(snqquery, 10).totalHits); |
| 508 | |
| 509 | // [p=NN & !(m:c = acc)] |
| 510 | snqquery = new SpanNotQuery( |
| 511 | new SpanTermQuery( new Term("text", "p:NN")), |
| 512 | new SpanTermQuery( new Term("text", "m:c:acc" )) |
| 513 | ); |
| 514 | assertEquals(2, searcher.search(snqquery, 10).totalHits); |
| 515 | |
| 516 | // [p=PPER][][p=ART] |
| 517 | snquery = new SpanNearQuery( |
| 518 | new SpanQuery[] { |
| 519 | new SpanTermQuery( new Term("text", "p:PPER")), |
| 520 | new SpanNearQuery( |
| 521 | new SpanQuery[] { |
| 522 | new SpanTermQuery( new Term("text", "T")), |
| 523 | new SpanTermQuery( new Term("text", "p:ART")) |
| 524 | }, |
| 525 | 0, |
| 526 | true), |
| 527 | }, |
| 528 | 0, |
| 529 | true |
| 530 | ); |
| 531 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 532 | |
| 533 | |
| 534 | // Todo: |
| 535 | // [orth=się][]{2,4}[base=bać] |
| 536 | // [orth=się][orth!="[.!?,:]"]{,5}[base=bać]|[base=bać][base="on|ja|ty|my|wy"]?[orth=się] |
| 537 | // [pos=subst & orth="a.*"]{2} |
| 538 | // [tag=subst:sg:nom:n] |
| 539 | // [case==acc & case==gen] ?? |
| 540 | // [case~acc & case~gen] |
| 541 | // [case~~acc] |
| 542 | // [base=bać][orth!=się]+[orth=się] within s |
| 543 | |
| 544 | // [][][p:VAFIN] within s |
| 545 | // [][p:VAFIN] within s |
| 546 | |
| 547 | |
| 548 | // [][][p:VAFIN] |
| 549 | snquery = new SpanNearQuery( |
| 550 | new SpanQuery[] { |
| 551 | new SpanNearQuery( |
| 552 | new SpanQuery[] { |
| 553 | new SpanTermQuery( new Term("text", "T") ), |
| 554 | new SpanTermQuery( new Term("text", "T") ) |
| 555 | }, |
| 556 | 0, |
| 557 | true |
| 558 | ), |
| 559 | new SpanTermQuery( new Term("text", "p:VAFIN") ) |
| 560 | }, |
| 561 | 0, |
| 562 | true |
| 563 | ); |
| 564 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 565 | |
| 566 | /* |
| 567 | http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene |
| 568 | */ |
| 569 | |
| 570 | StringBuilder payloadString = new StringBuilder(); |
| 571 | Map<Term, TermContext> termContexts = new HashMap<>(); |
| 572 | for (AtomicReaderContext atomic : reader.leaves()) { |
| 573 | Bits bitset = atomic.reader().getLiveDocs(); |
| 574 | // Spans spans = NearSpansOrdered(); |
| 575 | Spans spans = snquery.getSpans(atomic, bitset, termContexts); |
| 576 | |
| 577 | while (spans.next()) { |
| 578 | int docid = atomic.docBase + spans.doc(); |
| 579 | if (spans.isPayloadAvailable()) { |
| 580 | for (byte[] payload : spans.getPayload()) { |
| 581 | /* retrieve payload for current matching span */ |
| 582 | payloadString.append(new String(payload)); |
| 583 | payloadString.append(" | "); |
| 584 | }; |
| 585 | }; |
| 586 | }; |
| 587 | }; |
| 588 | // assertEquals(33, payloadString.length()); |
| 589 | assertEquals(0, payloadString.length()); |
| 590 | |
| 591 | |
| 592 | |
| 593 | // [][][p:VAFIN] |
| 594 | // without collecting payloads |
| 595 | snquery = new SpanNearQuery( |
| 596 | new SpanQuery[] { |
| 597 | new SpanNearQuery( |
| 598 | new SpanQuery[] { |
| 599 | new SpanTermQuery( new Term("text", "T") ), |
| 600 | new SpanTermQuery( new Term("text", "T") ) |
| 601 | }, |
| 602 | 0, |
| 603 | true, |
| 604 | false |
| 605 | ), |
| 606 | new SpanTermQuery( new Term("text", "p:VAFIN") ) |
| 607 | }, |
| 608 | 0, |
| 609 | true, |
| 610 | false |
| 611 | ); |
| 612 | assertEquals(1, searcher.search(snquery, 10).totalHits); |
| 613 | |
| 614 | payloadString = new StringBuilder(); |
| 615 | termContexts = new HashMap<>(); |
| 616 | for (AtomicReaderContext atomic : reader.leaves()) { |
| 617 | Bits bitset = atomic.reader().getLiveDocs(); |
| 618 | // Spans spans = NearSpansOrdered(); |
| 619 | Spans spans = snquery.getSpans(atomic, bitset, termContexts); |
| 620 | |
| 621 | while (spans.next()) { |
| 622 | int docid = atomic.docBase + spans.doc(); |
| 623 | for (byte[] payload : spans.getPayload()) { |
| 624 | /* retrieve payload for current matching span */ |
| 625 | payloadString.append(new String(payload)); |
| 626 | payloadString.append(" | "); |
| 627 | }; |
| 628 | }; |
| 629 | }; |
| 630 | assertEquals(0, payloadString.length()); |
| 631 | |
| 632 | |
| 633 | // [][][p:VAFIN] in s |
| Nils Diewald | cc7c0b3 | 2014-07-31 19:58:22 +0000 | [diff] [blame] | 634 | // ([e:s:<][]*[T] | [T & e:s:<]) [T] ([p:VAFIN & e:s:>] | [T][]*[e:s:>] |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 635 | |
| 636 | /* |
| Nils Diewald | cc7c0b3 | 2014-07-31 19:58:22 +0000 | [diff] [blame] | 637 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 638 | SpanSegmentWithinQuery ssequery = new SpanSegmentWithinQuery( |
| 639 | "text","s", new SpanSegmentSequenceQuery("text", "T", "T", "p:VAFIN") |
| 640 | ); |
| 641 | assertEquals(0, searcher.search(ssequery.toQuery(), 10).totalHits); |
| 642 | |
| 643 | payloadString = new StringBuilder(); |
| 644 | termContexts = new HashMap<>(); |
| 645 | for (AtomicReaderContext atomic : reader.leaves()) { |
| 646 | Bits bitset = atomic.reader().getLiveDocs(); |
| 647 | // Spans spans = NearSpansOrdered(); |
| 648 | Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts); |
| 649 | |
| 650 | while (spans.next()) { |
| 651 | int docid = atomic.docBase + spans.doc(); |
| 652 | for (byte[] payload : spans.getPayload()) { |
| 653 | /// retrieve payload for current matching span |
| 654 | payloadString.append(new String(payload)); |
| 655 | payloadString.append(" | "); |
| 656 | }; |
| 657 | }; |
| 658 | }; |
| 659 | assertEquals(0, payloadString.length(), 1); |
| 660 | |
| 661 | ssequery = new SpanSegmentWithinQuery( |
| 662 | "text","s", new SpanSegmentSequenceQuery("text", "T", "p:VAFIN") |
| 663 | ); |
| 664 | |
| 665 | assertEquals("for " + ssequery.toQuery(), |
| 666 | 1, searcher.search(ssequery.toQuery(), 10).totalHits); |
| 667 | |
| 668 | payloadString = new StringBuilder(); |
| 669 | termContexts = new HashMap<>(); |
| 670 | for (AtomicReaderContext atomic : reader.leaves()) { |
| 671 | Bits bitset = atomic.reader().getLiveDocs(); |
| 672 | // Spans spans = NearSpansOrdered(); |
| 673 | Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts); |
| 674 | |
| 675 | while (spans.next()) { |
| 676 | int docid = atomic.docBase + spans.doc(); |
| 677 | for (byte[] payload : spans.getPayload()) { |
| 678 | // retrieve payload for current matching span |
| 679 | payloadString.append(new String(payload)); |
| 680 | payloadString.append(" | "); |
| 681 | }; |
| 682 | fail("Doc: " + docid + " with " + spans.start() + "-" + spans.end() + " || " + payloadString.toString()); |
| 683 | }; |
| 684 | }; |
| 685 | assertEquals(20, payloadString.length()); |
| 686 | |
| 687 | */ |
| 688 | |
| 689 | // --------------------______> |
| 690 | |
| 691 | |
| 692 | |
| 693 | // Spans spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), ssequery.toQuery()); |
| 694 | /* |
| 695 | TopDocs topDocs = is.search(snq, 1); |
| 696 | Set<String> payloadSet = new HashSet<String>(); |
| 697 | for (int i = 0; i < topDocs.scoreDocs.length; i++) { |
| 698 | while (spans.next()) { |
| 699 | Collection<byte[]> payloads = spans.getPayload(); |
| 700 | |
| 701 | for (final byte [] payload : payloads) { |
| 702 | payloadSet.add(new String(payload, "UTF-8")); |
| 703 | } |
| 704 | } |
| 705 | } |
| 706 | */ |
| 707 | |
| 708 | |
| 709 | /* |
| 710 | Alternativ: |
| 711 | IndexReader reader = writer.getReader(); |
| 712 | writer.close(); |
| 713 | IndexSearcher searcher = newSearcher(reader); |
| 714 | |
| 715 | PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext()); |
| 716 | |
| 717 | Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr"))); |
| 718 | if(VERBOSE) |
| 719 | System.out.println("Num payloads:" + payloads.size()); |
| 720 | for (final byte [] bytes : payloads) { |
| 721 | if(VERBOSE) |
| 722 | System.out.println(new String(bytes, "UTF-8")); |
| 723 | } |
| 724 | */ |
| 725 | |
| 726 | |
| 727 | |
| 728 | /* new: */ |
| 729 | |
| 730 | // PayloadHelper helper = new PayloadHelper(); |
| 731 | |
| 732 | // Map<Term, TermContext> termContexts = new HashMap<>(); |
| 733 | //Spans spans; |
| 734 | //spans = snquery.getSpans(searcher.getIndexReader()); |
| 735 | // searcher = helper.setUp(similarity, 1000); |
| 736 | /* |
| 737 | IndexReader reader = search.getReader(querycontainer.getFoundry()); |
| 738 | Spans luceneSpans; |
| 739 | Bits bitset = atomic.reader().getLiveDocs(); |
| 740 | for (byte[] payload : luceneSpans.getPayload()) |
| 741 | |
| 742 | /* Iterate over all matching documents */ |
| 743 | /* |
| 744 | while (luceneSpans.next() && total < config.getMaxhits()) { |
| 745 | Span matchSpan; |
| 746 | StringBuilder payloadString = new StringBuilder(); |
| 747 | int docid = atomic.docBase + luceneSpans.doc(); |
| 748 | String docname = search.retrieveDocname(docid, |
| 749 | querycontainer.getFoundry()); |
| 750 | total++; |
| 751 | |
| 752 | for (byte[] payload : luceneSpans.getPayload()) |
| 753 | */ |
| 754 | /* retrieve payload for current matching span */ |
| 755 | // payloadString.append(new String(payload)); |
| 756 | |
| 757 | /* create span containing result */ |
| 758 | /* |
| 759 | matchSpan = new Span(docname); |
| 760 | matchSpan.setIndexdocid(docid); |
| 761 | matchSpan.setLayer(querycontainer.getLayer()); |
| 762 | matchSpan.storePayloads(payloadString.toString()); |
| 763 | matchSpans.add(matchSpan); |
| 764 | */ |
| 765 | /* |
| 766 | * topdocs = searcher.search(new ConstantScoreQuery(corpusQ add |
| 767 | * position to list of positions to be considered for later |
| 768 | * searches |
| 769 | */ |
| 770 | /* |
| 771 | validValues.put(docname, |
| 772 | matchSpan.getPayload(config.getPrefix())); |
| 773 | } |
| 774 | */ |
| 775 | |
| 776 | |
| 777 | // Todo: API made by add() typisiert für queries, strings |
| 778 | |
| 779 | // SpanPayloadCheckQuery for sentences! |
| 780 | |
| 781 | /* Support regular expression in SpanSegmentQuery */ |
| 782 | // new Regexp(); |
| 783 | // new Term(); |
| 784 | |
| 785 | /* |
| 786 | Vielleicht: spanSegmentQuery(new Term(), new Wildcard(), new Regex()); |
| 787 | */ |
| 788 | |
| 789 | // And Not -> |
| 790 | // SpanTermDiffQuery |
| 791 | |
| 792 | /* |
| 793 | SpanNearQuery poquery = new SpanNearQuery( |
| 794 | |
| 795 | ); |
| 796 | */ |
| 797 | |
| 798 | reader.close(); |
| 799 | |
| 800 | |
| 801 | }; |
| 802 | }; |