blob: 92fb03817af210033c1b6cab248aecef654c5a2f [file] [log] [blame]
Eliza Margaretha01929182014-02-19 11:48:59 +00001package de.ids_mannheim.korap.index;
2
Nils Diewaldf399a672013-11-18 17:55:22 +00003import java.util.*;
4import java.io.*;
5
Akronc63697c2015-06-17 22:32:02 +02006import de.ids_mannheim.korap.index.MultiTerm;
Nils Diewalde4986d72015-02-27 17:35:00 +00007import de.ids_mannheim.korap.index.MultiTermToken;
Nils Diewaldf399a672013-11-18 17:55:22 +00008import de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper;
9import de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper;
10import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper;
11import de.ids_mannheim.korap.query.SpanWithinQuery;
12
Nils Diewald5c375702015-02-09 20:58:24 +000013import de.ids_mannheim.korap.util.CorpusDataException;
14
Nils Diewaldf399a672013-11-18 17:55:22 +000015import static de.ids_mannheim.korap.Test.*;
16
17import org.apache.lucene.analysis.standard.StandardAnalyzer;
18import org.apache.lucene.analysis.TokenFilter;
19import org.apache.lucene.analysis.TokenStream;
20import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21
22import org.apache.lucene.index.Term;
23import org.apache.lucene.index.TermsEnum;
24import org.apache.lucene.index.TermContext;
25
26import org.apache.lucene.index.DocsAndPositionsEnum;
27import org.apache.lucene.index.DirectoryReader;
28import org.apache.lucene.index.IndexWriter;
29import org.apache.lucene.index.IndexWriterConfig;
30import org.apache.lucene.index.IndexWriterConfig.OpenMode;
Akron700c1eb2015-09-25 16:57:30 +020031import org.apache.lucene.index.LeafReaderContext;
Nils Diewaldf399a672013-11-18 17:55:22 +000032
33import org.apache.lucene.queryparser.classic.ParseException;
34import org.apache.lucene.queryparser.classic.QueryParser;
35
36import org.apache.lucene.search.IndexSearcher;
37import org.apache.lucene.search.Query;
38import org.apache.lucene.search.TermQuery;
39import org.apache.lucene.search.BooleanClause;
40import org.apache.lucene.search.BooleanQuery;
41import org.apache.lucene.search.PhraseQuery;
42import org.apache.lucene.search.NumericRangeQuery;
43import org.apache.lucene.search.spans.Spans;
44import org.apache.lucene.search.spans.SpanQuery;
45import org.apache.lucene.search.spans.SpanOrQuery;
46import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
47import org.apache.lucene.search.spans.SpanTermQuery;
48import org.apache.lucene.search.spans.SpanNearQuery;
49import org.apache.lucene.search.spans.SpanNotQuery;
50import org.apache.lucene.search.spans.NearSpansOrdered;
51import org.apache.lucene.search.WildcardQuery;
52import org.apache.lucene.search.ScoreDoc;
53import org.apache.lucene.search.TopScoreDocCollector;
54import org.apache.lucene.search.TopDocs;
55import org.apache.lucene.search.RegexpQuery;
56
57import org.apache.lucene.store.Directory;
58import org.apache.lucene.store.RAMDirectory;
59import org.apache.lucene.store.SimpleFSDirectory; // temporary
60
61import org.apache.lucene.util.Version;
62import org.apache.lucene.util.BytesRef;
63import org.apache.lucene.util.Bits;
64
65import static org.junit.Assert.*;
66import org.junit.Test;
67import org.junit.Ignore;
68import org.junit.runner.RunWith;
69import org.junit.runners.JUnit4;
70
71@RunWith(JUnit4.class)
72public class TestIndex { // extends LuceneTestCase {
73 // Create index in RAM
74 // private Directory index = new RAMDirectory();
75
76 private Directory index = new RAMDirectory();
77
Akronbb5d1732015-06-22 01:22:40 +020078
Akronc63697c2015-06-17 22:32:02 +020079 @Test
80 public void multiTerm () throws CorpusDataException {
81 MultiTerm test = new MultiTerm("test");
82 assertEquals(test.getTerm(), "test");
83 assertEquals(test.getPayload(), null);
84 assertEquals(test.getStart(), 0);
85 assertEquals(test.getEnd(), 0);
86 assertFalse(test.hasStoredOffsets());
87 assertEquals(test.toString(), "test");
88
89 test = new MultiTerm("test#0-4");
90 assertEquals(test.getTerm(), "test");
91 assertEquals(test.getPayload(), null);
92 assertEquals(test.getStart(), 0);
93 assertEquals(test.getEnd(), 4);
94 assertFalse(test.hasStoredOffsets());
95 assertEquals(test.toString(), "test#0-4");
96
97 test = new MultiTerm("<>:s:test#0-4$<i>67");
98 assertEquals(test.getTerm(), "<>:s:test");
99 assertEquals(test.getPayload().toString(), "[0 0 0 43]");
100 assertEquals(test.getStart(), 0);
101 assertEquals(test.getEnd(), 4);
102 assertFalse(test.hasStoredOffsets());
103 assertTrue(test.toString().startsWith("<>:s:test#0-4$"));
104
105 test = new MultiTerm("xip/l:\\#normal#0-5$<i>3999");
106 assertEquals(test.getTerm(), "xip/l:#normal");
107 assertEquals(test.getPayload().toString(), "[0 0 f 9f]");
108 assertEquals(test.getStart(), 0);
109 assertEquals(test.getEnd(), 5);
110 assertFalse(test.hasStoredOffsets());
111 assertTrue(test.toString().startsWith("xip/l:\\#normal#0-5$"));
112 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000113
Akronbb5d1732015-06-22 01:22:40 +0200114
Nils Diewaldf399a672013-11-18 17:55:22 +0000115 @Test
Nils Diewald5c375702015-02-09 20:58:24 +0000116 public void multiTermToken () throws CorpusDataException {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000117 MultiTermToken test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
118 assertEquals(test.terms.get(0).term, "hunde");
119 assertEquals(test.terms.get(1).term, "pos:n");
120 assertEquals(test.terms.get(2).term, "m:gen:pl");
Nils Diewaldf399a672013-11-18 17:55:22 +0000121
Nils Diewaldbb33da22015-03-04 16:24:25 +0000122 test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
123 assertEquals(test.terms.get(0).term, "hunde");
124 assertEquals(test.terms.get(1).term, "pos:n");
125 assertEquals(test.terms.get(2).term, "m:gen:pl");
Nils Diewaldf399a672013-11-18 17:55:22 +0000126 };
127
Akronbb5d1732015-06-22 01:22:40 +0200128
Nils Diewaldf399a672013-11-18 17:55:22 +0000129 private List initIndexer () throws IOException {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000130 List<Map<String, String>> list = new ArrayList<>();
Nils Diewaldf399a672013-11-18 17:55:22 +0000131
Nils Diewaldbb33da22015-03-04 16:24:25 +0000132 Map<String, String> d1 = new HashMap<String, String>();
133 d1.put("id", "w1");
134 d1.put("corpus", "wiki");
135 d1.put("author", "Nils Diewald");
136 d1.put("title", "Wikipedia");
137 d1.put("subtitle", "A test");
138 d1.put("pubDate", "20130701");
139 d1.put("pubPlace", "Mannheim");
140 d1.put("textClass", "news sports");
141 d1.put("textStr", "Er nahm den Hunden die Angst.");
142 d1.put("text", "Er#0-2|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#0-29$<i>7 "
143 + "nahm#3-7|VVFIN|nehmen|p:3;n:sg;t:past;m:ind| "
144 + "den#8-11|ART|der|c:acc;n:sg;g:masc| "
145 + "Hunden#12-18|NN|hund|c:acc;n:sg;g:masc| "
146 + "die#19-22|ART|der|c:nom;n:sg;g:fem| "
147 + "Angst#23-28|NN|angst|c:nom;n:sg;g:fem| " + ".#28-29|$.|.||");
148 list.add(d1);
Nils Diewaldf399a672013-11-18 17:55:22 +0000149
Nils Diewaldbb33da22015-03-04 16:24:25 +0000150 Map<String, String> d2 = new HashMap<String, String>();
Nils Diewaldf399a672013-11-18 17:55:22 +0000151
Nils Diewaldbb33da22015-03-04 16:24:25 +0000152 d2.put("id", "w2");
153 d2.put("corpus", "wiki");
154 d2.put("author", "Peter Thomas");
155 d2.put("title", "Waldartikel");
156 d2.put("subtitle", "Another test");
157 d2.put("pubDate", "20130723");
158 d2.put("pubPlace", "Bielefeld");
159 d2.put("textClass", "news");
160 d2.put("textStr", "Sie liefen durch den Wald.");
161 d2.put("text", "Sie#0-3|PPER|sie|c:nom;p:3;n:pl;g:all|<>:s#0-26$<i>6 "
162 + "liefen#4-10|VVFIN|laufen|p:3;n:pl;t:past;m:ind| "
163 + "durch#11-16|APPR|durch|| "
164 + "den#17-20|ART|der|c:acc;n:sg;g:masc| "
165 + "Wald#21-25|NN|wald|c:acc;n:sg;g:masc| " + ".#25-26|$.|.||");
166 list.add(d2);
Nils Diewaldf399a672013-11-18 17:55:22 +0000167
Nils Diewaldbb33da22015-03-04 16:24:25 +0000168 Map<String, String> d3 = new HashMap<String, String>();
169 d3.put("id", "w3");
170 d3.put("corpus", "zeitung");
171 d3.put("author", "Michael Meier");
172 d3.put("title", "Angst");
173 d3.put("subtitle", "Starr vor Angst");
174 d3.put("pubDate", "20130713");
175 d3.put("pubPlace", "Bielefeld");
176 d3.put("textClass", "sports");
177 d3.put("textStr",
178 "Er wagte nicht, sich zu ruehren. Er war starr vor Angst.");
179 d3.put("text", "Er#0-2|PPER|er|c:nom;n:sg;g:masc;p:3|<>:s#0-32$<i>8 "
180 + "wagte#3-8|VVFIN|wagen|p:3;n:sg;t:past;m:ind| "
181 + "nicht#9-14|PTKNEG|nicht|| " + ",#14-15|$,|,|| "
182 + "sich#16-20|PRF|sich|c:acc;p:3;n:sg| "
183 + "zu#21-23|PTKZU|zu|| " + "ruehren#24-31|VVFIN|ruehren|| "
184 + ".#31-32|$.|.|| "
185 + "Er#33-35|PPER|er|c:nom;p:3;n:sg;g:masc|<>:s#33-56$<i>14 "
186 + "war#36-39|VAFIN|sein|p:3;n:sg;t:past;m:ind| "
187 + "starr#40-45|ADJD|starr|comp:pos| " + "vor#46-49|APPR|vor|| "
188 + "Angst#50-55|NN|angst|c:dat;n:sg;g:fem| " + ".#55-56|$.|.||");
189 list.add(d3);
Nils Diewaldf399a672013-11-18 17:55:22 +0000190
Nils Diewaldbb33da22015-03-04 16:24:25 +0000191 return list;
Nils Diewaldf399a672013-11-18 17:55:22 +0000192 };
193
194 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000195 public void indexLucene () throws Exception {
Nils Diewaldf399a672013-11-18 17:55:22 +0000196
Nils Diewaldbb33da22015-03-04 16:24:25 +0000197 // Base analyzer for searching and indexing
Akron700c1eb2015-09-25 16:57:30 +0200198 StandardAnalyzer analyzer = new StandardAnalyzer();
Nils Diewaldf399a672013-11-18 17:55:22 +0000199
Nils Diewaldbb33da22015-03-04 16:24:25 +0000200 // Based on
201 // http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/
202 // analysis/Analyzer.html?is-external=true
Nils Diewaldf399a672013-11-18 17:55:22 +0000203
Nils Diewaldbb33da22015-03-04 16:24:25 +0000204 // Create configuration with base analyzer
Akron700c1eb2015-09-25 16:57:30 +0200205 IndexWriterConfig config = new IndexWriterConfig(analyzer);
Nils Diewaldf399a672013-11-18 17:55:22 +0000206
Nils Diewaldbb33da22015-03-04 16:24:25 +0000207 // Add a document 1 with the correct fields
208 IndexWriter w = new IndexWriter(index, config);
Nils Diewaldf399a672013-11-18 17:55:22 +0000209
Nils Diewaldbb33da22015-03-04 16:24:25 +0000210 Collection docs = initIndexer();
Nils Diewaldf399a672013-11-18 17:55:22 +0000211
Nils Diewaldbb33da22015-03-04 16:24:25 +0000212 @SuppressWarnings("unchecked")
213 Iterator<Map<String, String>> i = (Iterator<Map<String, String>>) docs
214 .iterator();
Nils Diewaldf399a672013-11-18 17:55:22 +0000215
Nils Diewaldbb33da22015-03-04 16:24:25 +0000216 for (; i.hasNext();) {
217 addDoc(w, i.next());
218 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000219
Nils Diewaldbb33da22015-03-04 16:24:25 +0000220 assertEquals(3, w.numDocs());
Nils Diewaldf399a672013-11-18 17:55:22 +0000221
Nils Diewaldbb33da22015-03-04 16:24:25 +0000222 w.close();
Nils Diewaldf399a672013-11-18 17:55:22 +0000223
Nils Diewaldbb33da22015-03-04 16:24:25 +0000224 // Check directory
225 DirectoryReader reader = DirectoryReader.open(index);
226 assertEquals(docs.size(), reader.maxDoc());
227 assertEquals(docs.size(), reader.numDocs());
Nils Diewaldf399a672013-11-18 17:55:22 +0000228
Nils Diewaldbb33da22015-03-04 16:24:25 +0000229 // Check searcher
230 IndexSearcher searcher = new IndexSearcher(reader);
Nils Diewaldf399a672013-11-18 17:55:22 +0000231
Nils Diewaldbb33da22015-03-04 16:24:25 +0000232 // textClass
233 // All texts of text class "news"
Eliza Margaretha6f989202016-10-14 21:48:29 +0200234 assertEquals(2,
235 searcher.search(new TermQuery(new Term("textClass", "news")),
236 10).totalHits);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000237
238 // textClass
239 // All texts of text class "sports"
Eliza Margaretha6f989202016-10-14 21:48:29 +0200240 assertEquals(2,
241 searcher.search(new TermQuery(new Term("textClass", "sports")),
242 10).totalHits);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000243
244 // TextIndex
245 // All docs containing "l:nehmen"
Eliza Margaretha6f989202016-10-14 21:48:29 +0200246 assertEquals(1,
247 searcher.search(new TermQuery(new Term("text", "l:nehmen")),
248 10).totalHits);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000249
250 // TextIndex
251 // All docs containing "s:den"
Eliza Margaretha6f989202016-10-14 21:48:29 +0200252 assertEquals(2,
253 searcher.search(new TermQuery(new Term("text", "s:den")),
254 10).totalHits);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000255
256 /*
257 assertEquals(3,
258 searcher.search(
259 new TermQuery(
260 new Term("text", "T")
261 ), 10
262 ).totalHits
263 );
264 */
265
266 // BooleanQuery
267 // All docs containing "s:den" and "l:sie"
268 TermQuery s_den = new TermQuery(new Term("text", "s:den"));
269 TermQuery l_sie = new TermQuery(new Term("text", "l:sie"));
270 BooleanQuery bool = new BooleanQuery();
271 bool.add(s_den, BooleanClause.Occur.MUST);
272 bool.add(l_sie, BooleanClause.Occur.MUST);
273
274 assertEquals(1, searcher.search(bool, 10).totalHits);
275
276 // BooleanQuery
277 // All docs containing "s:den" or "l:sie"
278 bool = new BooleanQuery();
279 bool.add(s_den, BooleanClause.Occur.SHOULD);
280 bool.add(l_sie, BooleanClause.Occur.SHOULD);
281 assertEquals(2, searcher.search(bool, 10).totalHits);
282
283
284 // RegexpQuery
285 // All docs containing ".{4}en" (liefen und Hunden)
286 RegexpQuery srquery = new RegexpQuery(new Term("text", "s:.{4}en"));
287 assertEquals(2, searcher.search(srquery, 10).totalHits);
288
289 // RegexpQuery
290 // All docs containing "E." (Er) (2x)
291 srquery = new RegexpQuery(new Term("text", "s:E."));
292 assertEquals(2, searcher.search(srquery, 10).totalHits);
293
294 SpanRegexQueryWrapper ssrquery = new SpanRegexQueryWrapper("text",
295 "s:E.");
296 assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits);
297
298
299 // RegexpQuery
300 // All docs containing "E." (er) (0x)
301 srquery = new RegexpQuery(new Term("text", "s:e."));
302 assertEquals(0, searcher.search(srquery, 10).totalHits);
303
304 ssrquery = new SpanRegexQueryWrapper("text", "s:e.");
305 assertEquals(0, searcher.search(ssrquery.toQuery(), 10).totalHits);
306
Nils Diewaldbb33da22015-03-04 16:24:25 +0000307 // RegexpQuery
308 // All docs containing "E."/i ([Ee]r) (2x)
309 srquery = new RegexpQuery(new Term("text", "i:e."));
310 assertEquals(2, searcher.search(srquery, 10).totalHits);
311
312 ssrquery = new SpanRegexQueryWrapper("text", "s:e.", true);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200313 assertEquals("SpanMultiTermQueryWrapper(text:/i:e./)",
314 ssrquery.toQuery().toString());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000315 assertEquals(2, searcher.search(ssrquery.toQuery(), 10).totalHits);
316
317 // All docs containing "ng"/x (Angst) (2x)
318 srquery = new RegexpQuery(new Term("text", "s:.*ng.*"));
319 assertEquals(2, searcher.search(srquery, 10).totalHits);
320
Akron34f73da2017-08-09 13:33:41 +0200321
322 // Check http://comments.gmane.org/gmane.comp.jakarta.lucene.user/52283
323 // for Carstens question on wildcards
324 // Wildcardquery
325 // All docs containing ".{4}en" (liefen und Hunden)
326 WildcardQuery swquery = new WildcardQuery(new Term("text", "s:*ng*"));
327 assertEquals("text:s:*ng*", swquery.toString());
328 assertEquals(2, searcher.search(swquery, 10).totalHits);
329
Nils Diewaldbb33da22015-03-04 16:24:25 +0000330 // [base=angst]
331 SpanTermQuery stq = new SpanTermQuery(new Term("text", "l:angst"));
332 assertEquals(2, searcher.search(srquery, 10).totalHits);
333
334 // vor Angst
335 // [orth=vor][orth=Angst]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200336 SpanNearQuery snquery = new SpanNearQuery(
337 new SpanQuery[] { new SpanTermQuery(new Term("text", "s:vor")),
338 new SpanTermQuery(new Term("text", "s:Angst")) },
339 1, true);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000340 assertEquals(1, searcher.search(snquery, 10).totalHits);
341
342 // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200343 snquery = new SpanNearQuery(
344 new SpanQuery[] {
345 new SpanTermQuery(new Term("text", "p:VVFIN")),
346 new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg",
347 "m:g:fem").toQuery() },
348 5, // slop
Nils Diewaldbb33da22015-03-04 16:24:25 +0000349 true // inOrder
Eliza Margaretha6f989202016-10-14 21:48:29 +0200350 // Possible: CollectPayloads
Nils Diewaldf399a672013-11-18 17:55:22 +0000351 );
Nils Diewaldbb33da22015-03-04 16:24:25 +0000352 assertEquals(1, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000353
Nils Diewaldbb33da22015-03-04 16:24:25 +0000354
355 // Spannearquery [p:VVFIN][m:acc:sg:masc]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200356 snquery = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(
357 new Term("text", "p:VVFIN")),
358 new SpanNearQuery(
359 new SpanQuery[] {
360 new SpanTermQuery(new Term("text", "m:c:acc")),
361 new SpanNearQuery(
362 new SpanQuery[] {
363 new SpanTermQuery(new Term(
364 "text", "m:n:sg")),
365 new SpanTermQuery(new Term(
366 "text", "m:g:masc")) },
367 -1, false) },
Nils Diewaldbb33da22015-03-04 16:24:25 +0000368 -1, // slop
369 false // inOrder
370 // Possible: CollectPayloads
371 )
Eliza Margaretha6f989202016-10-14 21:48:29 +0200372 // new SpanTermQuery(new Term("text", "m:-acc:--sg:masc"))
373 }, 0, // slop
Nils Diewaldbb33da22015-03-04 16:24:25 +0000374 true // inOrder
Eliza Margaretha6f989202016-10-14 21:48:29 +0200375 // Possible: CollectPayloads
Nils Diewaldf399a672013-11-18 17:55:22 +0000376 );
Nils Diewaldbb33da22015-03-04 16:24:25 +0000377 assertEquals(1, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000378
Nils Diewaldbb33da22015-03-04 16:24:25 +0000379
380 // Spannearquery [p:VVFIN|m:3:sg:past:ind]
381 // Exact match!
Eliza Margaretha6f989202016-10-14 21:48:29 +0200382 snquery = new SpanNearQuery(
383 new SpanQuery[] {
384 new SpanTermQuery(new Term("text", "p:VVFIN")),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000385 new SpanNearQuery(new SpanQuery[] {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200386 new SpanTermQuery(new Term("text", "m:p:3")),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000387 new SpanNearQuery(new SpanQuery[] {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200388 new SpanTermQuery(
389 new Term("text", "m:n:sg")),
390 new SpanNearQuery(
391 new SpanQuery[] {
392 new SpanTermQuery(
393 new Term("text",
394 "m:t:past")),
395 new SpanTermQuery(
396 new Term("text",
397 "m:m:ind")), },
398 -1, false) },
399 -1, false) },
400 -1, false) },
401 // new SpanTermQuery(new Term("text", "m:---3:--sg:past:-ind"))
Nils Diewaldbb33da22015-03-04 16:24:25 +0000402 -1, // slop
403 false // inOrder
404 // Possible: CollectPayloads
Nils Diewaldf399a672013-11-18 17:55:22 +0000405 );
Nils Diewaldbb33da22015-03-04 16:24:25 +0000406 assertEquals(2, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000407
Nils Diewaldbb33da22015-03-04 16:24:25 +0000408 // To make sure, this is not equal:
409 // Spannearquery [p:VVFIN & m:3:sg:past:ind]
410 // Exact match!
411 // Maybe it IS equal
Eliza Margaretha6f989202016-10-14 21:48:29 +0200412 snquery = new SpanNearQuery(
413 new SpanQuery[] {
414 new SpanTermQuery(new Term("text", "p:VVFIN")),
415 new SpanTermQuery(new Term("text", "m:p:3")),
416 new SpanTermQuery(new Term("text", "m:n:sg")),
417 new SpanTermQuery(new Term("text", "m:t:past")),
418 new SpanTermQuery(new Term("text", "m:m:ind")), },
419 -1, // slop
Nils Diewaldbb33da22015-03-04 16:24:25 +0000420 false // inOrder
421 // Possible: CollectPayloads
Nils Diewaldf399a672013-11-18 17:55:22 +0000422 );
Nils Diewaldbb33da22015-03-04 16:24:25 +0000423 assertNotEquals(2, searcher.search(snquery, 10).totalHits);
424 // assertEquals(2, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000425
Nils Diewaldbb33da22015-03-04 16:24:25 +0000426 // Spannearquery [p:VVFIN & m:3:sg & past:ind]
427 SpanSegmentQueryWrapper sniquery = new SpanSegmentQueryWrapper("text",
428 "p:VVFIN", "m:p:3", "m:n:sg", "m:t:past", "m:m:ind");
429 assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits);
430
431
432 // Todo:
433
434 /*
435 sniquery = new SpanSegmentQuery(
436 "text",
437 "p:VVFIN",
438 "m:p:3",
439 "m:n:sg",
440 "m:t:past",
441 "m:m:ind"
442 );
443 */
444
445 // Spannearquery [p:VVFIN][]{,5}[m:nom:sg:fem]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200446 snquery = new SpanNearQuery(
447 new SpanQuery[] {
448 new SpanTermQuery(new Term("text", "p:VVFIN")),
449 new SpanSegmentQueryWrapper("text", "m:c:nom", "m:n:sg",
450 "m:g:fem").toQuery() },
451 5, // slop
Nils Diewaldbb33da22015-03-04 16:24:25 +0000452 true // inOrder
Eliza Margaretha6f989202016-10-14 21:48:29 +0200453 // Possible: CollectPayloads
Nils Diewaldf399a672013-11-18 17:55:22 +0000454 );
Nils Diewaldbb33da22015-03-04 16:24:25 +0000455 assertEquals(1, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000456
Nils Diewaldbb33da22015-03-04 16:24:25 +0000457 sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3",
458 "m:t:past", "m:m:ind", "m:n:sg");
459 assertEquals(2, searcher.search(sniquery.toQuery(), 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000460
Nils Diewaldbb33da22015-03-04 16:24:25 +0000461 // [p = VVFIN & m:p = 3 & m:t = past & m:n != pl] or
462 // [p = VVFIN & m:p = 3 & m:t = past & !m:n = pl]
463 // TODO: Problem: What should happen in case the category does not exist?
464 // pssible solution: & ( m:n != pl & exists(m:n))
465 sniquery = new SpanSegmentQueryWrapper("text", "p:VVFIN", "m:p:3",
466 "m:t:past");
467 SpanQuery snqquery = new SpanNotQuery(sniquery.toQuery(),
468 new SpanTermQuery(new Term("text", "m:n:pl")));
469 assertEquals(2, searcher.search(snqquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000470
Nils Diewaldbb33da22015-03-04 16:24:25 +0000471 // [p = NN & (m:c: = dat | m:c = acc)]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200472 snquery = new SpanNearQuery(
473 new SpanQuery[] { new SpanTermQuery(new Term("text", "p:NN")),
474 new SpanOrQuery(
475 new SpanTermQuery(new Term("text", "m:c:nom")),
476 new SpanTermQuery(
477 new Term("text", "m:c:acc"))) },
478 -1, false);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000479
480 assertEquals(2, searcher.search(snqquery, 10).totalHits);
481
482 // [p = NN & !(m:c: = nom | m:c = acc)]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200483 snqquery = new SpanNotQuery(new SpanTermQuery(new Term("text", "p:NN")),
484 new SpanOrQuery(new SpanTermQuery(new Term("text", "m:c:nom")),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000485 new SpanTermQuery(new Term("text", "m:c:acc"))));
486 assertEquals(1, searcher.search(snqquery, 10).totalHits);
487
488 // [p = NN & !(m:c = nom)]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200489 snqquery = new SpanNotQuery(new SpanTermQuery(new Term("text", "p:NN")),
490 new SpanTermQuery(new Term("text", "m:c:nom")));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000491 assertEquals(3, searcher.search(snqquery, 10).totalHits);
492
493 // [p=NN & !(m:c = acc)]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200494 snqquery = new SpanNotQuery(new SpanTermQuery(new Term("text", "p:NN")),
495 new SpanTermQuery(new Term("text", "m:c:acc")));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000496 assertEquals(2, searcher.search(snqquery, 10).totalHits);
497
498 // [p=PPER][][p=ART]
499 snquery = new SpanNearQuery(
Eliza Margaretha6f989202016-10-14 21:48:29 +0200500 new SpanQuery[] { new SpanTermQuery(new Term("text", "p:PPER")),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000501 new SpanNearQuery(new SpanQuery[] {
502 new SpanTermQuery(new Term("text", "T")),
503 new SpanTermQuery(new Term("text", "p:ART")) },
Eliza Margaretha6f989202016-10-14 21:48:29 +0200504 0, true), },
505 0, true);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000506 assertEquals(1, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000507
508
Nils Diewaldbb33da22015-03-04 16:24:25 +0000509 // Todo:
510 // [orth=się][]{2,4}[base=bać]
511 // [orth=się][orth!="[.!?,:]"]{,5}[base=bać]|[base=bać][base="on|ja|ty|my|wy"]?[orth=się]
512 // [pos=subst & orth="a.*"]{2}
513 // [tag=subst:sg:nom:n]
514 // [case==acc & case==gen] ??
515 // [case~acc & case~gen]
516 // [case~~acc]
517 // [base=bać][orth!=się]+[orth=się] within s
Nils Diewaldf399a672013-11-18 17:55:22 +0000518
Nils Diewaldbb33da22015-03-04 16:24:25 +0000519 // [][][p:VAFIN] within s
520 // [][p:VAFIN] within s
Nils Diewaldf399a672013-11-18 17:55:22 +0000521
522
Nils Diewaldbb33da22015-03-04 16:24:25 +0000523 // [][][p:VAFIN]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200524 snquery = new SpanNearQuery(
525 new SpanQuery[] {
526 new SpanNearQuery(new SpanQuery[] {
527 new SpanTermQuery(new Term("text", "T")),
528 new SpanTermQuery(new Term("text", "T")) }, 0,
529 true),
530 new SpanTermQuery(new Term("text", "p:VAFIN")) },
531 0, true);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000532 assertEquals(1, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000533
Nils Diewaldbb33da22015-03-04 16:24:25 +0000534 /*
535 http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene
536 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000537
Nils Diewaldbb33da22015-03-04 16:24:25 +0000538 StringBuilder payloadString = new StringBuilder();
539 Map<Term, TermContext> termContexts = new HashMap<>();
Akron700c1eb2015-09-25 16:57:30 +0200540 for (LeafReaderContext atomic : reader.leaves()) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000541 Bits bitset = atomic.reader().getLiveDocs();
542 // Spans spans = NearSpansOrdered();
543 Spans spans = snquery.getSpans(atomic, bitset, termContexts);
Nils Diewaldf399a672013-11-18 17:55:22 +0000544
Nils Diewaldbb33da22015-03-04 16:24:25 +0000545 while (spans.next()) {
546 int docid = atomic.docBase + spans.doc();
547 if (spans.isPayloadAvailable()) {
548 for (byte[] payload : spans.getPayload()) {
549 /* retrieve payload for current matching span */
550 payloadString.append(new String(payload));
551 payloadString.append(" | ");
552 };
553 };
554 };
555 };
556 // assertEquals(33, payloadString.length());
557 assertEquals(0, payloadString.length());
Nils Diewaldf399a672013-11-18 17:55:22 +0000558
559
560
Nils Diewaldbb33da22015-03-04 16:24:25 +0000561 // [][][p:VAFIN]
562 // without collecting payloads
563 snquery = new SpanNearQuery(
564 new SpanQuery[] {
565 new SpanNearQuery(new SpanQuery[] {
566 new SpanTermQuery(new Term("text", "T")),
567 new SpanTermQuery(new Term("text", "T")) }, 0,
568 true, false),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200569 new SpanTermQuery(new Term("text", "p:VAFIN")) },
570 0, true, false);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000571 assertEquals(1, searcher.search(snquery, 10).totalHits);
Nils Diewaldf399a672013-11-18 17:55:22 +0000572
Nils Diewaldbb33da22015-03-04 16:24:25 +0000573 payloadString = new StringBuilder();
574 termContexts = new HashMap<>();
Akron700c1eb2015-09-25 16:57:30 +0200575 for (LeafReaderContext atomic : reader.leaves()) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000576 Bits bitset = atomic.reader().getLiveDocs();
577 // Spans spans = NearSpansOrdered();
578 Spans spans = snquery.getSpans(atomic, bitset, termContexts);
Nils Diewaldf399a672013-11-18 17:55:22 +0000579
Nils Diewaldbb33da22015-03-04 16:24:25 +0000580 while (spans.next()) {
581 int docid = atomic.docBase + spans.doc();
582 for (byte[] payload : spans.getPayload()) {
583 /* retrieve payload for current matching span */
584 payloadString.append(new String(payload));
585 payloadString.append(" | ");
586 };
587 };
588 };
589 assertEquals(0, payloadString.length());
Nils Diewaldf399a672013-11-18 17:55:22 +0000590
591
Nils Diewaldbb33da22015-03-04 16:24:25 +0000592 // [][][p:VAFIN] in s
593 // ([e:s:<][]*[T] | [T & e:s:<]) [T] ([p:VAFIN & e:s:>] | [T][]*[e:s:>]
Nils Diewaldf399a672013-11-18 17:55:22 +0000594
Nils Diewaldbb33da22015-03-04 16:24:25 +0000595 /*
Eliza Margaretha6f989202016-10-14 21:48:29 +0200596
Nils Diewaldbb33da22015-03-04 16:24:25 +0000597 SpanSegmentWithinQuery ssequery = new SpanSegmentWithinQuery(
598 "text","s", new SpanSegmentSequenceQuery("text", "T", "T", "p:VAFIN")
599 );
600 assertEquals(0, searcher.search(ssequery.toQuery(), 10).totalHits);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200601
Nils Diewaldbb33da22015-03-04 16:24:25 +0000602 payloadString = new StringBuilder();
603 termContexts = new HashMap<>();
Akron700c1eb2015-09-25 16:57:30 +0200604 for (LeafReaderContext atomic : reader.leaves()) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000605 Bits bitset = atomic.reader().getLiveDocs();
606 // Spans spans = NearSpansOrdered();
607 Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200608
Nils Diewaldbb33da22015-03-04 16:24:25 +0000609 while (spans.next()) {
610 int docid = atomic.docBase + spans.doc();
611 for (byte[] payload : spans.getPayload()) {
612 /// retrieve payload for current matching span
613 payloadString.append(new String(payload));
614 payloadString.append(" | ");
615 };
616 };
617 };
618 assertEquals(0, payloadString.length(), 1);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200619
Nils Diewaldbb33da22015-03-04 16:24:25 +0000620 ssequery = new SpanSegmentWithinQuery(
621 "text","s", new SpanSegmentSequenceQuery("text", "T", "p:VAFIN")
622 );
Eliza Margaretha6f989202016-10-14 21:48:29 +0200623
Nils Diewaldbb33da22015-03-04 16:24:25 +0000624 assertEquals("for " + ssequery.toQuery(),
625 1, searcher.search(ssequery.toQuery(), 10).totalHits);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200626
Nils Diewaldbb33da22015-03-04 16:24:25 +0000627 payloadString = new StringBuilder();
628 termContexts = new HashMap<>();
Akron700c1eb2015-09-25 16:57:30 +0200629 for (LeafReaderContext atomic : reader.leaves()) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000630 Bits bitset = atomic.reader().getLiveDocs();
631 // Spans spans = NearSpansOrdered();
632 Spans spans = ssequery.toQuery().getSpans(atomic, bitset, termContexts);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200633
Nils Diewaldbb33da22015-03-04 16:24:25 +0000634 while (spans.next()) {
635 int docid = atomic.docBase + spans.doc();
636 for (byte[] payload : spans.getPayload()) {
637 // retrieve payload for current matching span
638 payloadString.append(new String(payload));
639 payloadString.append(" | ");
640 };
641 fail("Doc: " + docid + " with " + spans.start() + "-" + spans.end() + " || " + payloadString.toString());
642 };
643 };
644 assertEquals(20, payloadString.length());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200645
Nils Diewaldbb33da22015-03-04 16:24:25 +0000646 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000647
Nils Diewaldbb33da22015-03-04 16:24:25 +0000648 // --------------------______>
Nils Diewaldf399a672013-11-18 17:55:22 +0000649
650
651
Nils Diewaldbb33da22015-03-04 16:24:25 +0000652 // Spans spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), ssequery.toQuery());
653 /*
654 TopDocs topDocs = is.search(snq, 1);
655 Set<String> payloadSet = new HashSet<String>();
656 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
657 while (spans.next()) {
658 Collection<byte[]> payloads = spans.getPayload();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200659
Nils Diewaldbb33da22015-03-04 16:24:25 +0000660 for (final byte [] payload : payloads) {
661 payloadSet.add(new String(payload, "UTF-8"));
662 }
663 }
Nils Diewaldf399a672013-11-18 17:55:22 +0000664 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000665 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000666
667
Nils Diewaldbb33da22015-03-04 16:24:25 +0000668 /*
669 Alternativ:
670 IndexReader reader = writer.getReader();
671 writer.close();
672 IndexSearcher searcher = newSearcher(reader);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200673
Nils Diewaldbb33da22015-03-04 16:24:25 +0000674 PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext());
675
676 Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr")));
677 if(VERBOSE)
678 System.out.println("Num payloads:" + payloads.size());
679 for (final byte [] bytes : payloads) {
680 if(VERBOSE)
681 System.out.println(new String(bytes, "UTF-8"));
682 }
683 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000684
685
686
Nils Diewaldbb33da22015-03-04 16:24:25 +0000687 /* new: */
Nils Diewaldf399a672013-11-18 17:55:22 +0000688
Nils Diewaldbb33da22015-03-04 16:24:25 +0000689 // PayloadHelper helper = new PayloadHelper();
Nils Diewaldf399a672013-11-18 17:55:22 +0000690
Nils Diewaldbb33da22015-03-04 16:24:25 +0000691 // Map<Term, TermContext> termContexts = new HashMap<>();
692 //Spans spans;
693 //spans = snquery.getSpans(searcher.getIndexReader());
694 // searcher = helper.setUp(similarity, 1000);
695 /*
696 IndexReader reader = search.getReader(querycontainer.getFoundry());
697 Spans luceneSpans;
698 Bits bitset = atomic.reader().getLiveDocs();
699 for (byte[] payload : luceneSpans.getPayload())
Eliza Margaretha6f989202016-10-14 21:48:29 +0200700
Nils Diewaldbb33da22015-03-04 16:24:25 +0000701 /* Iterate over all matching documents */
702 /*
703 while (luceneSpans.next() && total < config.getMaxhits()) {
704 Span matchSpan;
705 StringBuilder payloadString = new StringBuilder();
706 int docid = atomic.docBase + luceneSpans.doc();
707 String docname = search.retrieveDocname(docid,
708 querycontainer.getFoundry());
709 total++;
Eliza Margaretha6f989202016-10-14 21:48:29 +0200710
Nils Diewaldbb33da22015-03-04 16:24:25 +0000711 for (byte[] payload : luceneSpans.getPayload())
712 */
713 /* retrieve payload for current matching span */
714 // payloadString.append(new String(payload));
Nils Diewaldf399a672013-11-18 17:55:22 +0000715
Nils Diewaldbb33da22015-03-04 16:24:25 +0000716 /* create span containing result */
717 /*
718 matchSpan = new Span(docname);
719 matchSpan.setIndexdocid(docid);
720 matchSpan.setLayer(querycontainer.getLayer());
721 matchSpan.storePayloads(payloadString.toString());
722 matchSpans.add(matchSpan);
723 */
724 /*
725 * topdocs = searcher.search(new ConstantScoreQuery(corpusQ add
726 * position to list of positions to be considered for later
727 * searches
728 */
729 /*
730 validValues.put(docname,
731 matchSpan.getPayload(config.getPrefix()));
732 }
733 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000734
735
Nils Diewaldbb33da22015-03-04 16:24:25 +0000736 // Todo: API made by add() typisiert für queries, strings
Nils Diewaldf399a672013-11-18 17:55:22 +0000737
Nils Diewaldbb33da22015-03-04 16:24:25 +0000738 // SpanPayloadCheckQuery for sentences!
Nils Diewaldf399a672013-11-18 17:55:22 +0000739
Nils Diewaldbb33da22015-03-04 16:24:25 +0000740 /* Support regular expression in SpanSegmentQuery */
741 // new Regexp();
742 // new Term();
Nils Diewaldf399a672013-11-18 17:55:22 +0000743
Nils Diewaldbb33da22015-03-04 16:24:25 +0000744 /*
745 Vielleicht: spanSegmentQuery(new Term(), new Wildcard(), new Regex());
746 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000747
Nils Diewaldbb33da22015-03-04 16:24:25 +0000748 // And Not ->
749 // SpanTermDiffQuery
Nils Diewaldf399a672013-11-18 17:55:22 +0000750
Nils Diewaldbb33da22015-03-04 16:24:25 +0000751 /*
752 SpanNearQuery poquery = new SpanNearQuery(
Eliza Margaretha6f989202016-10-14 21:48:29 +0200753
Nils Diewaldbb33da22015-03-04 16:24:25 +0000754 );
755 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000756
Nils Diewaldbb33da22015-03-04 16:24:25 +0000757 reader.close();
Nils Diewaldf399a672013-11-18 17:55:22 +0000758
759
760 };
761};